Let's make wifi fast again!
* [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
@ 2016-08-17 12:58 Toke Høiland-Jørgensen
  2016-08-17 13:08 ` Johannes Berg
  2016-08-17 14:45 ` [Make-wifi-fast] [PATCH v2] " Toke Høiland-Jørgensen
  0 siblings, 2 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 12:58 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless
  Cc: Toke Høiland-Jørgensen, Felix Fietkau

The FQ portion of the intermediate queues will reorder packets, which
means that crypto IV generation needs to happen after dequeue when they
are enabled, or the receiver will throw packets away when receiving
them.

This fixes the performance regression introduced by enabling softq in
ath9k.

Cc: Felix Fietkau <nbd@nbd.name>
Tested-by: Dave Taht <dave@taht.net>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 include/net/mac80211.h  |  2 ++
 net/mac80211/sta_info.h |  3 +--
 net/mac80211/tx.c       | 55 +++++++++++++++++++++++++++++++------------------
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..b23deba 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1556,6 +1556,7 @@ enum ieee80211_key_flags {
  * @tx_pn: PN used for TX keys, may be used by the driver as well if it
  *	needs to do software PN assignment by itself (e.g. due to TSO)
  * @flags: key flags, see &enum ieee80211_key_flags.
+ * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
  * @keyidx: the key index (0-3)
  * @keylen: key material length
  * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte)
@@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
 	u8 iv_len;
 	u8 hw_key_idx;
 	u8 flags;
+	u8 pn_offs;
 	s8 keyidx;
 	u8 keylen;
 	u8 key[0];
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 0556be3..c9d4d69 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -266,7 +266,6 @@ struct sta_ampdu_mlme {
  * @hdr_len: actual 802.11 header length
  * @sa_offs: offset of the SA
  * @da_offs: offset of the DA
- * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
  * @band: band this will be transmitted on, for tx_info
  * @rcu_head: RCU head to free this struct
  *
@@ -277,7 +276,7 @@ struct sta_ampdu_mlme {
 struct ieee80211_fast_tx {
 	struct ieee80211_key *key;
 	u8 hdr_len;
-	u8 sa_offs, da_offs, pn_offs;
+	u8 sa_offs, da_offs;
 	u8 band;
 	u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
 	       sizeof(rfc1042_header)] __aligned(2);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..4ae1f2c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1074,6 +1074,33 @@ ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+static inline void ieee80211_set_crypto_pn(struct ieee80211_key_conf *conf,
+					   struct sk_buff *skb)
+{
+	u64 pn;
+	u8 *crypto_hdr = skb->data + conf->pn_offs;
+
+	if (!conf->pn_offs)
+		return;
+
+	switch (conf->cipher) {
+	case WLAN_CIPHER_SUITE_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP_256:
+	case WLAN_CIPHER_SUITE_GCMP:
+	case WLAN_CIPHER_SUITE_GCMP_256:
+		pn = atomic64_inc_return(&conf->tx_pn);
+		crypto_hdr[0] = pn;
+		crypto_hdr[1] = pn >> 8;
+		crypto_hdr[4] = pn >> 16;
+		crypto_hdr[5] = pn >> 24;
+		crypto_hdr[6] = pn >> 32;
+		crypto_hdr[7] = pn >> 40;
+		break;
+	}
+}
+
+
+
 /* actual transmit path */
 
 static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
@@ -1503,6 +1530,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 						    sta);
 		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
+		if (info->control.hw_key) {
+			ieee80211_set_crypto_pn(info->control.hw_key, skb);
+		}
+
 		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
 		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
 			info->flags |= IEEE80211_TX_CTL_AMPDU;
@@ -2874,7 +2905,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs = build.hdr_len;
+				build.key->conf.pn_offs = build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len += IEEE80211_CCMP_HDR_LEN;
@@ -2885,7 +2916,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs = build.hdr_len;
+				build.key->conf.pn_offs = build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len += IEEE80211_GCMP_HDR_LEN;
@@ -3289,24 +3320,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
 	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
 
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
-
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] = pn;
-			crypto_hdr[1] = pn >> 8;
-			crypto_hdr[4] = pn >> 16;
-			crypto_hdr[5] = pn >> 24;
-			crypto_hdr[6] = pn >> 32;
-			crypto_hdr[7] = pn >> 40;
-			break;
-		}
+	if (fast_tx->key && !local->ops->wake_tx_queue) {
+		ieee80211_set_crypto_pn(&fast_tx->key->conf, skb);
 	}
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-- 
2.9.2


* Re: [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 12:58 [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue Toke Høiland-Jørgensen
@ 2016-08-17 13:08 ` Johannes Berg
  2016-08-17 13:16   ` Toke Høiland-Jørgensen
  2016-08-17 14:45 ` [Make-wifi-fast] [PATCH v2] " Toke Høiland-Jørgensen
  1 sibling, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-08-17 13:08 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless
  Cc: Felix Fietkau


> @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
>  	u8 iv_len;
>  	u8 hw_key_idx;
>  	u8 flags;
> +	u8 pn_offs;
> 
This is completely wrong.

johannes


* Re: [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 13:08 ` Johannes Berg
@ 2016-08-17 13:16   ` Toke Høiland-Jørgensen
  2016-08-17 13:18     ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 13:16 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

>> @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
>>  	u8 iv_len;
>>  	u8 hw_key_idx;
>>  	u8 flags;
>> +	u8 pn_offs;
>> 
> This is completely wrong.

Well, the ieee80211_fast_tx struct is not available in
ieee80211_tx_dequeue, and I need the offset there. I thought about
sticking it into ieee80211_tx_info, but that is kinda full, and since
the ieee80211_key_conf is already available there, carrying it there
seems to work.
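
Roughly, what I mean by "available there" is something like this sketch
of the dequeue side:

	/* in ieee80211_tx_dequeue(), after pulling the skb off the queue */
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);

	if (info->control.hw_key)
		ieee80211_set_crypto_pn(info->control.hw_key, skb);

i.e. the key conf can still be reached through the skb's tx_info, while
the fast_tx struct cannot.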

What would be a better way to do this?

-Toke


* Re: [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 13:16   ` Toke Høiland-Jørgensen
@ 2016-08-17 13:18     ` Johannes Berg
  2016-08-17 13:23       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-08-17 13:18 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: make-wifi-fast, linux-wireless, Felix Fietkau

On Wed, 2016-08-17 at 15:16 +0200, Toke Høiland-Jørgensen wrote:
> Johannes Berg <johannes@sipsolutions.net> writes:
> 
> > 
> > > 
> > > @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
> > >  	u8 iv_len;
> > >  	u8 hw_key_idx;
> > >  	u8 flags;
> > > +	u8 pn_offs;
> > > 
> > This is completely wrong.
> 
> Well, the ieee80211_fast_tx struct is not available in
> ieee80211_tx_dequeue, and I need the offset there. I thought about
> sticking it into ieee80211_tx_info, but that is kinda full, and since
> the ieee80211_key_conf is already available there, carrying it there
> seems to work.

For very limited testing, perhaps. But this isn't static across all
usages of the key, so this is still completely broken.

> What would be a better way to do this?
> 

Some redesign/rearchitecture, probably. Or just do it all in the driver
like iwlmvm?

johannes


* Re: [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 13:18     ` Johannes Berg
@ 2016-08-17 13:23       ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 13:23 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

> On Wed, 2016-08-17 at 15:16 +0200, Toke Høiland-Jørgensen wrote:
>> Johannes Berg <johannes@sipsolutions.net> writes:
>> 
>> > 
>> > > 
>> > > @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
>> > >  	u8 iv_len;
>> > >  	u8 hw_key_idx;
>> > >  	u8 flags;
>> > > +	u8 pn_offs;
>> > > 
>> > This is completely wrong.
>> 
>> Well, the ieee80211_fast_tx struct is not available in
>> ieee80211_tx_dequeue, and I need the offset there. I thought about
>> sticking it into ieee80211_tx_info, but that is kinda full, and since
>> the ieee80211_key_conf is already available there, carrying it there
>> seems to work.
>
> For very limited testing, perhaps. But this isn't static across all
> usages of the key, so this is still completely broken.

OK, noted.

>> What would be a better way to do this?
>> 
>
> Some redesign/rearchitecture, probably. Or just do it all in the driver
> like iwlmvm?

Will look it over again. Should be possible to re-calculate the offset,
I guess.

-Toke


* [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 12:58 [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue Toke Høiland-Jørgensen
  2016-08-17 13:08 ` Johannes Berg
@ 2016-08-17 14:45 ` Toke Høiland-Jørgensen
  2016-08-17 15:47   ` Noah Causin
                     ` (2 more replies)
  1 sibling, 3 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 14:45 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless
  Cc: Toke Høiland-Jørgensen, Felix Fietkau

The FQ portion of the intermediate queues will reorder packets, which
means that crypto IV generation needs to happen after dequeue when they
are enabled, or the receiver will throw packets away when receiving
them.

This fixes the performance regression introduced by enabling softq in
ath9k.

Cc: Felix Fietkau <nbd@nbd.name>
Tested-by: Dave Taht <dave@taht.net>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v1:
  - Recalculate pn_offs when needed instead of storing it.

 net/mac80211/sta_info.h |  3 +-
 net/mac80211/tx.c       | 85 +++++++++++++++++++++++++++++++++++++------------
 2 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 0556be3..c9d4d69 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -266,7 +266,6 @@ struct sta_ampdu_mlme {
  * @hdr_len: actual 802.11 header length
  * @sa_offs: offset of the SA
  * @da_offs: offset of the DA
- * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
  * @band: band this will be transmitted on, for tx_info
  * @rcu_head: RCU head to free this struct
  *
@@ -277,7 +276,7 @@ struct sta_ampdu_mlme {
 struct ieee80211_fast_tx {
 	struct ieee80211_key *key;
 	u8 hdr_len;
-	u8 sa_offs, da_offs, pn_offs;
+	u8 sa_offs, da_offs;
 	u8 band;
 	u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
 	       sizeof(rfc1042_header)] __aligned(2);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..9caf75f 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1074,6 +1074,64 @@ ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+static void ieee80211_gen_crypto_iv(struct ieee80211_key_conf *conf,
+					   struct sta_info *sta, struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata;
+	u64 pn;
+	u8 *crypto_hdr;
+	u8 pn_offs = 0;
+
+	if (!conf || !sta || !(conf->flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+		return;
+
+	sdata = sta->sdata;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		if (sdata->u.mgd.use_4addr) {
+			pn_offs = 30;
+			break;
+		}
+		pn_offs = 24;
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+		if (sdata->wdev.use_4addr) {
+			pn_offs = 30;
+			break;
+		}
+		/* fall through */
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_AP:
+		pn_offs = 24;
+		break;
+	default:
+		return;
+	}
+
+	if (sta->sta.wme) {
+		pn_offs += 2;
+	}
+
+	crypto_hdr = skb->data + pn_offs;
+	switch (conf->cipher) {
+	case WLAN_CIPHER_SUITE_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP_256:
+	case WLAN_CIPHER_SUITE_GCMP:
+	case WLAN_CIPHER_SUITE_GCMP_256:
+		pn = atomic64_inc_return(&conf->tx_pn);
+		crypto_hdr[0] = pn;
+		crypto_hdr[1] = pn >> 8;
+		crypto_hdr[4] = pn >> 16;
+		crypto_hdr[5] = pn >> 24;
+		crypto_hdr[6] = pn >> 32;
+		crypto_hdr[7] = pn >> 40;
+		break;
+	}
+}
+
+
+
 /* actual transmit path */
 
 static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
@@ -1503,6 +1561,11 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 						    sta);
 		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
+		if (info->control.hw_key) {
+			ieee80211_gen_crypto_iv(info->control.hw_key,
+			container_of(txq->sta, struct sta_info, sta), skb);
+		}
+
 		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
 		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
 			info->flags |= IEEE80211_TX_CTL_AMPDU;
@@ -2874,7 +2937,6 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs = build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len += IEEE80211_CCMP_HDR_LEN;
@@ -2885,7 +2947,6 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs = build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len += IEEE80211_GCMP_HDR_LEN;
@@ -3289,24 +3350,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
 	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
 
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
-
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] = pn;
-			crypto_hdr[1] = pn >> 8;
-			crypto_hdr[4] = pn >> 16;
-			crypto_hdr[5] = pn >> 24;
-			crypto_hdr[6] = pn >> 32;
-			crypto_hdr[7] = pn >> 40;
-			break;
-		}
+	if (fast_tx->key && !local->ops->wake_tx_queue) {
+		ieee80211_gen_crypto_iv(&fast_tx->key->conf, sta, skb);
 	}
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-- 
2.9.2


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 14:45 ` [Make-wifi-fast] [PATCH v2] " Toke Høiland-Jørgensen
@ 2016-08-17 15:47   ` Noah Causin
  2016-08-17 22:33     ` Toke Høiland-Jørgensen
  2016-08-17 19:49   ` Johannes Berg
  2016-08-24 16:20   ` [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 77+ messages in thread
From: Noah Causin @ 2016-08-17 15:47 UTC (permalink / raw)
  To: make-wifi-fast

Hi,

Thank you and others for all the work you all have done.

I have a question:
How do I apply this to Felix Fietkau's lede staging branch?

Thank you,

Noah Causin




* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 14:45 ` [Make-wifi-fast] [PATCH v2] " Toke Høiland-Jørgensen
  2016-08-17 15:47   ` Noah Causin
@ 2016-08-17 19:49   ` Johannes Berg
  2016-08-17 20:07     ` Dave Taht
  2016-08-24 16:20   ` [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-08-17 19:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless
  Cc: Felix Fietkau

Hi,

You need to work on coding style, a lot of your indentation is
completely messed up.

> +	switch (sdata->vif.type) {
> +	case NL80211_IFTYPE_STATION:
> +		if (sdata->u.mgd.use_4addr) {
> +			pn_offs = 30;
> +			break;
> +		}
> +		pn_offs = 24;
> +		break;
> +	case NL80211_IFTYPE_AP_VLAN:
> +		if (sdata->wdev.use_4addr) {
> +			pn_offs = 30;
> +			break;
> +		}
> +		/* fall through */
> +	case NL80211_IFTYPE_ADHOC:
> +	case NL80211_IFTYPE_AP:
> +		pn_offs = 24;
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	if (sta->sta.wme) {
> +		pn_offs += 2;
> +	}

I think you just reinvented ieee80211_hdrlen(). No?
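
A rough sketch of what I mean, assuming the 802.11 header is already
built at skb->data, as it is by the time the frame is dequeued:

	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
	u8 *crypto_hdr = skb->data + ieee80211_hdrlen(hdr->frame_control);

The IV/PN space starts right after the MAC header, so the offset falls
out of the header itself rather than the interface type.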

> -	if (fast_tx->pn_offs) {
> -		u64 pn;
> -		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;

No need to undo the pn_offs optimisation for the !txq case, you can
pass it in to the new function that will fill it.

However, you're still doing it wrong - now you haven't fixed anything
for TKIP, which won't hit the fastpath.

johannes


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 19:49   ` Johannes Berg
@ 2016-08-17 20:07     ` Dave Taht
  2016-08-17 20:43       ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-08-17 20:07 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless,
	Felix Fietkau

On Wed, Aug 17, 2016 at 9:49 PM, Johannes Berg
<johannes@sipsolutions.net> wrote:
> Hi,
>
> You need to work on coding style, a lot of your indentation is
> completely messed up.
>
>> +     switch (sdata->vif.type) {
>> +     case NL80211_IFTYPE_STATION:
>> +             if (sdata->u.mgd.use_4addr) {
>> +                     pn_offs = 30;
>> +                     break;
>> +             }
>> +             pn_offs = 24;
>> +             break;
>> +     case NL80211_IFTYPE_AP_VLAN:
>> +             if (sdata->wdev.use_4addr) {
>> +                     pn_offs = 30;
>> +                     break;
>> +             }
>> +             /* fall through */
>> +     case NL80211_IFTYPE_ADHOC:
>> +     case NL80211_IFTYPE_AP:
>> +             pn_offs = 24;
>> +             break;
>> +     default:
>> +             return;
>> +     }
>> +
>> +     if (sta->sta.wme) {
>> +             pn_offs += 2;
>> +     }
>
> I think you just reinvented ieee80211_hdrlen(). No?
>
>> -     if (fast_tx->pn_offs) {
>> -             u64 pn;
>> -             u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
>
> No need to undo the pn_offs optimisation for the !txq case, you can
> pass it in to the new function that will fill it.
>
> However, you're still doing it wrong - now you haven't fixed anything
> for TKIP, which won't hit the fastpath.

well, we're getting there. the results of both patch attempts were
really nice, and brought encrypted performance with fq back into line
with unencrypted. Still running crypted tests as I write...

So fixing TKIP would be next, forcing the AP to use that? What other
scenarios do we have to worry about? WDS?


> johannes
> _______________________________________________
> Make-wifi-fast mailing list
> Make-wifi-fast@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/make-wifi-fast



-- 
Dave Täht
Let's go make home routers and wifi faster! With better software!
http://blog.cerowrt.org


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 20:07     ` Dave Taht
@ 2016-08-17 20:43       ` Johannes Berg
  2016-08-22 14:47         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-08-17 20:43 UTC (permalink / raw)
  To: Dave Taht
  Cc: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless,
	Felix Fietkau


> well, we're getting there. the results of both patch attempts were
> really nice, and brought encrypted performance with fq back into line
> with unencrypted. Still running crypted tests as I write...
> 
> So fixing TKIP would be next, forcing the AP to use that? What other
> scenarios do we have to worry about? WDS?
> 

I don't think there's anything else, I just don't really feel it's
getting anywhere. This is a mere symptom of the design.

Felix had worked around the SN assignment in a similar way, but I feel
that perhaps the whole thing isn't quite the right architecture. Why
are we applying FQ after the wifi conversion, when clearly that doesn't
work well? Seems to me that it would make more sense to let the frames
sit on the queues as they come in, and do most of the wifi handling
only when needed (obviously, things like control port would still have
to be done).
We even count those packets that are dropped for TX statistics, which
would seem to be a big behavioural difference vs. applying a qdisc.

Now, it's unlikely to be that simple - fragmentation, for example,
might mess this up.

Overall though, I'm definitely wondering if it should be this way,
since all the special cases just add complexity.

johannes


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 15:47   ` Noah Causin
@ 2016-08-17 22:33     ` Toke Høiland-Jørgensen
  2016-08-19  3:06       ` Noah Causin
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 22:33 UTC (permalink / raw)
  To: Noah Causin; +Cc: make-wifi-fast

Noah Causin <n0manletter@gmail.com> writes:

> Hi,
>
> Thank you and others for all the work you all have done.
>
> I have a question:
> How do I apply this to Felix Fietkau's lede staging branch?

Just drop it into package/kernel/mac80211/patches with a suitable name
and rebuild the mac80211 package. :)

-Toke


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 22:33     ` Toke Høiland-Jørgensen
@ 2016-08-19  3:06       ` Noah Causin
  2016-08-22 14:24         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Noah Causin @ 2016-08-19  3:06 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast

I have been trying to combine the airtime fairness patches with this 
crypto patch.

If I was to do this, what would be a good filename for the crypto patch?

Would I also have to do any modifications to get this working? Do I need 
to use the staging branch?

Noah

On 8/17/2016 6:33 PM, Toke Høiland-Jørgensen wrote:
> Noah Causin <n0manletter@gmail.com> writes:
>
>> Hi,
>>
>> Thank you and others for all the work you all have done.
>>
>> I have a question:
>> How do I apply this to Felix Fietkau's lede staging branch?
> Just drop it into package/kernel/mac80211/patches with a suitable name
> and rebuild the mac80211 package. :)
>
> -Toke



* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-19  3:06       ` Noah Causin
@ 2016-08-22 14:24         ` Toke Høiland-Jørgensen
  2016-08-23 17:06           ` Noah Causin
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-22 14:24 UTC (permalink / raw)
  To: Noah Causin; +Cc: make-wifi-fast

Noah Causin <n0manletter@gmail.com> writes:

> I have been trying to combine the airtime fairness patches with this crypto
> patch.
>
> If I was to do this, what would be a good filename for the crypto
> patch?

I've been using package/kernel/mac80211/patches/345-iv-fix.patch -
though I don't think the patch touches anything the earlier patches do,
so it probably doesn't matter too much. If the compile doesn't fail, you
should be fine :)

> Would I also have to do any modifications to get this working? Do I
> need to use the staging branch?

You'll need
package/kernel/mac80211/patches/337-ath9k-Switch-to-using-mac80211-intermediate-software.patch
to enable the software queues in ath9k, which is in the staging branch.

-Toke


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 20:43       ` Johannes Berg
@ 2016-08-22 14:47         ` Toke Høiland-Jørgensen
  2016-08-26  8:38           ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-22 14:47 UTC (permalink / raw)
  To: Johannes Berg; +Cc: Dave Taht, make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

>> well, we're getting there. the results of both patch attempts were
>> really nice, and brought encrypted performance with fq back into line
>> with unencrypted. Still running crypted tests as I write...
>> 
>> So fixing TKIP would be next, forcing the AP to use that? What other
>> scenarios do we have to worry about? WDS?
>> 
>
> I don't think there's anything else, I just don't really feel it's
> getting anywhere. This is a mere symptom of the design.
>
> Felix had worked around the SN assignment in a similar way, but I feel
> that perhaps the whole thing isn't quite the right architecture. Why
> are we applying FQ after the wifi conversion, when clearly that doesn't
> work well? Seems to me that it would make more sense to let the frames
> sit on the queues as they come in, and do most of the wifi handling
> only when needed (obviously, things like control port would still have
> to be done).

I suppose that could be a way to do it (i.e. have ieee80211_tx_dequeue
call all the TX hooks etc), but I'm not sure whether there would be
problems doing all this work in the loop that's building aggregates
(which is what would happen for ath9k at least).

An alternative could be to split the process up in two: An "early" and
"late" stage, where the early stage does everything that is not
sensitive to reordering and the occasional drop, and the late stage is
everything that is. Then the queueing step could happen in-between the
two stages, and the non-queueing path could just call both stages at
once. In effect, this would just make the current work-arounds be more
explicit in the structure, rather than being marked as exceptions.
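
Roughly something like this (sketch only, names made up):

	static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
	{
		/* non-queueing path: run both stages back to back */
		return invoke_tx_handlers_early(tx) ||
		       invoke_tx_handlers_late(tx);
	}

with the TXQ path calling the early stage before enqueue and the late
stage from ieee80211_tx_dequeue().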

> We even count those packets that are dropped for TX statistics, which
> would seem to be a big behavioural difference vs. applying a qdisc.

While you're right in principle, in practice I don't think this has too
big of an impact. In normal operation, CoDel drops (at most) dozens of
packets per *minute*, so it's not going to skew the statistics too much.

> Now, it's unlikely to be that simple - fragmentation, for example,
> might mess this up.
>
> Overall though, I'm definitely wondering if it should be this way,
> since all the special cases just add complexity.

I agree that the work-arounds are iffy, but I do also think it's
important to keep in mind that we are improving latency by orders of
magnitude here. A few special cases are worth it to achieve that, IMO.
And then iterating towards a design that doesn't need them, of course :)

-Toke


* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-22 14:24         ` Toke Høiland-Jørgensen
@ 2016-08-23 17:06           ` Noah Causin
  2016-08-23 17:51             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Noah Causin @ 2016-08-23 17:06 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast

I ran into an issue where the fix-iv patch does not work correctly with
an ath9k WiFi card.

Clients are able to connect but are unable to get IP addresses by DHCP.

When I built an image for a router with an mt76, the WiFi worked
correctly with the patch.

The router I use is an Archer c7 v2.

Maybe the issue relates to the email formatting.

Would you please send the patch as an email attachment?


On 8/22/2016 10:24 AM, Toke Høiland-Jørgensen wrote:
> Noah Causin <n0manletter@gmail.com> writes:
>
>> I have been trying to combine the airtime fairness patches with this crypto
>> patch.
>>
>> If I was to do this, what would be a good filename for the crypto
>> patch?
> I've been using package/kernel/mac80211/patches/345-iv-fix.patch -
> though I don't think the patch touches anything the earlier patches do,
> so it probably doesn't matter too much. If the compile doesn't fail, you
> should be fine :)
>
>> Would I also have to do any modifications to get this working? Do I
>> need to use the staging branch?
> You'll need
> package/kernel/mac80211/patches/337-ath9k-Switch-to-using-mac80211-intermediate-software.patch
> to enable the software queues in ath9k, which is in the staging branch.
>
> -Toke



* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-23 17:06           ` Noah Causin
@ 2016-08-23 17:51             ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-23 17:51 UTC (permalink / raw)
  To: Noah Causin; +Cc: make-wifi-fast

Noah Causin <n0manletter@gmail.com> writes:

> I ran into an issue where the fix-iv patch does not work correctly with an
> ath9k WiFi card.
>
> Clients are able to connect but are unable to get IP addresses by DHCP.
>
> When I built an image for a router with an mt76, the WiFi worked correctly
> with the patch.
>
> The router I use is an Archer c7 v2.
>
> Maybe the issue relates to the email formatting.

If the patch is mangled by the email, it would fail to apply (and the
build would fail) rather than cause what you're describing.

However, the current version of the patch is only a partial solution, so
I'm guessing you're simply running into one of the cases where it
doesn't work. I'm working on a more complete solution. Will post it once
it's ready :)

-Toke


* [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-17 14:45 ` [Make-wifi-fast] [PATCH v2] " Toke Høiland-Jørgensen
  2016-08-17 15:47   ` Noah Causin
  2016-08-17 19:49   ` Johannes Berg
@ 2016-08-24 16:20   ` Toke Høiland-Jørgensen
  2016-08-24 22:40     ` Noah Causin
  2016-08-30 13:15     ` [Make-wifi-fast] [PATCH v4] " Toke Høiland-Jørgensen
  2 siblings, 2 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-24 16:20 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v2:

This is a completely different approach: Instead of adding exceptions
for TXQ handling, split up the entire TX path in an early and late part,
and apply the latter after TXQ dequeue. This should fix things that
don't hit the fast path as well.

I've tested this with both unencrypted traffic and with CCMP and TKIP
and it appears to fix the previous performance regression seen with
softq-enabled ath9k. I most likely haven't hit all code paths, though
(not sure how I would even go about ensuring that), but looks promising
so far.

 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 276 ++++++++++++++++++++++++++++++++++++++-----------
 net/mac80211/wpa.c     |  18 +++-
 3 files changed, 235 insertions(+), 61 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..7042d2c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = tx->skb;
+
+	if (!skb)
+		skb = skb_peek(&tx->skbs);
+
+	info = IEEE80211_SKB_CB(skb);
+	hdr = (struct ieee80211_hdr *)skb->data;
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
 	else if (tx->sta &&
 		 (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
 		tx->key = key;
-	else if (ieee80211_is_group_privacy_action(tx->skb) &&
+	else if (ieee80211_is_group_privacy_action(skb) &&
 		(key = rcu_dereference(tx->sdata->default_multicast_key)))
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key = key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb) &&
-			    !ieee80211_is_group_privacy_action(tx->skb))
+					       skb) &&
+			    !ieee80211_is_group_privacy_action(skb))
 				tx->key = NULL;
 			else
 				skip_hw = (tx->key->conf.flags &
@@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb = skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
 
 	/*
 	 * Packet injection may want to control the sequence
@@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+		fragnum = 0;
+		seq = cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info = IEEE80211_SKB_CB(skb);
+			hdr = (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl = seq | fragnum++;
+		}
 		tx->sdata->sequence_number += 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -853,8 +874,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq = ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum = 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr = (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl = seq | fragnum++;
+		}
+	}
 
 	return TX_CONTINUE;
 }
@@ -927,7 +954,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	int frag_threshold = tx->local->hw.wiphy->frag_threshold;
 	int hdrlen;
-	int fragnum;
 
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -964,9 +990,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
 
-	/* update duration/seq/flags of fragments */
-	fragnum = 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
 
@@ -987,8 +1010,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 		} else {
 			hdr->frame_control &= ~morefrags;
 		}
-		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
 
 	return TX_CONTINUE;
@@ -1481,33 +1502,59 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		if (txq->sta) {
+			struct sta_info *sta = container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = sta->sdata;
+		} else {
+			tx.sdata = container_of(info->control.vif,
+					struct ieee80211_sub_if_data, vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
 
 out:
@@ -1521,6 +1568,77 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_vif *vif,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct txq_info *txqi = ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct txq_info *txqi;
+	struct sk_buff *skb, *tmp;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_vif *vif;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	skb = skb_peek(skbs);
+	info = IEEE80211_SKB_CB(skb);
+	sdata = vif_to_sdata(info->control.vif);
+	if (sta && !sta->uploaded)
+		sta = NULL;
+
+	if (sta)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_MONITOR:
+		return false;
+	case NL80211_IFTYPE_AP_VLAN:
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+		/* fall through */
+	default:
+		vif = &sdata->vif;
+		break;
+	}
+
+	skb_queue_walk_safe(skbs, skb, tmp) {
+		txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+		if (txqi) {
+			__skb_unlink(skb, skbs);
+			ieee80211_queue_skb(local, vif, pubsta, skb);
+		}
+	}
+
+	return !!skb_queue_empty(skbs);
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1646,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1661,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,8 +1781,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
@@ -1697,7 +1802,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1706,11 +1810,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 		tx->skb = NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
 
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res = TX_DROP;
+
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1733,6 +1858,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1782,7 +1912,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_tx_data tx;
 	ieee80211_tx_result res_prepare;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	bool result = true;
+	bool result = true, queue = !!(local->ops->wake_tx_queue);
 	int led_len;
 
 	if (unlikely(skb->len < 10)) {
@@ -1807,7 +1937,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (queue && ieee80211_queue_frags(local, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3170,10 +3306,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
+	bool queue = !!(local->ops->wake_tx_queue);
 
 	/* control port protocol needs a lot of special handling */
 	if (cpu_to_be16(ethertype) == sdata->control_port_protocol)
@@ -3240,8 +3375,32 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	if (queue && ieee80211_queue_skb(local, &sdata->vif, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
 		if (!sta->sta.txq[0])
 			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
@@ -3309,12 +3468,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b48c1e1..71c479a 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
 
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb = tx->skb;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int tail;
 
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r != TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
 
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
-- 
2.9.3


* Re: [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-24 16:20   ` [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
@ 2016-08-24 22:40     ` Noah Causin
  2016-08-25 12:45       ` Toke Høiland-Jørgensen
  2016-08-30 13:15     ` [Make-wifi-fast] [PATCH v4] " Toke Høiland-Jørgensen
  1 sibling, 1 reply; 77+ messages in thread
From: Noah Causin @ 2016-08-24 22:40 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast


This is what I get when I try building LEDE from the staging branch with 
this patch:

Applying ./patches/345-iv-fix.patch using plaintext:
patching file include/net/mac80211.h
patching file net/mac80211/tx.c
Hunk #2 FAILED at 591.
Hunk #3 FAILED at 634.
Hunk #4 succeeded at 801 (offset -4 lines).
Hunk #5 succeeded at 833 (offset -4 lines).
Hunk #6 succeeded at 863 (offset -4 lines).
Hunk #7 FAILED at 947.
Hunk #8 succeeded at 978 (offset -6 lines).
Hunk #9 succeeded at 998 (offset -6 lines).
Hunk #10 succeeded at 1487 (offset -9 lines).
Hunk #11 succeeded at 1553 (offset -9 lines).
Hunk #12 succeeded at 1631 (offset -9 lines).
Hunk #13 succeeded at 1646 (offset -9 lines).
Hunk #14 succeeded at 1766 (offset -9 lines).
Hunk #15 succeeded at 1787 (offset -9 lines).
Hunk #16 succeeded at 1795 (offset -9 lines).
Hunk #17 succeeded at 1843 (offset -9 lines).
Hunk #18 succeeded at 1897 (offset -9 lines).
Hunk #19 succeeded at 1922 (offset -9 lines).
Hunk #20 succeeded at 3322 (offset 22 lines).
Hunk #21 succeeded at 3391 (offset 22 lines).
Hunk #22 succeeded at 3484 (offset 22 lines).
3 out of 22 hunks FAILED -- saving rejects to file net/mac80211/tx.c.rej
patching file net/mac80211/wpa.c
Patch failed!  Please fix ./patches/345-iv-fix.patch!


On 8/24/2016 12:20 PM, Toke Høiland-Jørgensen wrote:
> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> To avoid having to deal with fragmentation on dequeue, the split is set
> to be after the fragmentation handler. This means that some reordering
> of TX handlers is necessary, and some handlers had to be made aware of
> fragmentation due to this reordering.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
> Changes since v2:
>
> This is a completely different approach: Instead of adding exceptions
> for TXQ handling, split up the entire TX path in an early and late part,
> and apply the latter after TXQ dequeue. This should fix things that
> don't hit the fast path as well.
>
> I've tested this with both unencrypted traffic and with CCMP and TKIP
> and it appears to fix the previous performance regression seen with
> softq-enabled ath9k. I most likely haven't hit all code paths, though
> (not sure how I would even go about ensuring that), but looks promising
> so far.
>
>   include/net/mac80211.h |   2 +
>   net/mac80211/tx.c      | 276 ++++++++++++++++++++++++++++++++++++++-----------
>   net/mac80211/wpa.c     |  18 +++-
>   3 files changed, 235 insertions(+), 61 deletions(-)
>
> diff --git a/include/net/mac80211.h b/include/net/mac80211.h
> index cca510a..9a6a3e9 100644
> --- a/include/net/mac80211.h
> +++ b/include/net/mac80211.h
> @@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
>    *	frame (PS-Poll or uAPSD).
>    * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
>    * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
> + * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
>    *
>    * These flags are used in tx_info->control.flags.
>    */
> @@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
>   	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
>   	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
>   	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
> +	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
>   };
>   
>   /*
> diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
> index 1d0746d..7042d2c 100644
> --- a/net/mac80211/tx.c
> +++ b/net/mac80211/tx.c
> @@ -38,6 +38,12 @@
>   #include "wme.h"
>   #include "rate.h"
>   
> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
> +				       struct sta_info *sta,
> +				       struct ieee80211_fast_tx *fast_tx,
> +				       struct sk_buff *skb, bool xmit);
> +
>   /* misc utils */
>   
>   static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
> @@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
>   ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
>   {
>   	struct ieee80211_key *key;
> -	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
> -	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
> +	struct ieee80211_tx_info *info;
> +	struct ieee80211_hdr *hdr;
> +	struct sk_buff *skb = tx->skb;
> +
> +	if (!skb)
> +		skb = skb_peek(&tx->skbs);
> +
> +	info = IEEE80211_SKB_CB(skb);
> +	hdr = (struct ieee80211_hdr *)skb->data;
>   
>   	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
>   		tx->key = NULL;
>   	else if (tx->sta &&
>   		 (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
>   		tx->key = key;
> -	else if (ieee80211_is_group_privacy_action(tx->skb) &&
> +	else if (ieee80211_is_group_privacy_action(skb) &&
>   		(key = rcu_dereference(tx->sdata->default_multicast_key)))
>   		tx->key = key;
>   	else if (ieee80211_is_mgmt(hdr->frame_control) &&
>   		 is_multicast_ether_addr(hdr->addr1) &&
> -		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
> +		 ieee80211_is_robust_mgmt_frame(skb) &&
>   		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
>   		tx->key = key;
>   	else if (is_multicast_ether_addr(hdr->addr1) &&
> @@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
>   		case WLAN_CIPHER_SUITE_GCMP_256:
>   			if (!ieee80211_is_data_present(hdr->frame_control) &&
>   			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
> -					       tx->skb) &&
> -			    !ieee80211_is_group_privacy_action(tx->skb))
> +					       skb) &&
> +			    !ieee80211_is_group_privacy_action(skb))
>   				tx->key = NULL;
>   			else
>   				skip_hw = (tx->key->conf.flags &
> @@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
>   static ieee80211_tx_result debug_noinline
>   ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
>   {
> -	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
> -	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
> +	struct sk_buff *skb = skb_peek(&tx->skbs);
> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
>   	u8 *qc;
>   	int tid;
> +	u16 fragnum, seq;
>   
>   	/*
>   	 * Packet injection may want to control the sequence
> @@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
>   	 */
>   	if (!ieee80211_is_data_qos(hdr->frame_control) ||
>   	    is_multicast_ether_addr(hdr->addr1)) {
> -		/* driver should assign sequence number */
> -		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
> -		/* for pure STA mode without beacons, we can do it */
> -		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
> +		fragnum = 0;
> +		seq = cpu_to_le16(tx->sdata->sequence_number);
> +		skb_queue_walk(&tx->skbs, skb) {
> +			info = IEEE80211_SKB_CB(skb);
> +			hdr = (struct ieee80211_hdr *)skb->data;
> +			/* driver should assign sequence number */
> +			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
> +			/* for pure STA mode without beacons, we can do it */
> +			hdr->seq_ctrl = seq | fragnum++;
> +		}
>   		tx->sdata->sequence_number += 0x10;
>   		if (tx->sta)
>   			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
> @@ -853,8 +874,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
>   	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
>   	tx->sta->tx_stats.msdu[tid]++;
>   
> -	if (!tx->sta->sta.txq[0])
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
> +	if (!tx->sta->sta.txq[0]) {
> +		seq = ieee80211_tx_next_seq(tx->sta, tid);
> +		fragnum = 0;
> +		skb_queue_walk(&tx->skbs, skb) {
> +			hdr = (struct ieee80211_hdr *)skb->data;
> +			hdr->seq_ctrl = seq | fragnum++;
> +		}
> +	}
>   
>   	return TX_CONTINUE;
>   }
> @@ -927,7 +954,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
>   	struct ieee80211_hdr *hdr = (void *)skb->data;
>   	int frag_threshold = tx->local->hw.wiphy->frag_threshold;
>   	int hdrlen;
> -	int fragnum;
>   
>   	/* no matter what happens, tx->skb moves to tx->skbs */
>   	__skb_queue_tail(&tx->skbs, skb);
> @@ -964,9 +990,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
>   	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
>   		return TX_DROP;
>   
> -	/* update duration/seq/flags of fragments */
> -	fragnum = 0;
> -
>   	skb_queue_walk(&tx->skbs, skb) {
>   		const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
>   
> @@ -987,8 +1010,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
>   		} else {
>   			hdr->frame_control &= ~morefrags;
>   		}
> -		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
> -		fragnum++;
>   	}
>   
>   	return TX_CONTINUE;
> @@ -1481,33 +1502,59 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
>   {
>   	struct ieee80211_local *local = hw_to_local(hw);
>   	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
> -	struct ieee80211_hdr *hdr;
>   	struct sk_buff *skb = NULL;
>   	struct fq *fq = &local->fq;
>   	struct fq_tin *tin = &txqi->tin;
> +	struct ieee80211_tx_info *info;
>   
>   	spin_lock_bh(&fq->lock);
>   
>   	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>   		goto out;
>   
> +begin:
>   	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
>   	if (!skb)
>   		goto out;
>   
>   	ieee80211_set_skb_vif(skb, txqi);
>   
> -	hdr = (struct ieee80211_hdr *)skb->data;
> -	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
> +	info = IEEE80211_SKB_CB(skb);
> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>   		struct sta_info *sta = container_of(txq->sta, struct sta_info,
>   						    sta);
> -		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +		struct ieee80211_fast_tx *fast_tx;
>   
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
> -		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
> -			info->flags |= IEEE80211_TX_CTL_AMPDU;
> -		else
> -			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
> +		fast_tx = rcu_dereference(sta->fast_tx);
> +		if (!fast_tx ||
> +		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
> +						false)) {
> +			/* fast xmit was started, but fails to finish */
> +			ieee80211_free_txskb(hw, skb);
> +			goto begin;
> +		}
> +	} else {
> +		struct ieee80211_tx_data tx = { };
> +
> +		__skb_queue_head_init(&tx.skbs);
> +		tx.local = local;
> +		if (txq->sta) {
> +			struct sta_info *sta = container_of(txq->sta,
> +							    struct sta_info,
> +							    sta);
> +			tx.sta = container_of(txq->sta, struct sta_info, sta);
> +			tx.sdata = sta->sdata;
> +		} else {
> +			tx.sdata = container_of(info->control.vif,
> +					struct ieee80211_sub_if_data, vif);
> +		}
> +
> +		__skb_queue_tail(&tx.skbs, skb);
> +
> +		if (invoke_tx_handlers_late(&tx))
> +			goto begin;
> +
> +		__skb_unlink(skb, &tx.skbs);
>   	}
>   
>   out:
> @@ -1521,6 +1568,77 @@ out:
>   }
>   EXPORT_SYMBOL(ieee80211_tx_dequeue);
>   
> +static bool ieee80211_queue_skb(struct ieee80211_local *local,
> +				struct ieee80211_vif *vif,
> +				struct ieee80211_sta *sta,
> +				struct sk_buff *skb)
> +{
> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +	struct fq *fq = &local->fq;
> +	struct txq_info *txqi = ieee80211_get_txq(local, vif, sta, skb);
> +
> +	if (!txqi)
> +		return false;
> +
> +	info->control.vif = vif;
> +
> +	spin_lock_bh(&fq->lock);
> +	ieee80211_txq_enqueue(local, txqi, skb);
> +	spin_unlock_bh(&fq->lock);
> +
> +	drv_wake_tx_queue(local, txqi);
> +
> +	return true;
> +}
> +
> +static bool ieee80211_queue_frags(struct ieee80211_local *local,
> +				  struct sta_info *sta,
> +				  struct sk_buff_head *skbs)
> +{
> +	struct txq_info *txqi;
> +	struct sk_buff *skb, *tmp;
> +	struct ieee80211_tx_info *info;
> +	struct ieee80211_sub_if_data *sdata;
> +	struct ieee80211_vif *vif;
> +	struct ieee80211_sta *pubsta;
> +
> +	if (WARN_ON(skb_queue_empty(skbs)))
> +		return true;
> +
> +	skb = skb_peek(skbs);
> +	info = IEEE80211_SKB_CB(skb);
> +	sdata = vif_to_sdata(info->control.vif);
> +	if (sta && !sta->uploaded)
> +		sta = NULL;
> +
> +	if (sta)
> +		pubsta = &sta->sta;
> +	else
> +		pubsta = NULL;
> +
> +	switch (sdata->vif.type) {
> +	case NL80211_IFTYPE_MONITOR:
> +		return false;
> +	case NL80211_IFTYPE_AP_VLAN:
> +		sdata = container_of(sdata->bss,
> +				     struct ieee80211_sub_if_data, u.ap);
> +		/* fall through */
> +	default:
> +		vif = &sdata->vif;
> +		break;
> +	}
> +
> +	skb_queue_walk_safe(skbs, skb, tmp) {
> +		txqi = ieee80211_get_txq(local, vif, pubsta, skb);
> +		if (txqi) {
> +			__skb_unlink(skb, skbs);
> +			ieee80211_queue_skb(local, vif, pubsta, skb);
> +		}
> +	}
> +
> +	return !!skb_queue_empty(skbs);
> +}
> +
>   static bool ieee80211_tx_frags(struct ieee80211_local *local,
>   			       struct ieee80211_vif *vif,
>   			       struct ieee80211_sta *sta,
> @@ -1528,9 +1646,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
>   			       bool txpending)
>   {
>   	struct ieee80211_tx_control control = {};
> -	struct fq *fq = &local->fq;
>   	struct sk_buff *skb, *tmp;
> -	struct txq_info *txqi;
>   	unsigned long flags;
>   
>   	skb_queue_walk_safe(skbs, skb, tmp) {
> @@ -1545,21 +1661,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
>   		}
>   #endif
>   
> -		txqi = ieee80211_get_txq(local, vif, sta, skb);
> -		if (txqi) {
> -			info->control.vif = vif;
> -
> -			__skb_unlink(skb, skbs);
> -
> -			spin_lock_bh(&fq->lock);
> -			ieee80211_txq_enqueue(local, txqi, skb);
> -			spin_unlock_bh(&fq->lock);
> -
> -			drv_wake_tx_queue(local, txqi);
> -
> -			continue;
> -		}
> -
>   		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
>   		if (local->queue_stop_reasons[q] ||
>   		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
> @@ -1680,8 +1781,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
>   /*
>    * Invoke TX handlers, return 0 on success and non-zero if the
>    * frame was dropped or queued.
> + *
> + * The handlers are split into an early and late part. The latter is everything
> + * that can be sensitive to reordering, and will be deferred to after packets
> + * are dequeued from the intermediate queues (when they are enabled).
>    */
> -static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
> +static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
>   {
>   	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
>   	ieee80211_tx_result res = TX_DROP;
> @@ -1697,7 +1802,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>   	CALL_TXH(ieee80211_tx_h_check_assoc);
>   	CALL_TXH(ieee80211_tx_h_ps_buf);
>   	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
> -	CALL_TXH(ieee80211_tx_h_select_key);
>   	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>   		CALL_TXH(ieee80211_tx_h_rate_ctrl);
>   
> @@ -1706,11 +1810,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>   		tx->skb = NULL;
>   		goto txh_done;
>   	}
> +	CALL_TXH(ieee80211_tx_h_fragment);
> +
> + txh_done:
> +	if (unlikely(res == TX_DROP)) {
> +		I802_DEBUG_INC(tx->local->tx_handlers_drop);
> +		if (tx->skb)
> +			ieee80211_free_txskb(&tx->local->hw, tx->skb);
> +		else
> +			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
> +		return -1;
> +	} else if (unlikely(res == TX_QUEUED)) {
> +		I802_DEBUG_INC(tx->local->tx_handlers_queued);
> +		return -1;
> +	}
>   
> +	return 0;
> +}
> +
> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
> +{
> +	ieee80211_tx_result res = TX_DROP;
> +
> +	/* late tx handlers must be aware of tx info fragmentation! */
> +	CALL_TXH(ieee80211_tx_h_select_key);
>   	CALL_TXH(ieee80211_tx_h_michael_mic_add);
>   	CALL_TXH(ieee80211_tx_h_sequence);
> -	CALL_TXH(ieee80211_tx_h_fragment);
> -	/* handlers after fragment must be aware of tx info fragmentation! */
>   	CALL_TXH(ieee80211_tx_h_stats);
>   	CALL_TXH(ieee80211_tx_h_encrypt);
>   	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
> @@ -1733,6 +1858,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>   	return 0;
>   }
>   
> +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
> +{
> +	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
> +}
> +
>   bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
>   			      struct ieee80211_vif *vif, struct sk_buff *skb,
>   			      int band, struct ieee80211_sta **sta)
> @@ -1782,7 +1912,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
>   	struct ieee80211_tx_data tx;
>   	ieee80211_tx_result res_prepare;
>   	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> -	bool result = true;
> +	bool result = true, queue = !!(local->ops->wake_tx_queue);
>   	int led_len;
>   
>   	if (unlikely(skb->len < 10)) {
> @@ -1807,7 +1937,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
>   		info->hw_queue =
>   			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
>   
> -	if (!invoke_tx_handlers(&tx))
> +	if (invoke_tx_handlers_early(&tx))
> +		return false;
> +
> +	if (queue && ieee80211_queue_frags(local, tx.sta, &tx.skbs))
> +		return true;
> +
> +	if (!invoke_tx_handlers_late(&tx))
>   		result = __ieee80211_tx(local, &tx.skbs, led_len,
>   					tx.sta, txpending);
>   
> @@ -3170,10 +3306,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>   	struct ethhdr eth;
>   	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>   	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
> -	struct ieee80211_tx_data tx;
> -	ieee80211_tx_result r;
>   	struct tid_ampdu_tx *tid_tx = NULL;
>   	u8 tid = IEEE80211_NUM_TIDS;
> +	bool queue = !!(local->ops->wake_tx_queue);
>   
>   	/* control port protocol needs a lot of special handling */
>   	if (cpu_to_be16(ethertype) == sdata->control_port_protocol)
> @@ -3240,8 +3375,32 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>   	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
>   		      IEEE80211_TX_CTL_DONTFRAG |
>   		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
> +	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
> +
> +	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
> +		sdata = container_of(sdata->bss,
> +				     struct ieee80211_sub_if_data, u.ap);
> +
> +	if (queue && ieee80211_queue_skb(local, &sdata->vif, &sta->sta, skb))
> +		return true;
> +
> +	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
> +}
> +
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
> +				       struct sta_info *sta,
> +				       struct ieee80211_fast_tx *fast_tx,
> +				       struct sk_buff *skb, bool xmit)
> +{
> +	struct ieee80211_local *local = sdata->local;
> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
> +	struct ieee80211_tx_data tx;
> +	ieee80211_tx_result r;
> +	u8 tid = IEEE80211_NUM_TIDS;
>   
>   	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
> +		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>   		*ieee80211_get_qos_ctl(hdr) = tid;
>   		if (!sta->sta.txq[0])
>   			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
> @@ -3309,12 +3468,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>   		}
>   	}
>   
> -	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
> -		sdata = container_of(sdata->bss,
> -				     struct ieee80211_sub_if_data, u.ap);
> +	if (xmit) {
> +		__skb_queue_tail(&tx.skbs, skb);
> +		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
> +	}
>   
> -	__skb_queue_tail(&tx.skbs, skb);
> -	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
>   	return true;
>   }
>   
> diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
> index b48c1e1..71c479a 100644
> --- a/net/mac80211/wpa.c
> +++ b/net/mac80211/wpa.c
> @@ -28,13 +28,13 @@
>   #include "wpa.h"
>   
>   ieee80211_tx_result
> -ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
> +ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
> +				   struct sk_buff *skb)
>   {
>   	u8 *data, *key, *mic;
>   	size_t data_len;
>   	unsigned int hdrlen;
>   	struct ieee80211_hdr *hdr;
> -	struct sk_buff *skb = tx->skb;
>   	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>   	int tail;
>   
> @@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
>   	return TX_CONTINUE;
>   }
>   
> +ieee80211_tx_result
> +ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
> +{
> +	struct sk_buff *skb;
> +	ieee80211_tx_result r;
> +
> +	skb_queue_walk(&tx->skbs, skb) {
> +		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
> +		if (r != TX_CONTINUE)
> +			return r;
> +	}
> +	return TX_CONTINUE;
> +}
> +
>   
>   ieee80211_rx_result
>   ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
> -- 
> 2.9.3
> _______________________________________________
> Make-wifi-fast mailing list
> Make-wifi-fast@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/make-wifi-fast


[-- Attachment #2: Type: text/html, Size: 22242 bytes --]

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-24 22:40     ` Noah Causin
@ 2016-08-25 12:45       ` Toke Høiland-Jørgensen
  2016-08-26 14:30         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-25 12:45 UTC (permalink / raw)
  To: Noah Causin; +Cc: make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 250 bytes --]

Noah Causin <n0manletter@gmail.com> writes:

> This is what I get when I try building LEDE from the staging branch
> with this patch:

Yes, that patch requires some modifications to apply to the LEDE tree.
The attached version should work :)

-Toke


[-- Attachment #2: 345-mac80211-Move-reorder-sensitive-TX-handlers-to-af.patch --]
[-- Type: text/x-diff, Size: 19385 bytes --]

From 14a53dbb27a7a9cbdaac399aae7a37ab726bec46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@toke.dk>
Date: Tue, 23 Aug 2016 20:14:07 +0200
Subject: [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ
 dequeue.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v2:

This is a completely different approach: Instead of adding exceptions
for TXQ handling, split up the entire TX path in an early and late part,
and apply the latter after TXQ dequeue. This should fix things that
don't hit the fast path as well.

I've tested this with both unencrypted traffic and with CCMP and TKIP
and it appears to fix the previous performance regression seen with
softq-enabled ath9k. I most likely haven't hit all code paths, though
(not sure how I would even go about ensuring that), but looks promising
so far.

 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 276 ++++++++++++++++++++++++++++++++++++++-----------
 net/mac80211/wpa.c     |  18 +++-
 3 files changed, 235 insertions(+), 61 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ea4b661..2097422 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 86b5400..f4c33c7 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,8 +591,15 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = tx->skb;
+
+	if (!skb)
+		skb = skb_peek(&tx->skbs);
+
+	info = IEEE80211_SKB_CB(skb);
+	hdr = (struct ieee80211_hdr *)skb->data;
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
@@ -595,7 +608,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key = key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -625,7 +638,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb))
+					       skb))
 				tx->key = NULL;
 			else
 				skip_hw = (tx->key->conf.flags &
@@ -795,10 +808,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb = skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
 
 	/*
 	 * Packet injection may want to control the sequence
@@ -825,10 +840,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+		fragnum = 0;
+		seq = cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info = IEEE80211_SKB_CB(skb);
+			hdr = (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl = seq | fragnum++;
+		}
 		tx->sdata->sequence_number += 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -849,8 +870,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq = ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum = 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr = (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl = seq | fragnum++;
+		}
+	}
 
 	return TX_CONTINUE;
 }
@@ -923,7 +950,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	int frag_threshold = tx->local->hw.wiphy->frag_threshold;
 	int hdrlen = tx->hdrlen;
-	int fragnum;
 
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -960,9 +986,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
 
-	/* update duration/seq/flags of fragments */
-	fragnum = 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
 
@@ -983,8 +1006,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 		} else {
 			hdr->frame_control &= ~morefrags;
 		}
-		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
 
 	return TX_CONTINUE;
@@ -1473,33 +1494,59 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		if (txq->sta) {
+			struct sta_info *sta = container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = sta->sdata;
+		} else {
+			tx.sdata = container_of(info->control.vif,
+					struct ieee80211_sub_if_data, vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
 
 out:
@@ -1513,6 +1560,77 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_vif *vif,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct txq_info *txqi = ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct txq_info *txqi;
+	struct sk_buff *skb, *tmp;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_vif *vif;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	skb = skb_peek(skbs);
+	info = IEEE80211_SKB_CB(skb);
+	sdata = vif_to_sdata(info->control.vif);
+	if (sta && !sta->uploaded)
+		sta = NULL;
+
+	if (sta)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_MONITOR:
+		return false;
+	case NL80211_IFTYPE_AP_VLAN:
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+		/* fall through */
+	default:
+		vif = &sdata->vif;
+		break;
+	}
+
+	skb_queue_walk_safe(skbs, skb, tmp) {
+		txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+		if (txqi) {
+			__skb_unlink(skb, skbs);
+			ieee80211_queue_skb(local, vif, pubsta, skb);
+		}
+	}
+
+	return !!skb_queue_empty(skbs);
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1520,9 +1638,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1537,21 +1653,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1672,8 +1773,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
@@ -1689,7 +1794,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1698,11 +1802,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 		tx->skb = NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
 
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res = TX_DROP;
+
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1725,6 +1850,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1774,7 +1904,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_tx_data tx;
 	ieee80211_tx_result res_prepare;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	bool result = true;
+	bool result = true, queue = !!(local->ops->wake_tx_queue);
 	int led_len;
 
 	if (unlikely(skb->len < 10)) {
@@ -1799,7 +1929,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (queue && ieee80211_queue_frags(local, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3183,10 +3319,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
+	bool queue = !!(local->ops->wake_tx_queue);
 
 	/* control port protocol needs a lot of special handling */
 	if (cpu_to_be16(ethertype) == sdata->control_port_protocol)
@@ -3253,8 +3388,32 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	if (queue && ieee80211_queue_skb(local, &sdata->vif, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
 		if (!sta->sta.txq[0])
 			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
@@ -3322,12 +3481,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b48c1e1..71c479a 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
 
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb = tx->skb;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int tail;
 
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r != TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
 
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
-- 
2.9.3


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-22 14:47         ` Toke Høiland-Jørgensen
@ 2016-08-26  8:38           ` Johannes Berg
  2016-08-26  8:54             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-08-26  8:38 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Dave Taht, make-wifi-fast, linux-wireless, Felix Fietkau

On Mon, 2016-08-22 at 16:47 +0200, Toke Høiland-Jørgensen wrote:
> 
> I suppose that could be a way to do it (i.e. have
> ieee80211_tx_dequeue call all the TX hooks etc), but am not sure
> whether there would be problems doing all this work in the loop
> that's building aggregates (which is what would happen for ath9k at
> least).

I don't know, but it seems that it's worth trying.

> An alternative could be to split the process up in two: An "early"
> and "late" stage, where the early stage does everything that is not
> sensitive to reordering and the occasional drop, and the late stage
> is everything that is. Then the queueing step could happen in-between 
> the two stages, and the non-queueing path could just call both stages
> at once. In effect, this would just make the current work-arounds be
> more explicit in the structure, rather than being marked as
> exceptions.

I'm not sure that works the way you think it does.

What you did works for fast-xmit, but *only* because that doesn't do
software crypto. If, for some reason, the TXQ stuff combines with
software crypto, which doesn't seem impossible (ath9k even has a module
parameter, iirc), then you have no way for this to work.
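
(As a rough illustration of why that combination breaks: CCMP/GCMP
receivers run a replay check on the PN, so frames that had their PN
assigned before the intermediate queues reordered them get thrown away.
This is only the shape of that check, not the actual mac80211 receive
code:)

#include <string.h>

/*
 * 48-bit PNs must strictly increase as seen by the receiver; both
 * arrays hold the PN with the most significant byte first.
 */
static int pn_is_replay(const unsigned char pn[6],
			const unsigned char last_accepted_pn[6])
{
	/* nonzero means the frame is dropped as a replay */
	return memcmp(pn, last_accepted_pn, 6) <= 0;
}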

> > Now, it's unlikely to be that simple - fragmentation, for example,
> > might mess this up.
> > 
> > Overall though, I'm definitely wondering if it should be this way,
> > since all the special cases just add complexity.
> 
> I agree that the work-arounds are iffy, but I do also think it's
> important to keep in mind that we are improving latency by orders of
> magnitude here. A few special cases are worth it to achieve that, IMO.
> And then iterating towards a design that doesn't need them, of course
> :)

I don't really agree; I'm going to treat this like any other feature,
which gets merged when it's ready.

Right now, your code here obviously isn't, since it doesn't even
address the cases that ath9k could run into, so either ath9k shouldn't
use this mac80211 feature, or the mac80211 feature needs to be fixed
before ath9k can use it.

I have no problems with documenting that the TXQ stuff can only be used
with full hardware crypto, but then we should add some checks and
warnings in mac80211 to ensure that, i.e. not allow software keys when
TXQ stuff is used, nor allow keys with mac80211 PN assignment, etc.
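
(A minimal sketch of what such a guard could look like; the helper and
the place it would be called from are hypothetical, and only
local->ops->wake_tx_queue and the two key flags below are existing
mac80211 symbols:)

static int ieee80211_txq_key_allowed(struct ieee80211_local *local,
				     struct ieee80211_key_conf *conf,
				     bool accepted_by_hw)
{
	/* no intermediate queues, nothing extra to enforce */
	if (!local->ops->wake_tx_queue)
		return 0;

	/* software crypto would run before the reordering queues */
	if (!accepted_by_hw)
		return -EINVAL;

	/* hardware key, but mac80211 would still assign the IV/PN */
	if (conf->flags & (IEEE80211_KEY_FLAG_GENERATE_IV |
			   IEEE80211_KEY_FLAG_PUT_IV_SPACE))
		return -EINVAL;

	return 0;
}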

Even QoS-seqno assignment will be broken btw, so you do need a bunch
more offloads to make this work.

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-26  8:38           ` Johannes Berg
@ 2016-08-26  8:54             ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-26  8:54 UTC (permalink / raw)
  To: Johannes Berg; +Cc: Dave Taht, make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

> On Mon, 2016-08-22 at 16:47 +0200, Toke Høiland-Jørgensen wrote:
>> 
>> I suppose that could be a way to do it (i.e. have
>> ieee80211_tx_dequeue call all the TX hooks etc), but am not sure
>> whether there would be problems doing all this work in the loop
>> that's building aggregates (which is what would happen for ath9k at
>> least).
>
> I don't know, but it seems that it's worth trying.
>
>> An alternative could be to split the process up in two: An "early"
>> and "late" stage, where the early stage does everything that is not
>> sensitive to reordering and the occasional drop, and the late stage
>> is everything that is. Then the queueing step could happen in-between 
>> the two stages, and the non-queueing path could just call both stages
>> at once. In effect, this would just make the current work-arounds be
>> more explicit in the structure, rather than being marked as
>> exceptions.
>
> I'm not sure that works the way you think it does.
>
> What you did works for fast-xmit, but *only* because that doesn't do
> software crypto. If, for some reason, the TXQ stuff combines with
> software crypto, which doesn't seem impossible (ath9k even has a module
> parameter, iirc), then you have no way for this to work.

Yeah, I realised that when I started reviewing the slow path (sorry for
not realising that straight away). The v3 takes the "split handlers"
approach for this reason. That saved having to deal with fragmentation
on TXQ dequeue, and it means that some of the processing can be done
before queueing (such as GSO splitting; having packets be as small as
possible before applying FQ to them is a good thing if we want to
realise the full potential).
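
(For reference, a condensed sketch of how the v3 patch wires this up in
ieee80211_tx(); function names as in that patch, LED-length and error
bookkeeping elided:)

static bool ieee80211_tx_split_sketch(struct ieee80211_local *local,
				      struct ieee80211_tx_data *tx,
				      int led_len, bool txpending)
{
	bool queue = !!(local->ops->wake_tx_queue);

	/* early handlers: not reorder-sensitive, includes fragmentation */
	if (invoke_tx_handlers_early(tx))
		return false;

	/* TXQ path: park the frames, the late handlers run on dequeue */
	if (queue && ieee80211_queue_frags(local, tx->sta, &tx->skbs))
		return true;

	/* non-TXQ path: run the reorder-sensitive handlers immediately */
	if (invoke_tx_handlers_late(tx))
		return true;

	return __ieee80211_tx(local, &tx->skbs, led_len, tx->sta, txpending);
}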

It seems there are still some bugs to work out with that patch, but I'd
be grateful if you could glance at it and comment on whether you think
this is a viable way forward (provided we can work out all the bugs, of
course).

>> > Now, it's unlikely to be that simple - fragmentation, for example,
>> > might mess this up.
>> > 
>> > Overall though, I'm definitely wondering if it should be this way,
>> > since all the special cases just add complexity.
>> 
>> I agree that the work-arounds are iffy, but I do also think it's
>> important to keep in mind that we are improving latency by orders of
>> magnitude here. A few special cases are worth it to achieve that, IMO.
>> And then iterating towards a design that doesn't need them, of course
>> :)
>
> I don't really agree, I'm not going to treat this unlike any other
> feature, which gets merged when it's ready for that.
>
> Right now, your code here obviously isn't, since it doesn't even
> address the cases that ath9k could run into, so either ath9k shouldn't
> use this mac80211 feature, or the mac80211 feature needs to be fixed
> before ath9k can use it.

Yeah, I agree now that I've looked at it some more :)

> I have no problems with documenting that the TXQ stuff can only be
> used with full hardware crypto, but then we should add some checks and
> warnings in mac80211 to ensure that, i.e. not allow software keys when
> TXQ stuff is used, nor allow keys with mac80211 PN assignment, etc.

I'd much rather fix things so this works in all cases. My patch to ath9k
to use this stuff completely removes the old TX path, and things like
the airtime fairness scheduler need the intermediate queues to work.

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-25 12:45       ` Toke Høiland-Jørgensen
@ 2016-08-26 14:30         ` Toke Høiland-Jørgensen
  2016-08-26 14:51           ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-26 14:30 UTC (permalink / raw)
  To: Noah Causin; +Cc: make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 437 bytes --]

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> Noah Causin <n0manletter@gmail.com> writes:
>
>> This is what I get when I try building LEDE from the staging branch
>> with this patch:
>
> Yes, that patch requires some modifications to apply to the LEDE tree.
> The attached version should work :)

And here is an updated version that hopefully fixes (some of) the issues
Dave found during his initial testing.

-Toke


[-- Attachment #2: 345-mac80211-Move-reorder-sensitive-TX-handlers-to-af.patch --]
[-- Type: text/x-diff, Size: 18128 bytes --]

commit c53c88316936223bc189cf717ab1b5e846c279ed
Author: Toke Høiland-Jørgensen <toke@toke.dk>
Date:   Tue Aug 23 20:14:07 2016 +0200

    mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
    
    The TXQ intermediate queues can cause packet reordering when more than
    one flow is active to a single station. Since some of the wifi-specific
    packet handling (notably sequence number and encryption handling) is
    sensitive to re-ordering, things break if they are applied before the
    TXQ.
    
    This splits up the TX handlers and fast_xmit logic into two parts: An
    early part and a late part. The former is applied before TXQ enqueue,
    and the latter after dequeue. The non-TXQ path just applies both parts
    at once.
    
    To avoid having to deal with fragmentation on dequeue, the split is set
    to be after the fragmentation handler. This means that some reordering
    of TX handlers is necessary, and some handlers had to be made aware of
    fragmentation due to this reordering.
    
    This approach avoids having to scatter special cases for when TXQ is
    enabled, at the cost of making the fast_xmit and TX handler code
    slightly more complex.
    
    Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c9def15..fe8206b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -710,7 +710,8 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -717,7 +719,8 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 8dc9ae9..344671f 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,8 +591,15 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = tx->skb;
+
+	if (!skb)
+		skb = skb_peek(&tx->skbs);
+
+	info = IEEE80211_SKB_CB(skb);
+	hdr = (struct ieee80211_hdr *)skb->data;
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
@@ -595,7 +608,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key = key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -625,7 +638,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb))
+					       skb))
 				tx->key = NULL;
 			else
 				skip_hw = (tx->key->conf.flags &
@@ -795,10 +808,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb = skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
 
 	/*
 	 * Packet injection may want to control the sequence
@@ -825,10 +840,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+		fragnum = 0;
+		seq = cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info = IEEE80211_SKB_CB(skb);
+			hdr = (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl = seq | fragnum++;
+		}
 		tx->sdata->sequence_number += 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -849,8 +870,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq = ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum = 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr = (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl = seq | fragnum++;
+		}
+	}
 
 	return TX_CONTINUE;
 }
@@ -923,7 +950,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	int frag_threshold = tx->local->hw.wiphy->frag_threshold;
 	int hdrlen = tx->hdrlen;
-	int fragnum;
 
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -960,9 +986,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
 
-	/* update duration/seq/flags of fragments */
-	fragnum = 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
 
@@ -983,8 +1006,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 		} else {
 			hdr->frame_control &= ~morefrags;
 		}
-		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
 
 	return TX_CONTINUE;
@@ -1473,33 +1494,59 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		if (txq->sta) {
+			struct sta_info *sta = container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = sta->sdata;
+		} else {
+			tx.sdata = container_of(info->control.vif,
+					struct ieee80211_sub_if_data, vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
 
 out:
@@ -1509,6 +1556,71 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+
+	if (!local->ops->wake_tx_queue)
+		return false;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct sk_buff *skb;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	while(!skb_queue_empty(skbs)) {
+		skb = __skb_dequeue(skbs);
+		if (unlikely(!ieee80211_queue_skb(local, sdata, pubsta, skb))) {
+			__skb_queue_head(skbs, skb);
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1516,9 +1628,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1533,21 +1643,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1668,8 +1763,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
@@ -1685,7 +1784,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1694,11 +1792,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 		tx->skb = NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
 
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res = TX_DROP;
+
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1721,6 +1840,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1795,7 +1919,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_frags(local, sdata, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -2979,8 +3109,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
 
@@ -3045,11 +3173,30 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
@@ -3114,12 +3261,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 1884825..48270a91 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
 
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb = tx->skb;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int tail;
 
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r != TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
 
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-26 14:30         ` Toke Høiland-Jørgensen
@ 2016-08-26 14:51           ` Dave Taht
  0 siblings, 0 replies; 77+ messages in thread
From: Dave Taht @ 2016-08-26 14:51 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: Noah Causin, make-wifi-fast

nope. hangs on osx.

On Fri, Aug 26, 2016 at 7:30 AM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> Toke Høiland-Jørgensen <toke@toke.dk> writes:
>
>> Noah Causin <n0manletter@gmail.com> writes:
>>
>>> This is what I get when I try building LEDE from the staging branch
>>> with this patch:
>>
>> Yes, that patch requires some modifications to apply to the LEDE tree.
>> The attached version should work :)
>
> And here is an updated version that hopefully fixes (some of) the issues
> Dave found during his initial testing.
>
> -Toke
>
>
> _______________________________________________
> Make-wifi-fast mailing list
> Make-wifi-fast@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/make-wifi-fast
>



-- 
Dave Täht
Let's go make home routers and wifi faster! With better software!
http://blog.cerowrt.org

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-24 16:20   ` [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
  2016-08-24 22:40     ` Noah Causin
@ 2016-08-30 13:15     ` Toke Høiland-Jørgensen
  2016-08-30 13:17       ` Toke Høiland-Jørgensen
                         ` (2 more replies)
  1 sibling, 3 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-30 13:15 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v3:
  - Fix sequence number assignment in the fast path.
  - Code cleanup.

 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 269 ++++++++++++++++++++++++++++++++++++++-----------
 net/mac80211/wpa.c     |  18 +++-
 3 files changed, 227 insertions(+), 62 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..56dca2d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = tx->skb;
+
+	if (!skb)
+		skb = skb_peek(&tx->skbs);
+
+	info = IEEE80211_SKB_CB(skb);
+	hdr = (struct ieee80211_hdr *)skb->data;
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
 	else if (tx->sta &&
 		 (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
 		tx->key = key;
-	else if (ieee80211_is_group_privacy_action(tx->skb) &&
+	else if (ieee80211_is_group_privacy_action(skb) &&
 		(key = rcu_dereference(tx->sdata->default_multicast_key)))
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key = key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb) &&
-			    !ieee80211_is_group_privacy_action(tx->skb))
+					       skb) &&
+			    !ieee80211_is_group_privacy_action(skb))
 				tx->key = NULL;
 			else
 				skip_hw = (tx->key->conf.flags &
@@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb = skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
 
 	/*
 	 * Packet injection may want to control the sequence
@@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+		fragnum = 0;
+		seq = cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info = IEEE80211_SKB_CB(skb);
+			hdr = (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl = seq | fragnum++;
+		}
 		tx->sdata->sequence_number += 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -853,8 +874,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq = ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum = 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr = (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl = seq | fragnum++;
+		}
+	}
 
 	return TX_CONTINUE;
 }
@@ -927,7 +954,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	int frag_threshold = tx->local->hw.wiphy->frag_threshold;
 	int hdrlen;
-	int fragnum;
 
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -964,9 +990,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
 
-	/* update duration/seq/flags of fragments */
-	fragnum = 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
 
@@ -987,8 +1010,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 		} else {
 			hdr->frame_control &= ~morefrags;
 		}
-		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
 
 	return TX_CONTINUE;
@@ -1481,33 +1502,58 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		if (txq->sta) {
+			struct sta_info *sta = container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
 
 out:
@@ -1521,6 +1567,71 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+
+	if (!local->ops->wake_tx_queue)
+		return false;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct sk_buff *skb;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	while (!skb_queue_empty(skbs)) {
+		skb = __skb_dequeue(skbs);
+		if (unlikely(!ieee80211_queue_skb(local, sdata, pubsta, skb))) {
+			__skb_queue_head(skbs, skb);
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1639,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1654,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,8 +1774,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
@@ -1697,7 +1795,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1706,11 +1803,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 		tx->skb = NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
 
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res = TX_DROP;
+
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1733,6 +1851,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1930,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_frags(local, sdata, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3170,8 +3299,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
 
@@ -3240,11 +3367,30 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
@@ -3309,12 +3455,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b48c1e1..71c479a 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
 
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb = tx->skb;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int tail;
 
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r != TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
 
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-30 13:15     ` [Make-wifi-fast] [PATCH v4] " Toke Høiland-Jørgensen
@ 2016-08-30 13:17       ` Toke Høiland-Jørgensen
  2016-08-31 21:06       ` Johannes Berg
  2016-09-01 16:03       ` [Make-wifi-fast] [PATCH v5] " Toke Høiland-Jørgensen
  2 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-30 13:17 UTC (permalink / raw)
  To: make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 1216 bytes --]

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> To avoid having to deal with fragmentation on dequeue, the split is set
> to be after the fragmentation handler. This means that some reordering
> of TX handlers is necessary, and some handlers had to be made aware of
> fragmentation due to this reordering.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>


And here's a version suitable for dropping into a LEDE build... Just
stick it in package/kernel/mac80211/patches.

-Toke


[-- Attachment #2: 345-mac80211-Move-reorder-sensitive-TX-handlers-to-af.patch --]
[-- Type: text/x-diff, Size: 18068 bytes --]

commit ab5b78ff996ee74caf3a6312ff6fc0e29f7db5e2
Author: Toke Høiland-Jørgensen <toke@toke.dk>
Date:   Tue Aug 23 20:14:07 2016 +0200

    mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
    
    The TXQ intermediate queues can cause packet reordering when more than
    one flow is active to a single station. Since some of the wifi-specific
    packet handling (notably sequence number and encryption handling) is
    sensitive to re-ordering, things break if they are applied before the
    TXQ.
    
    This splits up the TX handlers and fast_xmit logic into two parts: An
    early part and a late part. The former is applied before TXQ enqueue,
    and the latter after dequeue. The non-TXQ path just applies both parts
    at once.
    
    To avoid having to deal with fragmentation on dequeue, the split is set
    to be after the fragmentation handler. This means that some reordering
    of TX handlers is necessary, and some handlers had to be made aware of
    fragmentation due to this reordering.
    
    This approach avoids having to scatter special cases for when TXQ is
    enabled, at the cost of making the fast_xmit and TX handler code
    slightly more complex.
    
    Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c9def15..fe8206b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -710,7 +710,8 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -717,7 +719,8 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 8dc9ae9..2b41824 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,8 +591,15 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = tx->skb;
+
+	if (!skb)
+		skb = skb_peek(&tx->skbs);
+
+	info = IEEE80211_SKB_CB(skb);
+	hdr = (struct ieee80211_hdr *)skb->data;
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
@@ -595,7 +608,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key = key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -625,7 +638,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb))
+					       skb))
 				tx->key = NULL;
 			else
 				skip_hw = (tx->key->conf.flags &
@@ -795,10 +808,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb = skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
 
 	/*
 	 * Packet injection may want to control the sequence
@@ -825,10 +840,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+		fragnum = 0;
+		seq = cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info = IEEE80211_SKB_CB(skb);
+			hdr = (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl = seq | fragnum++;
+		}
 		tx->sdata->sequence_number += 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -849,8 +870,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq = ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum = 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr = (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl = seq | fragnum++;
+		}
+	}
 
 	return TX_CONTINUE;
 }
@@ -923,7 +950,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	int frag_threshold = tx->local->hw.wiphy->frag_threshold;
 	int hdrlen = tx->hdrlen;
-	int fragnum;
 
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -960,9 +986,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
 
-	/* update duration/seq/flags of fragments */
-	fragnum = 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
 
@@ -983,8 +1006,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 		} else {
 			hdr->frame_control &= ~morefrags;
 		}
-		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
 
 	return TX_CONTINUE;
@@ -1477,12 +1498,14 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -1490,16 +1513,42 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		tx.hdrlen = ieee80211_hdrlen(hdr->frame_control);
+		if (txq->sta) {
+			struct sta_info *sta = container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
 
 out:
@@ -1509,6 +1557,71 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+
+	if (!local->ops->wake_tx_queue)
+		return false;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct sk_buff *skb;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	while (!skb_queue_empty(skbs)) {
+		skb = __skb_dequeue(skbs);
+		if (unlikely(!ieee80211_queue_skb(local, sdata, pubsta, skb))) {
+			__skb_queue_head(skbs, skb);
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1516,9 +1629,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1533,21 +1644,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1668,8 +1764,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
@@ -1685,7 +1785,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1694,11 +1793,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 		tx->skb = NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res = TX_DROP;
 
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1721,6 +1841,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1795,7 +1920,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_frags(local, sdata, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -2979,8 +3110,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
 
@@ -3045,11 +3174,30 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
@@ -3114,12 +3262,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 1884825..48270a91 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
 
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb = tx->skb;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int tail;
 
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	return TX_CONTINUE;
 }
 
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r != TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
 
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-30 13:15     ` [Make-wifi-fast] [PATCH v4] " Toke Høiland-Jørgensen
  2016-08-30 13:17       ` Toke Høiland-Jørgensen
@ 2016-08-31 21:06       ` Johannes Berg
  2016-09-01  8:23         ` Toke Høiland-Jørgensen
  2016-09-01 16:03       ` [Make-wifi-fast] [PATCH v5] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-08-31 21:06 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On Tue, 2016-08-30 at 15:15 +0200, Toke Høiland-Jørgensen wrote:

> @@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct
> ieee80211_tx_data *tx)
>  	 */
>  	if (!ieee80211_is_data_qos(hdr->frame_control) ||
>  	    is_multicast_ether_addr(hdr->addr1)) {
> -		/* driver should assign sequence number */
> -		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
> -		/* for pure STA mode without beacons, we can do it
> */
> -		hdr->seq_ctrl = cpu_to_le16(tx->sdata-
> >sequence_number);
> +		fragnum = 0;
> +		seq = cpu_to_le16(tx->sdata->sequence_number);
> +		skb_queue_walk(&tx->skbs, skb) {
> +			info = IEEE80211_SKB_CB(skb);
> +			hdr = (struct ieee80211_hdr *)skb->data;
> +			/* driver should assign sequence number */
> +			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
> +			/* for pure STA mode without beacons, we can
> do it */
> +			hdr->seq_ctrl = seq | fragnum++;

I would very much prefer you kept the fragnum assignment in the
fragmentation handler.

Also, you just broke this on big endian, please run sparse on your
patches if you don't see these things directly.

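For illustration only (a rough, untested sketch, not necessarily how the
final fix should look): keeping the sequence number as __le16 end to end
would keep sparse happy, since seq_ctrl and IEEE80211_SCTL_FRAG are
little-endian on the wire anyway:

	__le16 seq = cpu_to_le16(tx->sdata->sequence_number);
	u16 fragnum = 0;

	skb_queue_walk(&tx->skbs, skb) {
		info = IEEE80211_SKB_CB(skb);
		hdr = (struct ieee80211_hdr *)skb->data;
		/* driver should assign sequence number */
		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
		hdr->seq_ctrl = seq | cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
		fragnum++;
	}

(Purely illustrative; the exact shape of the fix is up to you.)
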
> +		if (!fast_tx ||
> +		    !ieee80211_xmit_fast_finish(sta->sdata, sta,
> fast_tx, skb,
> +						false)) {
> +			/* fast xmit was started, but fails to
> finish */
> +			ieee80211_free_txskb(hw, skb);
> +			goto begin;
> +		}

That obviously cannot happen, it can't fail to finish. See the comments
in xmit_fast() and the return values ...

> +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
> +{
> +	return invoke_tx_handlers_early(tx) ||
> invoke_tx_handlers_late(tx);
> +}

Ugh, please, no, don't be tricky where it's not necessary. Now every
person reading this has to first look up the return type, and then the
return value, and make sure they understand that success is actually
the value 0 ... that's way too much to ask.
 
> +ieee80211_tx_result
> +ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
> +{
> +	struct sk_buff *skb;
> +	ieee80211_tx_result r;
> +
> +	skb_queue_walk(&tx->skbs, skb) {
> +		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
> +		if (r != TX_CONTINUE)
> +			return r;
> +	}
> +	return TX_CONTINUE;
> +}

You just broke TKIP completely again. Adding the MMIC and fragmentation
are not commutative operations.

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-31 21:06       ` Johannes Berg
@ 2016-09-01  8:23         ` Toke Høiland-Jørgensen
  2016-09-01  8:34           ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  8:23 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>> +{
>> +	return invoke_tx_handlers_early(tx) ||
>> invoke_tx_handlers_late(tx);
>> +}
>
> Ugh, please, no, don't be tricky where it's not necessary. Now every
> person reading this has to first look up the return type, and then the
> return value, and make sure they understand that success is actually
> the value 0 ... that's way too much to ask.

Noted. Any objections to turning these into bool return types?


I'll go through and fix your other comments and send a new version.
Thanks for the feedback :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  8:23         ` Toke Høiland-Jørgensen
@ 2016-09-01  8:34           ` Johannes Berg
  2016-09-01  8:38             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-01  8:34 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> > > +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
> > > +{
> > > +	return invoke_tx_handlers_early(tx) ||
> > > invoke_tx_handlers_late(tx);
> > > +}
> > 
> > Ugh, please, no, don't be tricky where it's not necessary. Now
> > every
> > person reading this has to first look up the return type, and then
> > the
> > return value, and make sure they understand that success is
> > actually
> > the value 0 ... that's way too much to ask.
> 
> Noted. Any objections to turning these into bool return types?

They have three possible values ... :)

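(For reference, the TX handlers return an ieee80211_tx_result, which in
mac80211's internal header looks roughly like:

	typedef enum {
		TX_CONTINUE,
		TX_DROP,
		TX_QUEUED
	} ieee80211_tx_result;

so a plain bool can't express all of them.)
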
johannes


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  8:34           ` Johannes Berg
@ 2016-09-01  8:38             ` Toke Høiland-Jørgensen
  2016-09-01  9:07               ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  8:38 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> > > +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>> > > +{
>> > > +	return invoke_tx_handlers_early(tx) ||
>> > > invoke_tx_handlers_late(tx);
>> > > +}
>> > 
>> > Ugh, please, no, don't be tricky where it's not necessary. Now
>> > every
>> > person reading this has to first look up the return type, and then
>> > the
>> > return value, and make sure they understand that success is
>> > actually
>> > the value 0 ... that's way too much to ask.
>> 
>> Noted. Any objections to turning these into bool return types?
>
> They have three possible values ... :)

Ah, no, not the handlers themselves. Meant the invoke_tx_handlers()
function (or all three of them after my patch; hence the plural). To
avoid the "0 means true" confusion you alluded to :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  8:38             ` Toke Høiland-Jørgensen
@ 2016-09-01  9:07               ` Johannes Berg
  2016-09-01  9:20                 ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-01  9:07 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> > They have three possible values ... :)
> 
> Ah, no, not the handlers themselves. Meant the invoke_tx_handlers()
> function (or all three of them after my patch; hence the plural). To
> avoid the "0 means true" confusion you alluded to :)
> 

Ah. Actually, even I got confused and thought the return value *was*
the same as the handler.

I think there's no point in being tricky; gcc is probably going to (have
to) generate exactly the same code as when you explicitly put an if
statement in there, it seems?

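E.g. spelling it out (just a sketch) should compile to the same thing and
is easier on the reader:

	static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
	{
		if (invoke_tx_handlers_early(tx))
			return -1;

		return invoke_tx_handlers_late(tx);
	}
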
johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  9:07               ` Johannes Berg
@ 2016-09-01  9:20                 ` Toke Høiland-Jørgensen
  2016-09-01  9:27                   ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  9:20 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> > They have three possible values ... :)
>> 
>> Ah, no, not the handlers themselves. Meant the invoke_tx_handlers()
>> function (or all three of them after my patch; hence the plural). To
>> avoid the "0 means true" confusion you alluded to :)
>> 
>
> Ah. Actually, even I got confused and thought the return value *was*
> the same as the handler.
>
> I think there's no point in being tricky; gcc is probably going to (have
> to) generate exactly the same code as when you explicitly put an if
> statement in there, it seems?

Yeah, was going to do that anyway. But since I'm touching the code
anyway, this might be an opportunity to avoid constructs like this:

if (!invoke_tx_handlers(tx))
  /* continue sending the packet */

Most other succeed/fail functions seem to be of type bool, so it would
help consistency as well. Unless there is some particular reason why
this function happens to be using 0 to indicate success?

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  9:20                 ` Toke Høiland-Jørgensen
@ 2016-09-01  9:27                   ` Johannes Berg
  2016-09-01  9:42                     ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-01  9:27 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> Yeah, was going to do that anyway. But since I'm touching the code
> anyway, this might be an opportunity to avoid constructs like this:
> 
> if (!invoke_tx_handlers(tx))
>   /* continue sending the packet */
> 
> Most other succeed/fail functions seem to be of type bool, so it
> would help consistency as well. Unless there is some particular
> reason why this function happens to be using 0 to indicate success?
> 

It's just convention in the kernel, really.

IMHO if a function has a bool return value it should have a more
expressive name that better indicates what's going on, e.g.

bool ieee80211_is_radar_required(...);

but of course that's not always done.
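
To illustrate with a made-up example (struct widget and its members are
just placeholders, not anything in mac80211):

struct widget {
	bool ready;
};

/* kernel convention: 0 on success, non-zero (usually -errno) on failure;
 * the name alone doesn't tell you what the return value means */
static int widget_setup(struct widget *w)
{
	if (!w)
		return -EINVAL;
	w->ready = true;
	return 0;
}

/* bool return: the name states the predicate, so the call site reads
 * naturally as if (widget_is_ready(w)) ... */
static bool widget_is_ready(const struct widget *w)
{
	return w && w->ready;
}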

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  9:27                   ` Johannes Berg
@ 2016-09-01  9:42                     ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  9:42 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> Yeah, was going to do that anyway. But since I'm touching the code
>> anyway, this might be an opportunity to avoid constructs like this:
>> 
>> if (!invoke_tx_handlers(tx))
>>   /* continue sending the packet */
>> 
>> Most other succeed/fail functions seem to be of type bool, so it
>> would help consistency as well. Unless there is some particular
>> reason why this function happens to be using 0 to indicate success?
>> 
>
> It's just convention in the kernel, really.
>
> IMHO if a function has a bool return value it should have a more
> expressive name that better indicates what's going on, e.g.
>
> bool ieee80211_is_radar_required(...);
>
> but of course that's not always done.

Well, it's applied somewhat inconsistently across mac80211, it seems
(e.g. ieee80211_tx() and ieee80211_tx_prepare_skb() are bool, while
invoke_tx_handlers() and ieee80211_skb_resize() are int). But okay,
don't have that strong an opinion about the colour of this particular
bikeshed so I'll keep it the way it is ;)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-30 13:15     ` [Make-wifi-fast] [PATCH v4] " Toke Høiland-Jørgensen
  2016-08-30 13:17       ` Toke Høiland-Jørgensen
  2016-08-31 21:06       ` Johannes Berg
@ 2016-09-01 16:03       ` Toke Høiland-Jørgensen
  2016-09-01 17:59         ` Johannes Berg
                           ` (2 more replies)
  2 siblings, 3 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01 16:03 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v4:
- Keep fragnum assignment in fragmentation handler and fix endianness
  issues in seqno handler.
- Assume xmit_fast_finish can't fail in dequeue handler (and warn if
  fast_tx handle disappears).
- Move TKIP MIC and key selection handlers back before fragmentation
  handler. Turns out the MIC doesn't actually depend on a global
  sequence number, so it can be before the intermediate queueing step.
  The only cost of this is running the key selection handler twice in
  some cases.
- Improve readability of the composite invoke_tx_handlers() function.


 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 266 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 214 insertions(+), 54 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..f7373c2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = tx->skb;
+
+	if (!skb)
+		skb = skb_peek(&tx->skbs);
+
+	info = IEEE80211_SKB_CB(skb);
+	hdr = (struct ieee80211_hdr *)skb->data;
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
 	else if (tx->sta &&
 		 (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
 		tx->key = key;
-	else if (ieee80211_is_group_privacy_action(tx->skb) &&
+	else if (ieee80211_is_group_privacy_action(skb) &&
 		(key = rcu_dereference(tx->sdata->default_multicast_key)))
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key = key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb) &&
-			    !ieee80211_is_group_privacy_action(tx->skb))
+					       skb) &&
+			    !ieee80211_is_group_privacy_action(skb))
 				tx->key = NULL;
 			else
 				skip_hw = (tx->key->conf.flags &
@@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb = skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	__le16 seq;
 
 	/*
 	 * Packet injection may want to control the sequence
@@ -829,10 +844,15 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+		seq = cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info = IEEE80211_SKB_CB(skb);
+			hdr = (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl |= seq;
+		}
 		tx->sdata->sequence_number += 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -853,8 +873,13 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq = ieee80211_tx_next_seq(tx->sta, tid);
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr = (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl |= seq;
+		}
+	}
 
 	return TX_CONTINUE;
 }
@@ -1481,33 +1506,57 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost the fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb, false);
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		if (txq->sta) {
+			struct sta_info *sta = container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
 
 out:
@@ -1521,6 +1570,71 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+
+	if (!local->ops->wake_tx_queue)
+		return false;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct sk_buff *skb;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	while (!skb_queue_empty(skbs)) {
+		skb = __skb_dequeue(skbs);
+		if (unlikely(!ieee80211_queue_skb(local, sdata, pubsta, skb))) {
+			__skb_queue_head(skbs, skb);
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1642,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1657,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,8 +1777,12 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
@@ -1708,9 +1807,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	}

 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
-	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* late tx handlers must be aware of tx info fragmentation! */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res = TX_DROP;
+
+	if (!tx->key) /* Not set unless early and late handlers were chained. */
+		CALL_TXH(ieee80211_tx_h_select_key);
+	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1733,6 +1856,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1939,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_frags(local, sdata, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3170,8 +3308,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
 
@@ -3240,11 +3376,30 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
@@ -3309,12 +3464,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 16:03       ` [Make-wifi-fast] [PATCH v5] " Toke Høiland-Jørgensen
@ 2016-09-01 17:59         ` Johannes Berg
  2016-09-01 18:30           ` Toke Høiland-Jørgensen
  2016-09-02  2:48         ` Jason Andryuk
  2016-09-02 13:41         ` [Make-wifi-fast] [PATCH v6] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-01 17:59 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless


> To avoid having to deal with fragmentation on dequeue, the split is
> set to be after the fragmentation handler. This means that some
> reordering of TX handlers is necessary, and some handlers had to be
> made aware of fragmentation due to this reordering.

Come to think of it, that's actually counterproductive.

If a fragment is dropped, or even just if fragments are reordered, the
receiver will not be able to defragment the frame, and will thus drop
it. Therefore, it's all-or-nothing, and we shouldn't transmit any
fragment if we drop/reorder one (*).

So ... I think you'll just have to deal with fragmentation on the
codel/fq/whatever queues and keep fragments together, or do
fragmentation afterwards.

johannes


(*) also, couldn't this mean that we send something completely stupid
like

seq=1,frag=0
seq=2,frag=0
seq=2,frag=1
seq=2,frag=1

if reordering happened?

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 17:59         ` Johannes Berg
@ 2016-09-01 18:30           ` Toke Høiland-Jørgensen
  2016-09-01 18:35             ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01 18:30 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> To avoid having to deal with fragmentation on dequeue, the split is
>> set to be after the fragmentation handler. This means that some
>> reordering of TX handlers is necessary, and some handlers had to be
>> made aware of fragmentation due to this reordering.
>
> Come to think of it, that's actually counterproductive.
>
> If a fragment is dropped, or even just if fragments are reordered, the
> receiver will not be able to defragment the frame, and will thus drop
> it. Therefore, it's all-or-nothing, and we shouldn't transmit any
> fragment if we drop/reorder one (*).
>
> So ... I think you'll just have to deal with fragmentation on the
> codel/fq/whatever queues and keep fragments together, or do
> fragmentation afterwards.

Hmm, guess that makes sense. Bugger. Will think about how to do that.
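
Maybe something along the lines of keeping a per-TXQ list of fragments
that gets drained before dequeueing anything new from the FQ structure,
so a fragment burst can never be split up or reordered. Rough sketch
(untested):

	/* in ieee80211_tx_dequeue(), before pulling from the FQ structure: */
	skb = __skb_dequeue(&txqi->frags);
	if (skb)
		goto out;

	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
	...

	/* after running the late handlers (which include fragmentation),
	 * send the first fragment and park the rest for later dequeues: */
	skb = __skb_dequeue(&tx.skbs);
	if (!skb_queue_empty(&tx.skbs))
		skb_queue_splice_tail(&tx.skbs, &txqi->frags);

The frags list would of course also need to be purged along with the
rest of the txq on teardown.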

>
> johannes
>
> (*) also, couldn't this mean that we send something completely stupid
> like
>
> seq=1,frag=0
> seq=2,frag=0
> seq=2,frag=1
> seq=2,frag=1
>
> if reordering happened?

(assuming the last line was supposed to read 'seq=1,frag=1')

Yes, that could happen, in principle (it depends on the fragments' size
in relation to the FQ quantum).


When does fragmentation happen anyway? Is it safe to assume there's no
aggregation when it does?

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 18:30           ` Toke Høiland-Jørgensen
@ 2016-09-01 18:35             ` Johannes Berg
  0 siblings, 0 replies; 77+ messages in thread
From: Johannes Berg @ 2016-09-01 18:35 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

On Thu, 2016-09-01 at 20:30 +0200, Toke Høiland-Jørgensen wrote:

> > seq=1,frag=0
> > seq=2,frag=0
> > seq=2,frag=1
> > seq=2,frag=1
> > 
> > if reordering happened?
> 
> (assuming the last line was supposed to read 'seq=1,frag=1')

I did actually mean seq=2,frag=1: in your patch the seqno assignment
happened after fragmentation and after codel reordering, and would not
change the seqno until it encountered a frag=0 packet.

Or maybe that was only with the previous version of the patch.

> When does fragmentation happen anyway? Is it safe to assume there's
> no aggregation when it does?
> 

Yes, fragmented packets are not allowed to be aggregated.

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 16:03       ` [Make-wifi-fast] [PATCH v5] " Toke Høiland-Jørgensen
  2016-09-01 17:59         ` Johannes Berg
@ 2016-09-02  2:48         ` Jason Andryuk
  2016-09-02  9:27           ` Toke Høiland-Jørgensen
  2016-09-02 13:41         ` [Make-wifi-fast] [PATCH v6] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 77+ messages in thread
From: Jason Andryuk @ 2016-09-02  2:48 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

On Thu, Sep 1, 2016 at 12:03 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> @@ -1481,33 +1506,57 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
>  {
>         struct ieee80211_local *local = hw_to_local(hw);
>         struct txq_info *txqi = container_of(txq, struct txq_info, txq);
> -       struct ieee80211_hdr *hdr;
>         struct sk_buff *skb = NULL;
>         struct fq *fq = &local->fq;
>         struct fq_tin *tin = &txqi->tin;
> +       struct ieee80211_tx_info *info;
>
>         spin_lock_bh(&fq->lock);
>
>         if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>                 goto out;
>
> +begin:
>         skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
>         if (!skb)
>                 goto out;
>
>         ieee80211_set_skb_vif(skb, txqi);
>
> -       hdr = (struct ieee80211_hdr *)skb->data;
> -       if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
> +       info = IEEE80211_SKB_CB(skb);
> +       if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>                 struct sta_info *sta = container_of(txq->sta, struct sta_info,
>                                                     sta);
> -               struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +               struct ieee80211_fast_tx *fast_tx;
>
> -               hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
> -               if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
> -                       info->flags |= IEEE80211_TX_CTL_AMPDU;
> -               else
> -                       info->flags &= ~IEEE80211_TX_CTL_AMPDU;
> +               fast_tx = rcu_dereference(sta->fast_tx);
> +               if (WARN_ON(!fast_tx)) {
> +                       /* lost the fast_tx pointer while the packet was queued */
> +                       ieee80211_free_txskb(hw, skb);
> +                       goto begin;
> +               }
> +               ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb, false);
> +       } else {
> +               struct ieee80211_tx_data tx = { };
> +
> +               __skb_queue_head_init(&tx.skbs);
> +               tx.local = local;
> +               if (txq->sta) {
> +                       struct sta_info *sta = container_of(txq->sta,
> +                                                           struct sta_info,
> +                                                           sta);

sta is unneeded given the assignment below?

Regards,
Jason

> +                       tx.sta = container_of(txq->sta, struct sta_info, sta);
> +                       tx.sdata = sta->sdata;
> +               } else {
> +                       tx.sdata = vif_to_sdata(info->control.vif);
> +               }
> +
> +               __skb_queue_tail(&tx.skbs, skb);
> +
> +               if (invoke_tx_handlers_late(&tx))
> +                       goto begin;
> +
> +               __skb_unlink(skb, &tx.skbs);
>         }

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-02  2:48         ` Jason Andryuk
@ 2016-09-02  9:27           ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-02  9:27 UTC (permalink / raw)
  To: Jason Andryuk; +Cc: make-wifi-fast, linux-wireless

Jason Andryuk <jandryuk@gmail.com> writes:

> On Thu, Sep 1, 2016 at 12:03 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> @@ -1481,33 +1506,57 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
>>  {
>>         struct ieee80211_local *local = hw_to_local(hw);
>>         struct txq_info *txqi = container_of(txq, struct txq_info, txq);
>> -       struct ieee80211_hdr *hdr;
>>         struct sk_buff *skb = NULL;
>>         struct fq *fq = &local->fq;
>>         struct fq_tin *tin = &txqi->tin;
>> +       struct ieee80211_tx_info *info;
>>
>>         spin_lock_bh(&fq->lock);
>>
>>         if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>>                 goto out;
>>
>> +begin:
>>         skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
>>         if (!skb)
>>                 goto out;
>>
>>         ieee80211_set_skb_vif(skb, txqi);
>>
>> -       hdr = (struct ieee80211_hdr *)skb->data;
>> -       if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
>> +       info = IEEE80211_SKB_CB(skb);
>> +       if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>>                 struct sta_info *sta = container_of(txq->sta, struct sta_info,
>>                                                     sta);
>> -               struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>> +               struct ieee80211_fast_tx *fast_tx;
>>
>> -               hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
>> -               if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
>> -                       info->flags |= IEEE80211_TX_CTL_AMPDU;
>> -               else
>> -                       info->flags &= ~IEEE80211_TX_CTL_AMPDU;
>> +               fast_tx = rcu_dereference(sta->fast_tx);
>> +               if (WARN_ON(!fast_tx)) {
>> +                       /* lost the fast_tx pointer while the packet was queued */
>> +                       ieee80211_free_txskb(hw, skb);
>> +                       goto begin;
>> +               }
>> +               ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb, false);
>> +       } else {
>> +               struct ieee80211_tx_data tx = { };
>> +
>> +               __skb_queue_head_init(&tx.skbs);
>> +               tx.local = local;
>> +               if (txq->sta) {
>> +                       struct sta_info *sta = container_of(txq->sta,
>> +                                                           struct sta_info,
>> +                                                           sta);
>
> sta is unneeded give the assignment below?

Yeah, you're right. Think that was left over from a previous version.
Thanks for spotting it :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v6] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 16:03       ` [Make-wifi-fast] [PATCH v5] " Toke Høiland-Jørgensen
  2016-09-01 17:59         ` Johannes Berg
  2016-09-02  2:48         ` Jason Andryuk
@ 2016-09-02 13:41         ` Toke Høiland-Jørgensen
  2016-09-02 14:44           ` Toke Høiland-Jørgensen
  2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
  2 siblings, 2 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-02 13:41 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v5:
- Move the fragmentation handler to *after* TXQ dequeue. Fragments are
  kept in the TXQ for subsequent dequeues. This change also means that
  the changes to make some of the handlers fragmentation aware are no
  longer necessary.
- One of the TX stats updates in the fast path was done before the
  enqueue step; move that to xmit_fast_finish().
- Move the rate selection handler to after dequeue, so it's run closer
  to the time where the packet is actually transmitted.
  
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 207 +++++++++++++++++++++++++++++++++++----------
 3 files changed, 168 insertions(+), 43 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342..de9991d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..a3a4593 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -1403,6 +1409,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1425,6 +1432,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1481,33 +1489,62 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost the fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb, false);
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		tx.skb = skb;
+		if (txq->sta) {
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = tx.sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -1521,6 +1558,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1606,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1621,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1741,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1697,7 +1761,28 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_DROP;
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1707,6 +1792,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 		goto txh_done;
 	}
 
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1819,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1902,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3159,7 +3260,7 @@ out:
 }
 
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,8 +3271,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
 	u8 tid = IEEE80211_NUM_TIDS;
 
@@ -3210,8 +3309,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,11 +3337,32 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
 
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
@@ -3309,12 +3427,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
 
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
@@ -3342,7 +3463,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v6] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-02 13:41         ` [Make-wifi-fast] [PATCH v6] " Toke Høiland-Jørgensen
@ 2016-09-02 14:44           ` Toke Høiland-Jørgensen
  2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
  1 sibling, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-02 14:44 UTC (permalink / raw)
  To: make-wifi-fast; +Cc: linux-wireless

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
> Changes since v5:
> - Move the fragmentation handler to *after* TXQ dequeue. Fragments are
>   kept in the TXQ for subsequent dequeues. This change also means that
>   the changes to make some of the handlers fragmentation aware are no
>   longer necessary.
> - One of the TX stats updates in the fast path was done before the
>   enqueue step; move that to xmit_fast_finish().
> - Move the rate selection handler to after dequeue, so it's run closer
>   to the time where the packet is actually transmitted.

Shortly after posting this I found one other thing that needs fixing,
but I figure I'm probably not done anyway, so I'll leave it for the next
round. :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-02 13:41         ` [Make-wifi-fast] [PATCH v6] " Toke Høiland-Jørgensen
  2016-09-02 14:44           ` Toke Høiland-Jørgensen
@ 2016-09-05 11:30           ` Toke Høiland-Jørgensen
  2016-09-05 16:06             ` Toke Høiland-Jørgensen
                               ` (3 more replies)
  1 sibling, 4 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 11:30 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v6:
  - Invoking the rate control handler can cause packets to be generated
    (for establishing a BA session). This can cause a deadlock because
    dequeue can happen while sta->lock is held. So this version moves
    the rate control handler back before the intermediate queue step.
  - Fix sequence number allocation on the slow path.
  
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 250 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 192 insertions(+), 62 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9211cce..d36f3b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index efc38e7..94f38cc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -853,8 +859,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
 
 	return TX_CONTINUE;
 }
@@ -1403,6 +1408,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1425,6 +1431,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1481,33 +1488,61 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb);
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		tx.skb = skb;
+		if (txq->sta) {
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = tx.sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -1521,6 +1556,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1604,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1619,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1739,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1697,16 +1759,42 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb = NULL;
 		goto txh_done;
 	}
 
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1821,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1904,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3159,7 +3262,7 @@ out:
 }
 
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,9 +3273,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid = IEEE80211_NUM_TIDS;
 
 	/* control port protocol needs a lot of special handling */
@@ -3210,8 +3313,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,24 +3341,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number += 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
 
 	__skb_queue_head_init(&tx.skbs);
 
@@ -3283,6 +3367,54 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) = tid;
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
@@ -3309,12 +3441,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
@@ -3342,7 +3468,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
@ 2016-09-05 16:06             ` Toke Høiland-Jørgensen
  2016-09-05 17:00               ` Dave Taht
  2016-09-05 17:49             ` Felix Fietkau
                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 16:06 UTC (permalink / raw)
  To: make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 1630 bytes --]

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
> Changes since v6:
>   - Invoking the rate control handler can cause packets to be generated
>     (for establishing a BA session). This can cause a deadlock because
>     dequeue can happen while sta->lock is held. So this version moves
>     the rate control handler back before the intermediate queue step.
>   - Fix sequence number allocation on the slow path.

Attaching a version suitable for dropping into a LEDE build (where it
replaces 220-fq_disable_hack.patch and
346-mac80211-fix-sequence-number-assignment-for-PS-respo.patch).

-Toke


[-- Attachment #2: 346-mac80211-move-reorder-sensitive-tx-handlers-to-after-TXQ-dequeue.patch --]
[-- Type: text/x-diff, Size: 15697 bytes --]

commit 9659fea5e5561dd851dcc9273e45a0dc8b0c9f69
Author: Toke Høiland-Jørgensen <toke@toke.dk>
Date:   Tue Aug 23 20:14:07 2016 +0200

    mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
    
    The TXQ intermediate queues can cause packet reordering when more than
    one flow is active to a single station. Since some of the wifi-specific
    packet handling (notably sequence number and encryption handling) is
    sensitive to re-ordering, things break if they are applied before the
    TXQ.
    
    This splits up the TX handlers and fast_xmit logic into two parts: An
    early part and a late part. The former is applied before TXQ enqueue,
    and the latter after dequeue. The non-TXQ path just applies both parts
    at once.
    
    Because fragments shouldn't be split up or reordered, the fragmentation
    handler is run after dequeue. Any fragments are then kept in the TXQ and
    on subsequent dequeues they take precedence over dequeueing from the FQ
    structure.
    
    This approach avoids having to scatter special cases for when TXQ is
    enabled, at the cost of making the fast_xmit and TX handler code
    slightly more complex.
    
    Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 26b0ea8..3d6bf45 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -711,6 +711,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -719,6 +720,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 81548a8..cf99bc1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -808,11 +808,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 86e806d..85ec649 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -849,8 +855,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
 
 	return TX_CONTINUE;
 }
@@ -1399,6 +1404,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1421,6 +1427,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1477,12 +1484,19 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -1490,16 +1504,40 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb);
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		tx.skb = skb;
+		tx.hdrlen = ieee80211_padded_hdrlen(hw, hdr->frame_control);
+		if (txq->sta) {
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = tx.sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -1513,6 +1551,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1520,9 +1599,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1537,21 +1614,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1672,10 +1734,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1689,16 +1754,42 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb = NULL;
 		goto txh_done;
 	}
 
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1725,6 +1816,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1799,7 +1899,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3120,7 +3226,7 @@ out:
 }
 
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3131,9 +3237,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid = IEEE80211_NUM_TIDS;
 
 	/* control port protocol needs a lot of special handling */
@@ -3171,8 +3277,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3201,24 +3305,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number += 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
 
 	__skb_queue_head_init(&tx.skbs);
 
@@ -3244,6 +3331,54 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) = tid;
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
@@ -3270,12 +3405,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
@@ -3303,7 +3432,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 16:06             ` Toke Høiland-Jørgensen
@ 2016-09-05 17:00               ` Dave Taht
  2016-09-05 17:26                 ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 17:00 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast

Just when I thought it was safe to boot up a router. Got a build from this?

(feature request - the UAP-ac-lite is working well, and also gets
built out of the ar71xx tree)

Regarding the rate suggestion back-shift to earlier, does that defer
it until the queue empties, or merely 2 times through the algo, where
intermediate queues, before, it was every time (potentially).

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:00               ` Dave Taht
@ 2016-09-05 17:26                 ` Toke Høiland-Jørgensen
  2016-09-05 17:59                   ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 17:26 UTC (permalink / raw)
  To: Dave Taht; +Cc: make-wifi-fast

Dave Taht <dave.taht@gmail.com> writes:

> Just when I thought it was safe to boot up a router. Got a build from this?
>
> (feature request - the UAP-ac-lite is working well, and also gets
> built out of the ar71xx tree)

https://kau.toke.dk/lede/airtime-fairness-builds/ar71xx/generic/

built from this tree:

https://kau.toke.dk/git/lede/

> Regarding the rate suggestion back-shift to earlier, does that defer
> it until the queue empties, or merely 2 times through the algo, where
> intermediate queues, before, it was every time (potentially).

I have read this paragraph five or six times now, but for the life of me
I can't make sense of it. So, erm... huh?

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
  2016-09-05 16:06             ` Toke Høiland-Jørgensen
@ 2016-09-05 17:49             ` Felix Fietkau
  2016-09-05 17:59               ` Toke Høiland-Jørgensen
  2016-09-06 11:43             ` Toke Høiland-Jørgensen
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
  3 siblings, 1 reply; 77+ messages in thread
From: Felix Fietkau @ 2016-09-05 17:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On 2016-09-05 13:30, Toke Høiland-Jørgensen wrote:
> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
> 
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
> 
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
> 
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
In my test, this one completely breaks ath9k with the txq patch.
One or two packets go through, then tx stalls completely.

- Felix

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:49             ` Felix Fietkau
@ 2016-09-05 17:59               ` Toke Høiland-Jørgensen
  2016-09-05 18:45                 ` Felix Fietkau
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 17:59 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: make-wifi-fast, linux-wireless

Felix Fietkau <nbd@nbd.name> writes:

> On 2016-09-05 13:30, Toke Høiland-Jørgensen wrote:
>> The TXQ intermediate queues can cause packet reordering when more than
>> one flow is active to a single station. Since some of the wifi-specific
>> packet handling (notably sequence number and encryption handling) is
>> sensitive to re-ordering, things break if they are applied before the
>> TXQ.
>> 
>> This splits up the TX handlers and fast_xmit logic into two parts: An
>> early part and a late part. The former is applied before TXQ enqueue,
>> and the latter after dequeue. The non-TXQ path just applies both parts
>> at once.
>> 
>> Because fragments shouldn't be split up or reordered, the fragmentation
>> handler is run after dequeue. Any fragments are then kept in the TXQ and
>> on subsequent dequeues they take precedence over dequeueing from the FQ
>> structure.
>> 
>> This approach avoids having to scatter special cases for when TXQ is
>> enabled, at the cost of making the fast_xmit and TX handler code
>> slightly more complex.
> In my test, this one completely breaks ath9k with the txq patch.
> One or two packets go through, then tx stalls completely.

I assume you are testing on LEDE? It requires a change to work with the
patch in the LEDE tree that puts hdrlen into ieee80211_tx_data. Did you
fix that? Otherwise multicast (and possibly other things) will break
badly.
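
(Concretely, the LEDE version linked below has to fill in the header
length itself on the dequeue side before running the late handlers,
with a line along the lines of:

	tx.hdrlen = ieee80211_padded_hdrlen(hw, hdr->frame_control);

as in the patch attached earlier in this thread; this is just meant as
an illustration of the difference, not a drop-in fix.)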

I have a version that should work with LEDE here:

https://kau.toke.dk/git/lede/tree/package/kernel/mac80211/patches/346-mac80211-move-reorder-sensitive-tx-handlers-to-after-TXQ-dequeue.patch

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:26                 ` Toke Høiland-Jørgensen
@ 2016-09-05 17:59                   ` Dave Taht
  2016-09-05 20:23                     ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 17:59 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast

On Mon, Sep 5, 2016 at 10:26 AM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> Dave Taht <dave.taht@gmail.com> writes:
>
>> Just when I thought it was safe to boot up a router. Got a build from this?
>>
>> (feature request - the UAP-ac-lite is working well, and also gets
>> built out of the ar71xx tree)

THANK YOU VERY MUCH.  I'm writing up the initial results on that...
what you got will be enough to evaluate what's in there against the
ath9k 2.4 GHz chip.

It looks as though the ath10k versions of fq_codel are disabled in
lede thus far.

> https://kau.toke.dk/lede/airtime-fairness-builds/ar71xx/generic/
>
> built from this tree:
>
> https://kau.toke.dk/git/lede/

Pulled thanks!

It looks like you missed this, which looks promising - not just for
this but for cake also.

https://git.lede-project.org/?p=lede/nbd/staging.git;a=commitdiff;h=93c3dfa7f30185ef425f4a30f0dd77208501c7ed;hp=bf0bde2ae5bf1fb21676be771378a87ecb8e4b97

>> Regarding the rate suggestion back-shift to earlier, does that defer
>> it until the queue empties, or merely 2 times through the algo, where
>> intermediate queues, before, it was every time (potentially).
>
> I have read this paragraph five or six times now, but for the life of me
> I can't make sense of it. So, erm... huh?

Sorry, pre-coffee here - for a possibly amusing look at how I lost
sleep while building up the yurtlab last night, and one of the crazier
ideas I have for testing things, see:

http://blog.cerowrt.org/post/need_no_stinkin_blinkenlights/

Will try to restate the question after the coffee kicks in.

> -Toke



-- 
Dave Täht
Let's go make home routers and wifi faster! With better software!
http://blog.cerowrt.org

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:59               ` Toke Høiland-Jørgensen
@ 2016-09-05 18:45                 ` Felix Fietkau
  0 siblings, 0 replies; 77+ messages in thread
From: Felix Fietkau @ 2016-09-05 18:45 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

On 2016-09-05 19:59, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
> 
>> On 2016-09-05 13:30, Toke Høiland-Jørgensen wrote:
>>> The TXQ intermediate queues can cause packet reordering when more than
>>> one flow is active to a single station. Since some of the wifi-specific
>>> packet handling (notably sequence number and encryption handling) is
>>> sensitive to re-ordering, things break if they are applied before the
>>> TXQ.
>>> 
>>> This splits up the TX handlers and fast_xmit logic into two parts: An
>>> early part and a late part. The former is applied before TXQ enqueue,
>>> and the latter after dequeue. The non-TXQ path just applies both parts
>>> at once.
>>> 
>>> Because fragments shouldn't be split up or reordered, the fragmentation
>>> handler is run after dequeue. Any fragments are then kept in the TXQ and
>>> on subsequent dequeues they take precedence over dequeueing from the FQ
>>> structure.
>>> 
>>> This approach avoids having to scatter special cases for when TXQ is
>>> enabled, at the cost of making the fast_xmit and TX handler code
>>> slightly more complex.
>> In my test, this one completely breaks ath9k with the txq patch.
>> One or two packets go through, then tx stalls completely.
> 
> I assume you are testing on LEDE? It requires a change to work with the
> patch in the LEDE tree that puts hdrlen into ieee80211_tx_data. Did you
> fix that? Otherwise multicast (and possibly other things) will break
> badly.
You're right, I missed that.

> I have a version that should work with LEDE here:
> 
> https://kau.toke.dk/git/lede/tree/package/kernel/mac80211/patches/346-mac80211-move-reorder-sensitive-tx-handlers-to-after-TXQ-dequeue.patch
That one works fine in my test.

Thanks,

- Felix


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:59                   ` Dave Taht
@ 2016-09-05 20:23                     ` Dave Taht
  2016-09-05 20:45                       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 20:23 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Felix Fietkau; +Cc: make-wifi-fast

Toke

Alright, I got your build running on one of the new uap ac lites. Your
patchset includes
348-mac80211-Use-more-standard-codel-and-fq_codel-defaul.patch which
reverts the fq quantum to 1514, and the codel target to 5ms.

(still not clear to me what the actual max packet size is in this layer)

There's no kernel repo in what you put up, so the ath10k firmware and
kernel modules are AWOL. So I tested mcs1, mcs4, and minstrel. I had
babel
enabled (to blow up multicast), also.

It basically survived many tests, with generally half the latency of
what I got from lede mainline, under various loads (from 40ms median
to below 20ms median). I did not test crypto, just pounded it flat from
a mac. Bandwidth was slightly lower at the peak in ht20 mode vs lede
mainline, but my circumstances are not very repeatable yet - I hadn't
had babel enabled on the previous test run, I tested a channel that is
probably busy during the day vs not at night, I'd moved both the AP
and the test station, and so on. I'll be getting towards repeatability
soon, and
I'm happy the new APs seem to be working great. At 65Mbit I'd see
about 12% sirq.

I'll put up some pics later, try to test powersave, try higher rates,
and various forms of crypto - still yurtlab-building here!

However - probably triggered by the rrul test, I did get 3 kernel
panics. The symptoms were that the osx box would stay associated but
no longer pass traffic.

[ 3149.554217] ------------[ cut here ]------------
[ 3149.559260] WARNING: CPU: 0 PID: 0 at
compat-wireless-2016-06-20/net/mac80211/tx.c:1514
ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[ 3149.571663] Modules linked in: ath9k ath9k_common iptable_nat
ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211
ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute
cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent
xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl
xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit
xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL
xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect
nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6
nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack
iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables
ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark
ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among
ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter
arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark
sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text
sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit
act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw
sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set
ip_set_list_set ip_set_hash_netiface ip_set_hash_netport
ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet
ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip
ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip
ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set
nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah
ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common
ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb
sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform
ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[ 3149.740660] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[ 3149.747856] Stack : 804205e4 00000000 00000001 80480000 8046f058
8046ece3 803f9bd0 00000000
[ 3149.747856]       804f37e0 b75381f8 87011000 87011008 871c3bd0
800ada74 80400a84 80460000
[ 3149.747856]       00000003 b75381f8 803fea6c 8046597c 871c3bd0
800ab9a0 00000002 00000000
[ 3149.747856]       8046b1a0 80231300 00000000 00000000 00000000
00000000 00000000 00000000
[ 3149.747856]       00000000 00000000 00000000 00000000 00000000
00000000 00000000 00000000
[ 3149.747856]       ...
[ 3149.784879] Call Trace:
[ 3149.787417] [<80072378>] show_stack+0x50/0x84
[ 3149.791925] [<80084240>] warn_slowpath_common+0xa4/0xd4
[ 3149.797333] [<800842f8>] warn_slowpath_null+0x18/0x24
[ 3149.802826] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[ 3149.809726] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[ 3149.815532] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[ 3149.822026] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[ 3149.827993] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[ 3149.834413] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[ 3149.840634] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[ 3149.846332] [<80087634>] tasklet_action+0x80/0xc8
[ 3149.851189] [<80086f68>] __do_softirq+0x26c/0x32c
[ 3149.856070] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[ 3149.861287] [<80060830>] ret_from_irq+0x0/0x4
[ 3149.865803] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[ 3149.870757] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[ 3149.875994] [<8049cbec>] start_kernel+0x488/0x4a8
[ 3149.880851]
[ 3149.882392] ---[ end trace 103165cc10a64d96 ]---

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 20:23                     ` Dave Taht
@ 2016-09-05 20:45                       ` Toke Høiland-Jørgensen
  2016-09-05 21:02                         ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 20:45 UTC (permalink / raw)
  To: Dave Taht; +Cc: Felix Fietkau, make-wifi-fast

Dave Taht <dave.taht@gmail.com> writes:

> Toke
>
> Alright, I got your build running on one of the new uap ac lites. Your
> patchset includes
> 348-mac80211-Use-more-standard-codel-and-fq_codel-defaul.patch which
> reverts the fq quantum to 1514, and the codel target to 5ms.
>
> (still not clear to me what the actual max packet size is in this layer)
>
> There's no kernel repo in what you put up, so the ath10k firmware and
> kernel modules are AWOL. So I tested mcs1, msc4, and minstrel. I had
> babel
> enabled (to blow up multicast), also.
>
> It basically survived many tests, with generally half the latency of
> what I got from lede mainline, under various loads (from 40ms median
> to below 20ms median) I did not test crypto, just pounded it flat from
> a mac. Bandwidth was slightly lower at the peak in ht20 mode vs lede
> mainine but my circumstances are not very repeatable yet - I hadn't
> had babel enabled on the previous test run, I tested a channel that is
> probably busy during the day vs not at night, I'd moved both the AP
> and the test station, and so on. I'll be getting towards repeatability
> soon, and
> I'm happy the new APs seem to be working great. At 65Mbit I'd see
> about 12% sirq.

Yay, sounds promising :)

> I'll put up some pics later, try to test powersave, try higher rates,
> and various forms of crypto - still yurtlab-building here!
>
> However - probably triggered by the rrul test, I did get 3 kernel
> panics. the symptoms were that the osx box would stay associated but
> no longer pass traffic.
>
> [ 3149.554217] ------------[ cut here ]------------
> [ 3149.559260] WARNING: CPU: 0 PID: 0 at
> compat-wireless-2016-06-20/net/mac80211/tx.c:1514
> ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
> [ 3149.571663] Modules linked in: ath9k ath9k_common iptable_nat
> ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211
> ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute
> cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent
> xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl
> xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit
> xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL
> xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect
> nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6
> nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack
> iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables
> ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark
> ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among
> ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter
> arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark
> sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text
> sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit
> act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw
> sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set
> ip_set_list_set ip_set_hash_netiface ip_set_hash_netport
> ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet
> ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip
> ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip
> ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set
> nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah
> ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common
> ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb
> sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform
> ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
> [ 3149.740660] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
> [ 3149.747856] Stack : 804205e4 00000000 00000001 80480000 8046f058
> 8046ece3 803f9bd0 00000000
> [ 3149.747856]       804f37e0 b75381f8 87011000 87011008 871c3bd0
> 800ada74 80400a84 80460000
> [ 3149.747856]       00000003 b75381f8 803fea6c 8046597c 871c3bd0
> 800ab9a0 00000002 00000000
> [ 3149.747856]       8046b1a0 80231300 00000000 00000000 00000000
> 00000000 00000000 00000000
> [ 3149.747856]       00000000 00000000 00000000 00000000 00000000
> 00000000 00000000 00000000
> [ 3149.747856]       ...
> [ 3149.784879] Call Trace:
> [ 3149.787417] [<80072378>] show_stack+0x50/0x84
> [ 3149.791925] [<80084240>] warn_slowpath_common+0xa4/0xd4
> [ 3149.797333] [<800842f8>] warn_slowpath_null+0x18/0x24
> [ 3149.802826] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
> [ 3149.809726] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
> [ 3149.815532] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
> [ 3149.822026] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
> [ 3149.827993] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
> [ 3149.834413] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
> [ 3149.840634] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
> [ 3149.846332] [<80087634>] tasklet_action+0x80/0xc8
> [ 3149.851189] [<80086f68>] __do_softirq+0x26c/0x32c
> [ 3149.856070] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
> [ 3149.861287] [<80060830>] ret_from_irq+0x0/0x4
> [ 3149.865803] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
> [ 3149.870757] [<800a87ac>] cpu_startup_entry+0xf8/0x184
> [ 3149.875994] [<8049cbec>] start_kernel+0x488/0x4a8
> [ 3149.880851]
> [ 3149.882392] ---[ end trace 103165cc10a64d96 ]---

Ah, no, those are not panics; those are warnings being triggered by the
fast_tx pointer going away while the packet was queued. Now, the
xmit_fast_finish() function doesn't actually use that for anything other
than crypto key configuration, so it would probably be feasible to get
rid of that check in the dequeue path.
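
As a rough, untested sketch (one possible reading of "getting rid of
that check"): the dequeue path could simply lose the WARN_ON, so a
fast_tx pointer that went away while the frame sat in the queue just
drops the frame quietly instead of splatting in the log, e.g.:

	fast_tx = rcu_dereference(sta->fast_tx);
	if (!fast_tx) {
		/* fast_tx went away while the packet was queued; drop it */
		ieee80211_free_txskb(hw, skb);
		goto begin;
	}
	ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb);

(Function names are the ones from the patch above; this is only an
illustration, not something I have tested.)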

How many of those warnings do you see? And what do you have to do to get
traffic to flow again? The warning should just cause the packet to be
dropped; subsequent packets shouldn't be affected (ha!).

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 20:45                       ` Toke Høiland-Jørgensen
@ 2016-09-05 21:02                         ` Dave Taht
  2016-09-05 21:25                           ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 21:02 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: Felix Fietkau, make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 8210 bytes --]

On Mon, Sep 5, 2016 at 1:45 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> Dave Taht <dave.taht@gmail.com> writes:
>
>> Toke
>>
>> Alright, I got your build running on one of the new uap ac lites. Your
>> patchset includes
>> 348-mac80211-Use-more-standard-codel-and-fq_codel-defaul.patch which
>> reverts the fq quantum to 1514, and the codel target to 5ms.
>>
>> (still not clear to me what the actual max packet size is in this layer)
>>
>> There's no kernel repo in what you put up, so the ath10k firmware and
>> kernel modules are AWOL. So I tested mcs1, msc4, and minstrel. I had
>> babel
>> enabled (to blow up multicast), also.
>>
>> It basically survived many tests, with generally half the latency of
>> what I got from lede mainline, under various loads (from 40ms median
>> to below 20ms median)

Pic at mcs4 attached. Note the labeling is wrong: the "HT20" stuff was
against the previous lede-mainline firmware, against the 2.4ghz stuff, so
this data is directly comparable.

Note, we are testing 3 changed variables here - the reduced codel
target, the much saner quantum, and the airtime fairness code... and I
think, but am not sure, that I was using Felix's new softirq patch in the
previous testing.

and I still need to get around to airtime fairness... but need more
stuff bolted down first.

/me looks at his drill and screwdriver

> I did not test crypto, just pounded it flat from
>> a mac. Bandwidth was slightly lower at the peak in ht20 mode vs lede
>> mainine but my circumstances are not very repeatable yet - I hadn't
>> had babel enabled on the previous test run, I tested a channel that is
>> probably busy during the day vs not at night, I'd moved both the AP
>> and the test station, and so on. I'll be getting towards repeatability
>> soon, and
>> I'm happy the new APs seem to be working great. At 65Mbit I'd see
>> about 12% sirq.
>
> Yay, sounds promising :)
>
>> I'll put up some pics later, try to test powersave, try higher rates,
>> and various forms of crypto - still yurtlab-building here!
>>
>> However - probably triggered by the rrul test, I did get 3 kernel
>> panics. the symptoms were that the osx box would stay associated but
>> no longer pass traffic.
>>
>> [ 3149.554217] ------------[ cut here ]------------
>> [ 3149.559260] WARNING: CPU: 0 PID: 0 at
>> compat-wireless-2016-06-20/net/mac80211/tx.c:1514
>> ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
>> [ 3149.571663] Modules linked in: ath9k ath9k_common iptable_nat
>> ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211
>> ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute
>> cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent
>> xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl
>> xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit
>> xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL
>> xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect
>> nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6
>> nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack
>> iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables
>> ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark
>> ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among
>> ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter
>> arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark
>> sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text
>> sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit
>> act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw
>> sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set
>> ip_set_list_set ip_set_hash_netiface ip_set_hash_netport
>> ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet
>> ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip
>> ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip
>> ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set
>> nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah
>> ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common
>> ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb
>> sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform
>> ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
>> [ 3149.740660] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
>> [ 3149.747856] Stack : 804205e4 00000000 00000001 80480000 8046f058
>> 8046ece3 803f9bd0 00000000
>> [ 3149.747856]       804f37e0 b75381f8 87011000 87011008 871c3bd0
>> 800ada74 80400a84 80460000
>> [ 3149.747856]       00000003 b75381f8 803fea6c 8046597c 871c3bd0
>> 800ab9a0 00000002 00000000
>> [ 3149.747856]       8046b1a0 80231300 00000000 00000000 00000000
>> 00000000 00000000 00000000
>> [ 3149.747856]       00000000 00000000 00000000 00000000 00000000
>> 00000000 00000000 00000000
>> [ 3149.747856]       ...
>> [ 3149.784879] Call Trace:
>> [ 3149.787417] [<80072378>] show_stack+0x50/0x84
>> [ 3149.791925] [<80084240>] warn_slowpath_common+0xa4/0xd4
>> [ 3149.797333] [<800842f8>] warn_slowpath_null+0x18/0x24
>> [ 3149.802826] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
>> [ 3149.809726] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
>> [ 3149.815532] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
>> [ 3149.822026] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
>> [ 3149.827993] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
>> [ 3149.834413] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
>> [ 3149.840634] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
>> [ 3149.846332] [<80087634>] tasklet_action+0x80/0xc8
>> [ 3149.851189] [<80086f68>] __do_softirq+0x26c/0x32c
>> [ 3149.856070] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
>> [ 3149.861287] [<80060830>] ret_from_irq+0x0/0x4
>> [ 3149.865803] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
>> [ 3149.870757] [<800a87ac>] cpu_startup_entry+0xf8/0x184
>> [ 3149.875994] [<8049cbec>] start_kernel+0x488/0x4a8
>> [ 3149.880851]
>> [ 3149.882392] ---[ end trace 103165cc10a64d96 ]---
>
> Ah, no, those are not panics, those are warnings being triggered by the
> fast_tx pointer going while the packet was queued. Now, the
> xmit_fast_finish() function doesn't actually use that for anything other
> than crypto key configuration, so it would probably be feasible to get
> rid of that check in the dequeue path.
>
> How many of those warnings do you see?

I'm not crazy; I run the rrul test at the conclusion of the run, which this was.

I'll go run it on a fresh boot but...

dmesg | grep 'cut here'

[  707.011531] ------------[ cut here ]------------
[  707.343296] ------------[ cut here ]------------
[  707.676275] ------------[ cut here ]------------
[  708.009204] ------------[ cut here ]------------
[  708.342138] ------------[ cut here ]------------
[  709.247082] ------------[ cut here ]------------
[  709.580053] ------------[ cut here ]------------
[  709.913023] ------------[ cut here ]------------
[  710.245975] ------------[ cut here ]------------

Also attached.

> And what do you have to do to get
> traffic to flow again?

Seems to come back after a while.

> The warning should just cause the packet to be
> dropped; subsequent packets shouldn't be affected (ha!).

It appears that attempts to exercise CS5 and CS1 fail entirely judging
from the station aqm file and the ath9k xmit (or the driver is
reloading and wiping out these stats). I'll poke at it harder.

tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets
0 2 0 0 45282 3980 0 0 0 699726818 954301
1 3 0 0 0 0 0 0 0 0 0
2 3 0 0 0 0 0 0 0 0 0
3 2 0 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0
5 1 0 0 0 0 0 0 0 0 0
6 0 0 0 25 0 0 0 0 2900 25
7 0 0 0 0 0 0 0 0 0 0
8 2 0 0 0 0 0 0 0 0 0
9 3 0 0 0 0 0 0 0 0 0
10 3 0 0 0 0 0 0 0 0 0
11 2 0 0 0 0 0 0 0 0 0
12 1 0 0 0 0 0 0 0 0 0
13 1 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 0 0 0


> -Toke

[-- Attachment #2: dmesg.txt --]
[-- Type: text/plain, Size: 45107 bytes --]

[    0.000000] Linux version 4.4.19 (alrua@alrua-kau) (gcc version 5.4.0 (LEDE GCC 5.4.0 r1512+2) ) #0 Mon Sep 5 14:44:09 2016
[    0.000000] MyLoader: sysp=e18a8a2c, boardp=e88baa89, parts=e3bb8aa0
[    0.000000] bootconsole [early0] enabled
[    0.000000] CPU0 revision is: 00019750 (MIPS 74Kc)
[    0.000000] SoC: Qualcomm Atheros QCA956X ver 1 rev 0
[    0.000000] Determined physical RAM map:
[    0.000000]  memory: 08000000 @ 00000000 (usable)
[    0.000000] Initrd not found or empty - disabling initrd
[    0.000000] No valid device tree found, continuing without
[    0.000000] Zone ranges:
[    0.000000]   Normal   [mem 0x0000000000000000-0x0000000007ffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000000000-0x0000000007ffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000000000000-0x0000000007ffffff]
[    0.000000] On node 0 totalpages: 32768
[    0.000000] free_area_init_node: node 0, pgdat 8046c6e8, node_mem_map 81000000
[    0.000000]   Normal zone: 256 pages used for memmap
[    0.000000]   Normal zone: 0 pages reserved
[    0.000000]   Normal zone: 32768 pages, LIFO batch:7
[    0.000000] Primary instruction cache 64kB, VIPT, 4-way, linesize 32 bytes.
[    0.000000] Primary data cache 32kB, 4-way, VIPT, cache aliases, linesize 32 bytes
[    0.000000] pcpu-alloc: s0 r0 d32768 u32768 alloc=1*32768
[    0.000000] pcpu-alloc: [0] 0 
[    0.000000] Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 32512
[    0.000000] Kernel command line:  board=UBNT-UF-AC-LITE mtdparts=spi0.0:384k(u-boot)ro,64k(u-boot-env)ro,7744k(firmware),7744k(ubnt-airos)ro,128k(bs)ro,256k(cfg)ro,64k(EEPROM)ro console=ttyS0,115200 rootfstype=squashfs,jffs2 noinitrd
[    0.000000] PID hash table entries: 512 (order: -1, 2048 bytes)
[    0.000000] Dentry cache hash table entries: 16384 (order: 4, 65536 bytes)
[    0.000000] Inode-cache hash table entries: 8192 (order: 3, 32768 bytes)
[    0.000000] Writing ErrCtl register=00000000
[    0.000000] Readback ErrCtl register=00000000
[    0.000000] Memory: 124620K/131072K available (3261K kernel code, 227K rwdata, 840K rodata, 336K init, 209K bss, 6452K reserved, 0K cma-reserved)
[    0.000000] SLUB: HWalign=32, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[    0.000000] NR_IRQS:51
[    0.000000] Clocks: CPU:775.000MHz, DDR:650.000MHz, AHB:258.333MHz, Ref:25.000MHz
[    0.000000] clocksource: MIPS: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 4932285024 ns
[    0.000007] sched_clock: 32 bits at 387MHz, resolution 2ns, wraps every 5541893118ns
[    0.008215] Calibrating delay loop... 385.84 BogoMIPS (lpj=1929216)
[    0.071026] pid_max: default: 32768 minimum: 301
[    0.075982] Mount-cache hash table entries: 1024 (order: 0, 4096 bytes)
[    0.082960] Mountpoint-cache hash table entries: 1024 (order: 0, 4096 bytes)
[    0.091348] Performance counters: mips/74K PMU enabled, 4 32-bit counters available to each CPU, irq 13
[    0.102599] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
[    0.114024] NET: Registered protocol family 16
[    0.119926] MIPS: machine is Ubiquiti UniFi-AC-LITE
[    0.349752] registering PCI controller with io_map_base unset
[    0.363998] PCI host bridge to bus 0000:00
[    0.368366] pci_bus 0000:00: root bus resource [mem 0x12000000-0x13ffffff]
[    0.375603] pci_bus 0000:00: root bus resource [io  0x0001]
[    0.381484] pci_bus 0000:00: root bus resource [??? 0x00000000 flags 0x0]
[    0.388633] pci_bus 0000:00: No busn resource found for root bus, will use [bus 00-ff]
[    0.397027] pci 0000:00:00.0: [168c:003c] type 00 class 0x028000
[    0.397054] pci 0000:00:00.0: invalid calibration data
[    0.402484] pci 0000:00:00.0: reg 0x10: [mem 0x00000000-0x001fffff 64bit]
[    0.402537] pci 0000:00:00.0: reg 0x30: [mem 0x00000000-0x0000ffff pref]
[    0.402595] pci 0000:00:00.0: supports D1
[    0.402607] pci 0000:00:00.0: PME# supported from D0 D1 D3hot
[    0.402793] pci_bus 0000:00: busn_res: [bus 00-ff] end is updated to 00
[    0.402823] pci 0000:00:00.0: BAR 0: assigned [mem 0x12000000-0x121fffff 64bit]
[    0.410577] pci 0000:00:00.0: BAR 6: assigned [mem 0x12200000-0x1220ffff pref]
[    0.418192] pci 0000:00:00.0: using irq 40 for pin 1
[    0.424109] clocksource: Switched to clocksource MIPS
[    0.440402] NET: Registered protocol family 2
[    0.445699] TCP established hash table entries: 1024 (order: 0, 4096 bytes)
[    0.453055] TCP bind hash table entries: 1024 (order: 0, 4096 bytes)
[    0.459801] TCP: Hash tables configured (established 1024 bind 1024)
[    0.466595] UDP hash table entries: 256 (order: 0, 4096 bytes)
[    0.472759] UDP-Lite hash table entries: 256 (order: 0, 4096 bytes)
[    0.479662] NET: Registered protocol family 1
[    0.484382] PCI: CLS 0 bytes, default 32
[    0.488088] futex hash table entries: 256 (order: -1, 3072 bytes)
[    0.494918] Crashlog allocated RAM at address 0x3f00000
[    0.512013] squashfs: version 4.0 (2009/01/31) Phillip Lougher
[    0.518213] jffs2: version 2.2 (NAND) (SUMMARY) (LZMA) (RTIME) (CMODE_PRIORITY) (c) 2001-2006 Red Hat, Inc.
[    0.530725] io scheduler noop registered
[    0.534920] io scheduler deadline registered (default)
[    0.540515] Serial: 8250/16550 driver, 1 ports, IRQ sharing disabled
[    0.547598] console [ttyS0] disabled
[    0.571440] serial8250.0: ttyS0 at MMIO 0x18020000 (irq = 11, base_baud = 1562500) is a 16550A
[    0.580544] console [ttyS0] enabled
[    0.588104] bootconsole [early0] disabled
[    0.599282] m25p80 spi0.0: mx25l12805d (16384 Kbytes)
[    0.604595] 7 cmdlinepart partitions found on MTD device spi0.0
[    0.610701] Creating 7 MTD partitions on "spi0.0":
[    0.615681] 0x000000000000-0x000000060000 : "u-boot"
[    0.622644] 0x000000060000-0x000000070000 : "u-boot-env"
[    0.629591] 0x000000070000-0x000000800000 : "firmware"
[    0.649884] 2 uimage-fw partitions found on MTD device firmware
[    0.656042] 0x000000070000-0x0000001d0000 : "kernel"
[    0.662418] 0x0000001d0000-0x000000800000 : "rootfs"
[    0.668930] mtd: device 4 (rootfs) set to be root filesystem
[    0.674891] 1 squashfs-split partitions found on MTD device rootfs
[    0.681278] 0x0000006f0000-0x000000800000 : "rootfs_data"
[    0.688243] 0x000000800000-0x000000f90000 : "ubnt-airos"
[    0.695148] 0x000000f90000-0x000000fb0000 : "bs"
[    0.701242] 0x000000fb0000-0x000000ff0000 : "cfg"
[    0.707531] 0x000000ff0000-0x000001000000 : "EEPROM"
[    0.719246] libphy: ag71xx_mdio: probed
[    1.385658] ag71xx ag71xx.0: connected to PHY at ag71xx-mdio.0:04 [uid=004dd074, driver=Atheros 8031/8033 ethernet]
[    1.397029] eth0: Atheros AG71xx at 0xb9000000, irq 4, mode:SGMII
[    1.404935] NET: Registered protocol family 10
[    1.412597] NET: Registered protocol family 17
[    1.417311] bridge: automatic filtering via arp/ip/ip6tables has been deprecated. Update your scripts to load br_netfilter if you need this.
[    1.430475] 8021q: 802.1Q VLAN Support v1.8
[    1.442730] VFS: Mounted root (squashfs filesystem) readonly on device 31:4.
[    1.451886] Freeing unused kernel memory: 336K (8049c000 - 804f0000)
[    2.402852] init: Console is alive
[    2.406611] init: - watchdog -
[    3.861095] usbcore: registered new interface driver usbfs
[    3.866897] usbcore: registered new interface driver hub
[    3.872459] usbcore: registered new device driver usb
[    3.882475] ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
[    3.890626] ehci-platform: EHCI generic platform driver
[    3.898645] ohci_hcd: USB 1.1 'Open' Host Controller (OHCI) Driver
[    3.906248] ohci-platform: OHCI generic platform driver
[    3.915557] init: - preinit -
[    4.304201] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready
[    4.326403] random: procd: uninitialized urandom read (4 bytes read, 7 bits of entropy available)
[    5.426405] eth0: link up (1000Mbps/Full duplex)
[    5.431203] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[    6.575862] jffs2: notice: (354) jffs2_build_xattr_subsystem: complete building xattr subsystem, 0 of xdatum (0 unchecked, 0 orphan) and 0 of xref (0 dead, 0 orphan) found.
[    6.593261] mount_root: switching to jffs2 overlay
[    6.610310] urandom-seed: Seeding with /etc/urandom.seed
[    6.721353] eth0: link down
[    6.733352] procd: - early -
[    6.736972] procd: - watchdog -
[    7.338617] procd: - ubus -
[    7.440668] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.450354] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.459749] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.471858] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.481936] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.492351] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.501888] random: ubusd: uninitialized urandom read (4 bytes read, 12 bits of entropy available)
[    7.511554] procd: - init -
[    7.745160] tun: Universal TUN/TAP device driver, 1.6
[    7.750378] tun: (C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>
[    7.762543] sit: IPv6 over IPv4 tunneling driver
[    7.836512] ip6_tables: (C) 2000-2006 Netfilter Core Team
[    7.860041] Netfilter messages via NETLINK v0.30.
[    7.867889] ip_set: protocol 6
[    7.929532] u32 classifier
[    7.932337]     input device check on
[    7.936163]     Actions configured
[    7.996720] Mirror/redirect action on
[    8.147012] arp_tables: (C) 2002 David S. Miller
[    8.155833] Bridge firewalling registered
[    8.162332] Loading modules backported from Linux version wt-2016-06-20-0-gbc17424
[    8.170200] Backport generated by backports.git backports-20160216-7-g5735958
[    8.226049] Ebtables v2.0 registered
[    8.245946] ip_tables: (C) 2000-2006 Netfilter Core Team
[    8.271000] nf_conntrack version 0.5.0 (1952 buckets, 7808 max)
[    8.306199] ctnetlink v0.93: registering with nfnetlink.
[    8.428947] xt_time: kernel timezone is -0000
[    8.651662] ath: EEPROM regdomain: 0x2a
[    8.651680] ath: EEPROM indicates we should expect a direct regpair map
[    8.651698] ath: Country alpha2 being used: US
[    8.651705] ath: Regpair used: 0x2a
[    8.661717] ieee80211 phy0: Selected rate control algorithm 'minstrel_ht'
[    8.666672] ieee80211 phy0: Atheros AR9561 Rev:0 mem=0xb8100000, irq=47
[   10.338498] random: jshn: uninitialized urandom read (4 bytes read, 17 bits of entropy available)
[   10.414248] random: ubusd: uninitialized urandom read (4 bytes read, 17 bits of entropy available)
[   15.321176] device eth0 entered promiscuous mode
[   15.340710] IPv6: ADDRCONF(NETDEV_UP): br-lan: link is not ready
[   17.352587] IPv6: ADDRCONF(NETDEV_UP): wlan0: link is not ready
[   17.447009] device wlan0 entered promiscuous mode
[   17.530618] IPv6: ADDRCONF(NETDEV_CHANGE): wlan0: link becomes ready
[   17.537384] br-lan: port 2(wlan0) entered forwarding state
[   17.543093] br-lan: port 2(wlan0) entered forwarding state
[   17.587807] IPv6: ADDRCONF(NETDEV_CHANGE): br-lan: link becomes ready
[   17.826498] eth0: link up (1000Mbps/Full duplex)
[   17.889282] br-lan: port 1(eth0) entered forwarding state
[   17.894974] br-lan: port 1(eth0) entered forwarding state
[   19.534162] br-lan: port 2(wlan0) entered forwarding state
[   19.894159] br-lan: port 1(eth0) entered forwarding state
[   40.829923] random: nonblocking pool is initialized
[  707.011531] ------------[ cut here ]------------
[  707.016639] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  707.029051] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  707.198061] CPU: 0 PID: 0 Comm: swapper Not tainted 4.4.19 #0
[  707.203993] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  707.203993] 	  804f37e0 2926c8ef 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  707.203993] 	  00000003 2926c8ef 803fea6c 8046597c 876de3b8 800ab9a0 80469200 00000000
[  707.203993] 	  00000001 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  707.203993] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  707.203993] 	  ...
[  707.241016] Call Trace:
[  707.243554] [<80072378>] show_stack+0x50/0x84
[  707.248073] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  707.253466] [<800842f8>] warn_slowpath_null+0x18/0x24
[  707.258964] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  707.265864] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  707.271639] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  707.278148] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  707.284115] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  707.290517] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  707.296748] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  707.302437] [<80087634>] tasklet_action+0x80/0xc8
[  707.307310] [<80086f68>] __do_softirq+0x26c/0x32c
[  707.312184] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  707.317407] [<80060830>] ret_from_irq+0x0/0x4
[  707.321909] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  707.326878] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  707.332112] [<8049cbec>] start_kernel+0x488/0x4a8
[  707.336985] 
[  707.338520] ---[ end trace b5612eece15fa33e ]---
[  707.343296] ------------[ cut here ]------------
[  707.348351] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  707.360751] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  707.529760] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  707.536955] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  707.536955] 	  804f37e0 292d37ff 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  707.536955] 	  00000003 292d37ff 803fea6c 8046597c 876de3b8 800ab9a0 80469200 00000000
[  707.536955] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  707.536955] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  707.536955] 	  ...
[  707.573969] Call Trace:
[  707.576517] [<80072378>] show_stack+0x50/0x84
[  707.581024] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  707.586432] [<800842f8>] warn_slowpath_null+0x18/0x24
[  707.591926] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  707.598825] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  707.604625] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  707.611116] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  707.617080] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  707.623480] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  707.629711] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  707.635409] [<80087634>] tasklet_action+0x80/0xc8
[  707.640270] [<80086f68>] __do_softirq+0x26c/0x32c
[  707.645150] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  707.650367] [<80060830>] ret_from_irq+0x0/0x4
[  707.654886] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  707.659847] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  707.665084] [<8049cbec>] start_kernel+0x488/0x4a8
[  707.669941] 
[  707.671483] ---[ end trace b5612eece15fa33f ]---
[  707.676275] ------------[ cut here ]------------
[  707.681319] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  707.693724] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  707.862722] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  707.869918] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  707.869918] 	  804f37e0 2930b0ba 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  707.869918] 	  00000003 2930b0ba 803fea6c 8046597c 876de3b8 800ab9a0 80469200 00000000
[  707.869918] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  707.869918] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  707.869918] 	  ...
[  707.906940] Call Trace:
[  707.909469] [<80072378>] show_stack+0x50/0x84
[  707.913978] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  707.919386] [<800842f8>] warn_slowpath_null+0x18/0x24
[  707.924887] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  707.931762] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  707.937560] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  707.944051] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  707.950014] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  707.956425] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  707.962642] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  707.968338] [<80087634>] tasklet_action+0x80/0xc8
[  707.973197] [<80086f68>] __do_softirq+0x26c/0x32c
[  707.978076] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  707.983294] [<80060830>] ret_from_irq+0x0/0x4
[  707.987810] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  707.992765] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  707.998003] [<8049cbec>] start_kernel+0x488/0x4a8
[  708.002858] 
[  708.004410] ---[ end trace b5612eece15fa340 ]---
[  708.009204] ------------[ cut here ]------------
[  708.014245] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  708.026654] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  708.195649] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  708.202835] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  708.202835] 	  804f37e0 2935a6b4 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  708.202835] 	  00000003 2935a6b4 803fea6c 8046597c 876de3b8 800ab9a0 80469200 00000000
[  708.202835] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  708.202835] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  708.202835] 	  ...
[  708.239857] Call Trace:
[  708.242387] [<80072378>] show_stack+0x50/0x84
[  708.246906] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  708.252301] [<800842f8>] warn_slowpath_null+0x18/0x24
[  708.257797] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  708.264694] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  708.270473] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  708.276982] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  708.282937] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  708.289347] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  708.295578] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  708.301261] [<80087634>] tasklet_action+0x80/0xc8
[  708.306135] [<80086f68>] __do_softirq+0x26c/0x32c
[  708.311008] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  708.316232] [<80060830>] ret_from_irq+0x0/0x4
[  708.320735] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  708.325702] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  708.330928] [<8049cbec>] start_kernel+0x488/0x4a8
[  708.335796] 
[  708.337336] ---[ end trace b5612eece15fa341 ]---
[  708.342138] ------------[ cut here ]------------
[  708.347195] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  708.359601] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  708.528593] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  708.535788] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  708.535788] 	  804f37e0 293a9cb6 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  708.535788] 	  00000003 293a9cb6 803fea6c 8046597c 876de3b8 800ab9a0 80469200 00000000
[  708.535788] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  708.535788] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  708.535788] 	  ...
[  708.572802] Call Trace:
[  708.575349] [<80072378>] show_stack+0x50/0x84
[  708.579859] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  708.585266] [<800842f8>] warn_slowpath_null+0x18/0x24
[  708.590758] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  708.597659] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  708.603437] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  708.609955] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  708.615918] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  708.622322] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  708.628554] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  708.634255] [<80087634>] tasklet_action+0x80/0xc8
[  708.639113] [<80086f68>] __do_softirq+0x26c/0x32c
[  708.643981] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  708.649215] [<80060830>] ret_from_irq+0x0/0x4
[  708.653724] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  708.658689] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  708.663917] [<8049cbec>] start_kernel+0x488/0x4a8
[  708.668776] 
[  708.670317] ---[ end trace b5612eece15fa342 ]---
[  709.247082] ------------[ cut here ]------------
[  709.252161] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  709.264575] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  709.433572] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  709.440773] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  709.440773] 	  804f37e0 294818e4 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  709.440773] 	  00000003 294818e4 803fea6c 8046597c 876de3b8 800ab9a0 8760486b 00000000
[  709.440773] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  709.440773] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  709.440773] 	  ...
[  709.477798] Call Trace:
[  709.480338] [<80072378>] show_stack+0x50/0x84
[  709.484856] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  709.490250] [<800842f8>] warn_slowpath_null+0x18/0x24
[  709.495747] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  709.502621] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  709.508418] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  709.514922] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  709.520877] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  709.527287] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  709.533510] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  709.539206] [<80087634>] tasklet_action+0x80/0xc8
[  709.544064] [<80086f68>] __do_softirq+0x26c/0x32c
[  709.548943] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  709.554170] [<80060830>] ret_from_irq+0x0/0x4
[  709.558675] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  709.563632] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  709.568869] [<8049cbec>] start_kernel+0x488/0x4a8
[  709.573726] 
[  709.575275] ---[ end trace b5612eece15fa343 ]---
[  709.580053] ------------[ cut here ]------------
[  709.585106] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  709.597507] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  709.766516] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  709.773703] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  709.773703] 	  804f37e0 294d527f 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  709.773703] 	  00000003 294d527f 803fea6c 8046597c 876de3b8 800ab9a0 8760486b 00000000
[  709.773703] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  709.773703] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  709.773703] 	  ...
[  709.810726] Call Trace:
[  709.813263] [<80072378>] show_stack+0x50/0x84
[  709.817782] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  709.823176] [<800842f8>] warn_slowpath_null+0x18/0x24
[  709.828674] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  709.835573] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  709.841350] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  709.847859] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  709.853813] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  709.860223] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  709.866454] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  709.872138] [<80087634>] tasklet_action+0x80/0xc8
[  709.877011] [<80086f68>] __do_softirq+0x26c/0x32c
[  709.881885] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  709.887108] [<80060830>] ret_from_irq+0x0/0x4
[  709.891611] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  709.896578] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  709.901804] [<8049cbec>] start_kernel+0x488/0x4a8
[  709.906671] 
[  709.908211] ---[ end trace b5612eece15fa344 ]---
[  709.913023] ------------[ cut here ]------------
[  709.918080] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  709.930488] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  710.099487] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  710.106683] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  710.106683] 	  804f37e0 29520521 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  710.106683] 	  00000003 29520521 803fea6c 8046597c 876de3b8 800ab9a0 8760486b 00000000
[  710.106683] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  710.106683] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  710.106683] 	  ...
[  710.143697] Call Trace:
[  710.146244] [<80072378>] show_stack+0x50/0x84
[  710.150753] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  710.156159] [<800842f8>] warn_slowpath_null+0x18/0x24
[  710.161644] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  710.168543] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  710.174343] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  710.180835] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  710.186797] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  710.193200] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  710.199430] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  710.205128] [<80087634>] tasklet_action+0x80/0xc8
[  710.209990] [<80086f68>] __do_softirq+0x26c/0x32c
[  710.214867] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  710.220086] [<80060830>] ret_from_irq+0x0/0x4
[  710.224602] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  710.229557] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  710.234795] [<8049cbec>] start_kernel+0x488/0x4a8
[  710.239651] 
[  710.241193] ---[ end trace b5612eece15fa345 ]---
[  710.245975] ------------[ cut here ]------------
[  710.251019] WARNING: CPU: 0 PID: 0 at compat-wireless-2016-06-20/net/mac80211/tx.c:1514 ieee80211_tx_dequeue+0x17c/0x968 [mac80211]()
[  710.263424] Modules linked in: ath9k ath9k_common iptable_nat ath9k_hw ath nf_nat_ipv4 nf_conntrack_ipv6 nf_conntrack_ipv4 mac80211 ipt_REJECT ipt_MASQUERADE ebtable_nat ebtable_filter ebtable_broute cfg80211 xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_id xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_LOG xt_IPMARK xt_HL xt_DSCP xt_CT xt_CLASSIFY nf_reject_ipv4 nf_nat_redirect nf_nat_masquerade_ipv4 nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_rtcache nf_conntrack_netlink nf_conntrack iptable_raw iptable_mangle iptable_filter ipt_ECN ip_tables ebtables ebt_vlan ebt_stp ebt_snat ebt_redirect ebt_pkttype ebt_mark_m ebt_mark ebt_limit ebt_ip6 ebt_ip ebt_dnat ebt_arpreply ebt_arp ebt_among ebt_802_3 crc_ccitt compat_xtables compat br_netfilter arptable_filter arpt_mangle arp_tables sch_cake em_nbyte sch_htb sch_prio sch_dsmark sch_pie sch_gred em_meta sch_teql cls_basic act_ipt sch_red em_text sch_tbf act_police sch_codel sch_sfq em_cmp sch_fq act_skbedit act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw sch_hfsc sch_ingress leds_wndr3700_usb ledtrig_usbdev xt_set ip_set_list_set ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_rt ip6t_frag ip6t_hbh ip6t_eui64 ip6t_mh ip6t_ah ip6t_ipv6header ip6t_REJECT nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_raw ip6table_mangle ip6table_filter ip6_tables x_tables ifb sit tunnel4 ip_tunnel tun ohci_platform ohci_hcd ehci_platform ehci_hcd gpio_button_hotplug usbcore nls_base usb_common
[  710.432422] CPU: 0 PID: 0 Comm: swapper Tainted: G        W       4.4.19 #0
[  710.439610] Stack : 804205e4 00000000 00000001 80480000 8046f058 8046ece3 803f9bd0 00000000
[  710.439610] 	  804f37e0 2956fb64 871e2000 871e2008 876de3b8 800ada74 80400a84 80460000
[  710.439610] 	  00000003 2956fb64 803fea6c 8046597c 876de3b8 800ab9a0 8760486b 00000000
[  710.439610] 	  8046b1a0 80231300 00000000 00000000 00000000 00000000 00000000 00000000
[  710.439610] 	  00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[  710.439610] 	  ...
[  710.476632] Call Trace:
[  710.479170] [<80072378>] show_stack+0x50/0x84
[  710.483679] [<80084240>] warn_slowpath_common+0xa4/0xd4
[  710.489086] [<800842f8>] warn_slowpath_null+0x18/0x24
[  710.494591] [<8712cf2c>] ieee80211_tx_dequeue+0x17c/0x968 [mac80211]
[  710.501465] [<870ea900>] ath_tid_dequeue+0x98/0x13c [ath9k]
[  710.507261] [<870ea9f8>] ath_tx_get_tid_subframe+0x54/0x1ec [ath9k]
[  710.513752] [<870eb354>] ath_txq_schedule+0x540/0x650 [ath9k]
[  710.519715] [<870ec018>] ath_tx_process_buffer+0x9d0/0xa18 [ath9k]
[  710.526126] [<870ecd6c>] ath_tx_edma_tasklet+0x2d0/0x324 [ath9k]
[  710.532343] [<870e4fa0>] ath9k_tasklet+0x24c/0x2b0 [ath9k]
[  710.538039] [<80087634>] tasklet_action+0x80/0xc8
[  710.542899] [<80086f68>] __do_softirq+0x26c/0x32c
[  710.547778] [<8006a908>] plat_irq_dispatch+0xd4/0x10c
[  710.552995] [<80060830>] ret_from_irq+0x0/0x4
[  710.557511] [<8006ec00>] r4k_wait_irqoff+0x18/0x20
[  710.562467] [<800a87ac>] cpu_startup_entry+0xf8/0x184
[  710.567703] [<8049cbec>] start_kernel+0x488/0x4a8
[  710.572560] 
[  710.574111] ---[ end trace b5612eece15fa346 ]---

[-- Attachment #3: clearlybetteratmcs4.png --]
[-- Type: image/png, Size: 225733 bytes --]

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 21:02                         ` Dave Taht
@ 2016-09-05 21:25                           ` Toke Høiland-Jørgensen
  2016-09-05 21:29                             ` Dave Taht
  2016-09-05 22:01                             ` Toke Høiland-Jørgensen
  0 siblings, 2 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 21:25 UTC (permalink / raw)
  To: Dave Taht; +Cc: Felix Fietkau, make-wifi-fast

Dave Taht <dave.taht@gmail.com> writes:

>> Ah, no, those are not panics, those are warnings being triggered by the
>> fast_tx pointer going while the packet was queued. Now, the
>> xmit_fast_finish() function doesn't actually use that for anything other
>> than crypto key configuration, so it would probably be feasible to get
>> rid of that check in the dequeue path.
>>
>> How many of those warnings do you see?
>
> I'm not crazy, I run the rrul test at the conclusion of the run. Which this was.
>
> I'll go run it on a fresh boot but...
>
> dmesg | grep 'cut here'
>
> [  707.011531] ------------[ cut here ]------------
> [  707.343296] ------------[ cut here ]------------
> [  707.676275] ------------[ cut here ]------------
> [  708.009204] ------------[ cut here ]------------
> [  708.342138] ------------[ cut here ]------------
> [  709.247082] ------------[ cut here ]------------
> [  709.580053] ------------[ cut here ]------------
> [  709.913023] ------------[ cut here ]------------
> [  710.245975] ------------[ cut here ]------------
>
> Also attached.
>
>> And what do you have to do to get
>> traffic to flow again?
>
> Seems to come back after a while.

Right. Put up a bunch of new images (the ones with version +3).
Completely untested, but should get rid of the need for the fast_tx
pointer. See if you can blow those up? :)
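
(A minimal sketch of the general idea, re-deriving the key at dequeue time
instead of trusting a fast_tx pointer cached at enqueue time; the helper
name is invented and this is not lifted from the actual patch:)

	/* Sketch: look the key up when the packet actually leaves the TXQ,
	 * since the fast_tx entry may have gone away while it was queued.
	 * Caller is assumed to hold rcu_read_lock(). */
	static struct ieee80211_key *
	dequeue_lookup_key(struct ieee80211_sub_if_data *sdata,
			   struct sta_info *sta)
	{
		struct ieee80211_key *key;

		key = rcu_dereference(sta->ptk[sta->ptk_idx]);
		if (!key)
			key = rcu_dereference(sdata->default_unicast_key);

		return key;	/* NULL here means drop or send unprotected */
	}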

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 21:25                           ` Toke Høiland-Jørgensen
@ 2016-09-05 21:29                             ` Dave Taht
  2016-09-05 21:35                               ` Toke Høiland-Jørgensen
  2016-09-05 22:01                             ` Toke Høiland-Jørgensen
  1 sibling, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 21:29 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: Felix Fietkau, make-wifi-fast

Did a few tests of rrul and rrul_be standalone. Didn't blow it up.
Don't know what causes it, or where it is caused.

On Mon, Sep 5, 2016 at 2:25 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> Dave Taht <dave.taht@gmail.com> writes:
>
>>> Ah, no, those are not panics, those are warnings being triggered by the
>>> fast_tx pointer going while the packet was queued. Now, the
>>> xmit_fast_finish() function doesn't actually use that for anything other
>>> than crypto key configuration, so it would probably be feasible to get
>>> rid of that check in the dequeue path.
>>>
>>> How many of those warnings do you see?
>>
>> I'm not crazy, I run the rrul test at the conclusion of the run. Which this was.
>>
>> I'll go run it on a fresh boot but...
>>
>> dmesg | grep 'cut here'
>>
>> [  707.011531] ------------[ cut here ]------------
>> [  707.343296] ------------[ cut here ]------------
>> [  707.676275] ------------[ cut here ]------------
>> [  708.009204] ------------[ cut here ]------------
>> [  708.342138] ------------[ cut here ]------------
>> [  709.247082] ------------[ cut here ]------------
>> [  709.580053] ------------[ cut here ]------------
>> [  709.913023] ------------[ cut here ]------------
>> [  710.245975] ------------[ cut here ]------------
>>
>> Also attached.
>>
>>> And what do you have to do to get
>>> traffic to flow again?
>>
>> Seems to come back after a while.
>
> Right. Put up a bunch of new images (the ones with version +3).
> Completely untested, but should get rid of the need for the fast_tx
> pointer. See if you can blow those up? :)

ok. did you wedge the ath10k driver + firmware into there?

I'm about to go rewire some things and re-screw in some things...

> -Toke



-- 
Dave Täht
Let's go make home routers and wifi faster! With better software!
http://blog.cerowrt.org

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 21:29                             ` Dave Taht
@ 2016-09-05 21:35                               ` Toke Høiland-Jørgensen
  2016-09-05 21:42                                 ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 21:35 UTC (permalink / raw)
  To: Dave Taht; +Cc: Felix Fietkau, make-wifi-fast

Dave Taht <dave.taht@gmail.com> writes:

> Did a few tests of rrul and rrul_be standalone. Didn't blow it up.
> Don't know what causes it, it where it is caused.
>
> On Mon, Sep 5, 2016 at 2:25 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> Dave Taht <dave.taht@gmail.com> writes:
>>
>>>> Ah, no, those are not panics, those are warnings being triggered by the
>>>> fast_tx pointer going while the packet was queued. Now, the
>>>> xmit_fast_finish() function doesn't actually use that for anything other
>>>> than crypto key configuration, so it would probably be feasible to get
>>>> rid of that check in the dequeue path.
>>>>
>>>> How many of those warnings do you see?
>>>
>>> I'm not crazy, I run the rrul test at the conclusion of the run. Which this was.
>>>
>>> I'll go run it on a fresh boot but...
>>>
>>> dmesg | grep 'cut here'
>>>
>>> [  707.011531] ------------[ cut here ]------------
>>> [  707.343296] ------------[ cut here ]------------
>>> [  707.676275] ------------[ cut here ]------------
>>> [  708.009204] ------------[ cut here ]------------
>>> [  708.342138] ------------[ cut here ]------------
>>> [  709.247082] ------------[ cut here ]------------
>>> [  709.580053] ------------[ cut here ]------------
>>> [  709.913023] ------------[ cut here ]------------
>>> [  710.245975] ------------[ cut here ]------------
>>>
>>> Also attached.
>>>
>>>> And what do you have to do to get
>>>> traffic to flow again?
>>>
>>> Seems to come back after a while.
>>
>> Right. Put up a bunch of new images (the ones with version +3).
>> Completely untested, but should get rid of the need for the fast_tx
>> pointer. See if you can blow those up? :)
>
> ok. did you wedge the ath10k driver + firmware into there?
>
> I'm about to go rewire some things and re-screw in some things...

No, but I put the ath10k drivers and firmware in the packages/ dir.
There are a bunch of different ath10k firmwares, and everything is there
in both a stock and a -ct version. So knock yourself out, I suppose :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 21:35                               ` Toke Høiland-Jørgensen
@ 2016-09-05 21:42                                 ` Dave Taht
  2016-09-05 22:04                                   ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 21:42 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: Felix Fietkau, make-wifi-fast

Yea!

In theory the ct firmwares support adhoc on the ath10k.

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 21:25                           ` Toke Høiland-Jørgensen
  2016-09-05 21:29                             ` Dave Taht
@ 2016-09-05 22:01                             ` Toke Høiland-Jørgensen
  2016-09-05 22:08                               ` Dave Taht
  1 sibling, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 22:01 UTC (permalink / raw)
  To: Dave Taht; +Cc: make-wifi-fast

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> Dave Taht <dave.taht@gmail.com> writes:
>
>>> Ah, no, those are not panics, those are warnings being triggered by the
>>> fast_tx pointer going while the packet was queued. Now, the
>>> xmit_fast_finish() function doesn't actually use that for anything other
>>> than crypto key configuration, so it would probably be feasible to get
>>> rid of that check in the dequeue path.
>>>
>>> How many of those warnings do you see?
>>
>> I'm not crazy, I run the rrul test at the conclusion of the run. Which this was.
>>
>> I'll go run it on a fresh boot but...
>>
>> dmesg | grep 'cut here'
>>
>> [  707.011531] ------------[ cut here ]------------
>> [  707.343296] ------------[ cut here ]------------
>> [  707.676275] ------------[ cut here ]------------
>> [  708.009204] ------------[ cut here ]------------
>> [  708.342138] ------------[ cut here ]------------
>> [  709.247082] ------------[ cut here ]------------
>> [  709.580053] ------------[ cut here ]------------
>> [  709.913023] ------------[ cut here ]------------
>> [  710.245975] ------------[ cut here ]------------
>>
>> Also attached.
>>
>>> And what do you have to do to get
>>> traffic to flow again?
>>
>> Seems to come back after a while.
>
> Right. Put up a bunch of new images (the ones with version +3).
> Completely untested, but should get rid of the need for the fast_tx
> pointer. See if you can blow those up? :)

BTW, if you do test these, please test them with crypto on (as well).
That's where the largest potential for things blowing up is. :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 21:42                                 ` Dave Taht
@ 2016-09-05 22:04                                   ` Dave Taht
  0 siblings, 0 replies; 77+ messages in thread
From: Dave Taht @ 2016-09-05 22:04 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: Felix Fietkau, make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 439 bytes --]

Pounding the ath10k flat now.

Top screenshot and some quick measurements attached; will return to the ath9k later...

One of the reasons I was interested in Felix's softirq patch was watching
softirq eat so much CPU here... which sqm-scripts also exercises....

From what I can tell, the fq_codel and intermediate queues for ath10k
are not in this stuff (yet).

(I don't care (yet)! Establishing a baseline with newer hardware here I am!)

[-- Attachment #2: poundingath10kflat.png --]
[-- Type: image/png, Size: 318379 bytes --]

[-- Attachment #3: ath10kpoundedflat.png --]
[-- Type: image/png, Size: 91730 bytes --]

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 22:01                             ` Toke Høiland-Jørgensen
@ 2016-09-05 22:08                               ` Dave Taht
  2016-09-05 22:31                                 ` Dave Taht
  0 siblings, 1 reply; 77+ messages in thread
From: Dave Taht @ 2016-09-05 22:08 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast

>> Right. Put up a bunch of new images (the ones with version +3).
>> Completely untested, but should get rid of the need for the fast_tx
>> pointer. See if you can blow those up? :)
>
> BTW, if you do test these, please test them with crypto on (as well).
> That's where the largest potential for things blowing up is. :)

My first goal was to get something that survived being pounded and didn't
crash under basic conditions.

My second goal was to nail things up to get repeatable tests
(um, er, I need to patch a few x86 boxes too).

My third goal was to get a bunch of clients running, and the fourth to
thoroughly test crypto this time around.

Sorry, I'm a bit behind on goals 2-5 as yet.

Anyone for adhoc?

:)

> -Toke



-- 
Dave Täht
Let's go make home routers and wifi faster! With better software!
http://blog.cerowrt.org

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 22:08                               ` Dave Taht
@ 2016-09-05 22:31                                 ` Dave Taht
  0 siblings, 0 replies; 77+ messages in thread
From: Dave Taht @ 2016-09-05 22:31 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast

[-- Attachment #1: Type: text/plain, Size: 1307 bytes --]

On +3:

I was able to get rrul to sort of fail on the ath10k too, in the same
way: no kernel messages, but it had trouble establishing the control
channel after rrul ran, on rrul_be.

It could be OSX going to hell; I can only get it to happen after
pounding it flat with a dozen other tests first. If I just run it
after a period of idle, I can't get it to fail.

The ath9k test of your +3 that I just did on the same hardware completed
all tests (except for rrul, same as above), and throughput was a bit
better than in the last series I'd done (I switched to a channel that is
less busy at this hour of the day).

No kernel messages.

There is an interesting pattern emerging relative to the number of
flows (see attached), which I guess I'll have to go check against
linux<->linux TCP... I've always kind of wondered about block acks
vs. delayed TCP acks.... We're not out of CPU at this lower rate, so it
shouldn't be fq_codel's CPU usage at fault...

The quantum is writable?

Cake works, too. While I won't certify this new hardware as ready for
wide testing by others yet, it really does look quite good for the
basic stuff we care about.

I will fire up a long series of 300 tests on this, move back to the
Archer, and see if I can sort through the x86 patchset... and get in a
swim.

Happy. Thx. A nice Labor Day...

[-- Attachment #2: patternemerging_for1-2-4-12flows.png --]
[-- Type: image/png, Size: 95841 bytes --]

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
  2016-09-05 16:06             ` Toke Høiland-Jørgensen
  2016-09-05 17:49             ` Felix Fietkau
@ 2016-09-06 11:43             ` Toke Høiland-Jørgensen
  2016-09-06 11:45               ` Toke Høiland-Jørgensen
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
  3 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-06 11:43 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.
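
In ieee80211_tx(), the split boils down to the following (an annotated
sketch of the corresponding hunk further down in this patch; the comments
are explanatory only):

	/* Early, order-insensitive handlers run before any queueing. */
	if (invoke_tx_handlers_early(&tx))
		return false;	/* dropped or queued by an early handler */

	/* If a TXQ accepts the skb, stop here; the reorder-sensitive
	 * (late) handlers will run on dequeue instead.
	 */
	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
		return true;

	/* No TXQ: apply the late handlers immediately, as before. */
	if (!invoke_tx_handlers_late(&tx))
		result = __ieee80211_tx(local, &tx.skbs, led_len,
					tx.sta, txpending);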

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v6:
  - Invoking the rate control handler can cause packets to be generated
    (for establishing a BA session). This can cause a deadlock because
    dequeue can happen while sta->lock is held. So this version moves
    the rate control handler back before the intermediate queue step.
  - Fix sequence number allocation on the slow path.
  
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 250 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 192 insertions(+), 62 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9211cce..d36f3b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index efc38e7..94f38cc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -853,8 +859,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
 
 	return TX_CONTINUE;
 }
@@ -1403,6 +1408,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1425,6 +1431,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1481,33 +1488,61 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
 
 	ieee80211_set_skb_vif(skb, txqi);
 
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx = rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb);
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		tx.skb = skb;
+		if (txq->sta) {
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = tx.sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -1521,6 +1556,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1604,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1619,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1739,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1697,16 +1759,42 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb = NULL;
 		goto txh_done;
 	}
 
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1821,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1904,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3159,7 +3262,7 @@ out:
 }
 
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,9 +3273,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid = IEEE80211_NUM_TIDS;
 
 	/* control port protocol needs a lot of special handling */
@@ -3210,8 +3313,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,24 +3341,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number += 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
 
 	__skb_queue_head_init(&tx.skbs);
 
@@ -3283,6 +3367,54 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) = tid;
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
@@ -3309,12 +3441,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
@@ -3342,7 +3468,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
                               ` (2 preceding siblings ...)
  2016-09-06 11:43             ` Toke Høiland-Jørgensen
@ 2016-09-06 11:44             ` Toke Høiland-Jørgensen
  2016-09-06 22:04               ` Felix Fietkau
                                 ` (4 more replies)
  3 siblings, 5 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-06 11:44 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v7:
- Don't rely on the fast_tx pointer on TXQ dequeue; it can go away while
  the packet is queued, and we don't actually need it, since we can get
  the key configuration and offset from the packet info.

 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 255 +++++++++++++++++++++++++++++++++------------
 3 files changed, 195 insertions(+), 64 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9211cce..d36f3b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index efc38e7..f8eec60 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
 
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key_conf *key_conf,
+				       struct sk_buff *skb);
+
 /* misc utils */
 
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -853,8 +859,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
 
 	return TX_CONTINUE;
 }
@@ -1403,6 +1408,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1425,6 +1431,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1485,12 +1492,19 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -1498,16 +1512,37 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		u8 pn_offs = 0;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		if (info->control.hw_key)
+			pn_offs = ieee80211_hdrlen(hdr->frame_control);
+
+		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
+					   info->control.hw_key, skb);
+	} else {
+		struct ieee80211_tx_data tx = { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local = local;
+		tx.skb = skb;
+		if (txq->sta) {
+			tx.sta = container_of(txq->sta, struct sta_info, sta);
+			tx.sdata = tx.sta->sdata;
+		} else {
+			tx.sdata = vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -1521,6 +1556,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1604,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1619,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1739,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1697,16 +1759,42 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb = NULL;
 		goto txh_done;
 	}
 
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1821,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+
+	if (r)
+		return r;
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1904,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3159,7 +3262,7 @@ out:
 }
 
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,9 +3273,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx = NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid = IEEE80211_NUM_TIDS;
 
 	/* control port protocol needs a lot of special handling */
@@ -3210,8 +3313,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,24 +3341,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number += 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
 
 	__skb_queue_head_init(&tx.skbs);
 
@@ -3283,22 +3367,71 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
+				   &fast_tx->key->conf, skb);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key_conf *key_conf,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) = tid;
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
 	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
 	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
 
-	if (fast_tx->pn_offs) {
+	if (pn_offs) {
 		u64 pn;
-		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
+		u8 *crypto_hdr = skb->data + pn_offs;
 
-		switch (fast_tx->key->conf.cipher) {
+		switch (key_conf->cipher) {
 		case WLAN_CIPHER_SUITE_CCMP:
 		case WLAN_CIPHER_SUITE_CCMP_256:
 		case WLAN_CIPHER_SUITE_GCMP:
 		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
+			pn = atomic64_inc_return(&key_conf->tx_pn);
 			crypto_hdr[0] = pn;
 			crypto_hdr[1] = pn >> 8;
 			crypto_hdr[4] = pn >> 16;
@@ -3309,12 +3442,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		sdata = container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
 
@@ -3342,7 +3469,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-06 11:43             ` Toke Høiland-Jørgensen
@ 2016-09-06 11:45               ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-06 11:45 UTC (permalink / raw)
  To: make-wifi-fast; +Cc: linux-wireless

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

Sorry for sending this again; meant to send v8. :/

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
@ 2016-09-06 22:04               ` Felix Fietkau
  2016-09-12 12:35               ` Johannes Berg
                                 ` (3 subsequent siblings)
  4 siblings, 0 replies; 77+ messages in thread
From: Felix Fietkau @ 2016-09-06 22:04 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On 2016-09-06 13:44, Toke Høiland-Jørgensen wrote:
> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
> 
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
> 
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
> 
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Acked-by: Felix Fietkau <nbd@nbd.name>


^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
  2016-09-06 22:04               ` Felix Fietkau
@ 2016-09-12 12:35               ` Johannes Berg
  2016-09-12 13:08                 ` Toke Høiland-Jørgensen
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 0/2] mac80211: TXQ dequeue path rework Toke Høiland-Jørgensen
                                 ` (2 subsequent siblings)
  4 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-12 12:35 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless


> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
> +				       struct sta_info *sta, u8 pn_offs,
> +				       struct ieee80211_key_conf *key_conf,
> +				       struct sk_buff *skb);
> +

I'm not very happy with this - I think you should do some
refactoring/code move in a separate prior patch to avoid this.

> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>  		struct sta_info *sta = container_of(txq->sta, struct sta_info,
>  						    sta);
> -		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +		u8 pn_offs = 0;
>  
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
> -		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
> -			info->flags |= IEEE80211_TX_CTL_AMPDU;
> -		else
> -			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
> +		if (info->control.hw_key)
> +			pn_offs = ieee80211_hdrlen(hdr->frame_control);

Not very happy with this either - the fast-xmit path explicitly tries
to avoid all these calculations.

I suppose I don't have to care all that much about the TXQs, but ...

Then again, adding a field in the skb->cb for the sake of this? No, not really either.


> +		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
> +					   info->control.hw_key, skb);

I don't see how keeping the info->control.hw_key pointer across the
TXQ/FQ/Codel queueing isn't a potential bug? Probably one that already
exists in your code today, before this patch, of course.


> +	} else {
> +		struct ieee80211_tx_data tx = { };
> +
> +		__skb_queue_head_init(&tx.skbs);
> +		tx.local = local;
> +		tx.skb = skb;

an empty initializer is weird - why not at least move local/skb
initializations into it? Even txq->sta, I guess, since you can assign
txq->sta either way.

> -	CALL_TXH(ieee80211_tx_h_select_key);
> +
>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
[...]
> 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>  		__skb_queue_tail(&tx->skbs, tx->skb);
>  		tx->skb = NULL;
>  		goto txh_done;
>  	}
> 
> +	CALL_TXH(ieee80211_tx_h_select_key);

What happens for the IEEE80211_TX_INTFL_RETRANSMISSION packets wrt. key
selection? Why is it OK to change this?

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-12 12:35               ` Johannes Berg
@ 2016-09-12 13:08                 ` Toke Høiland-Jørgensen
  2016-09-12 13:19                   ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-12 13:08 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
>> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
>> +				       struct sta_info *sta, u8 pn_offs,
>> +				       struct ieee80211_key_conf *key_conf,
>> +				       struct sk_buff *skb);
>> +
>
> I'm not very happy with this - I think you should do some
> refactoring/code move in a separate prior patch to avoid this.

Noted, will do.

>> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>>  		struct sta_info *sta = container_of(txq->sta, struct sta_info,
>>  						    sta);
>> -		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>> +		u8 pn_offs = 0;
>>  
>> -		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
>> -		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
>> -			info->flags |= IEEE80211_TX_CTL_AMPDU;
>> -		else
>> -			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
>> +		if (info->control.hw_key)
>> +			pn_offs = ieee80211_hdrlen(hdr->frame_control);
>
> Not very happy with this either - the fast-xmit path explicitly tries
> to avoid all these calculations.

Well, the TXQ already adds a lot of other overhead (hashing on the
packet header, for one), so my guess would be that this would be
negligible compared to all that? 
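
(For context, the per-packet work the FQ layer already does on enqueue is
roughly the following; this is a from-memory sketch of the fq_impl.h flow
classification, so treat the exact names as approximate:)

	u32 hash = skb_get_hash_perturb(skb, fq->perturbation);
	u32 idx = reciprocal_scale(hash, fq->flows_cnt);
	struct fq_flow *flow = &fq->flows[idx];	/* per-flow queue for fq/codel */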

> I suppose I don't have to care all that much about the TXQs, but ...
>
> Then again, adding a field in the skb->cb for the sake of this? No,
> not really either.

So that's a "keep it", then? :)

>> +		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
>> +					   info->control.hw_key, skb);
>
> I don't see how keeping the info->control.hw_key pointer across the
> TXQ/FQ/Codel queueing isn't a potential bug? Probably one that already
> exists in your code today, before this patch, of course.

You mean the key could get removed from the hardware while the packet
was queued? Can certainly add a check for that. Under what conditions
does that happen? Does it make sense to try to recover from it (I guess
by calling tx_h_select_key), or is it rare enough that giving up and
dropping the packet makes more sense?

>> +	} else {
>> +		struct ieee80211_tx_data tx = { };
>> +
>> +		__skb_queue_head_init(&tx.skbs);
>> +		tx.local = local;
>> +		tx.skb = skb;
>
> an empty initializer is weird - why not at least move local/skb
> initializations into it? Even txq->sta, I guess, since you can assign
> txq->sta either way.

Yup, makes sense. Noted.

>> -	CALL_TXH(ieee80211_tx_h_select_key);
>> +
>>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
> [...]
>> 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>>  		__skb_queue_tail(&tx->skbs, tx->skb);
>>  		tx->skb = NULL;
>>  		goto txh_done;
>>  	}
>> 
>> +	CALL_TXH(ieee80211_tx_h_select_key);
>
> What happens for the IEEE80211_TX_INTFL_RETRANSMISSION packets wrt.
> key selection? Why is it OK to change this?

You're right, that's an oversight on my part. Will fix.

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-12 13:08                 ` Toke Høiland-Jørgensen
@ 2016-09-12 13:19                   ` Johannes Berg
  0 siblings, 0 replies; 77+ messages in thread
From: Johannes Berg @ 2016-09-12 13:19 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> Well, the TXQ already adds a lot of other overhead (hashing on the
> packet header, for one), so my guess would be that this would be
> negligible compared to all that? 
> 
> > 
> > I suppose I don't have to care all that much about the TXQs, but
> > ...
> > 
> > Then again, adding a field in the skb->cb for the sake of this? No,
> > not really either.
> 
> So that's a "keep it", then? :)

Yeah I think so :)

> > > +		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
> > > +					   info->control.hw_key, skb);
> > 
> > I don't see how keeping the info->control.hw_key pointer across the
> > TXQ/FQ/Codel queueing isn't a potential bug? Probably one that
> > already exists in your code today, before this patch, of course.
> 
> You mean the key could get removed from the hardware while the packet
> was queued? Can certainly add a check for that. Under what conditions
> does that happen? Does it make sense to try to recover from it (I
> guess by calling tx_h_select_key), or is it rare enough that giving
> up and dropping the packet makes more sense?

Not just from the hardware, more importantly the whole key structure
can be kfree()d, leading to use-after-free here, no?

Fast-xmit solves this by invalidating the fast-xmit cache when the key
pointer changes/goes away and possibly punting some frames to the slow
path, but you've absolutely no protection on these pointers here within
the TXQs, afaict?

A similar situation occurs with other pointers, like stations and vifs,
but when those are removed the TXQs are obviously flushed entirely,
so they're not relevant.

With the key though, frames can be on the queue while a key is removed,
and even before this patch, drivers would consequently access an
invalid key pointer.
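
Spelled out, the hazard is roughly this (just a sketch of the sequence,
not actual code from the patch):

	/* enqueue time: select_key caches a pointer in the skb's cb */
	info->control.hw_key = &key->conf;
	/* ... skb sits in the TXQ/FQ/Codel queue for a while ... */

	/* meanwhile: the key is removed and its struct is freed */
	ieee80211_key_free(key, ...);

	/* dequeue time: the stale pointer is dereferenced */
	pn = atomic64_inc_return(&info->control.hw_key->tx_pn);
	/* use-after-free: hw_key now points into freed memory */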

Mind you, as I just wrote I think that issue exists even before this
patch, so you should probably look at it separately. Felix might know
better too.

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v9 0/2] mac80211: TXQ dequeue path rework
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
  2016-09-06 22:04               ` Felix Fietkau
  2016-09-12 12:35               ` Johannes Berg
@ 2016-09-22 17:04               ` Toke Høiland-Jørgensen
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 1/2] mac80211: Move ieee80211_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
  4 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-22 17:04 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

This is the ninth iteration of my attempts to reorder the TXQ dequeue
path to avoid issues with reorder-sensitive operations. This version is
split into two patches; the first one moves ieee80211_tx_dequeue() to
avoid adding function stubs at the top of tx.c.

Changes since v8:
- Don't add function stubs to the beginning of tx.c
- Don't use control.hw_key from the dequeued packet, since that can go
  away while the packet is queued. Instead, run the select_key handler
  on dequeue and use the key from that (see the rough sketch after this
  list).
- Change places that check tin.backlog_packets as an indication of
  whether the TXQ has anything queued to also look at the 'frags' queue.
- Don't change the order of the select_key handler with respect to the
  other handlers.
- Rebase on current mac80211-next tree.
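
A rough sketch of what the dequeue side then does for fast-xmit packets
(simplified, and the exact flag checks and arguments are my approximation;
the authoritative version is in patch 2/2):

	/* Look the key up again on dequeue instead of trusting a pointer
	 * cached at enqueue time, since that may have been freed meanwhile.
	 */
	if (ieee80211_tx_h_select_key(&tx) != TX_CONTINUE) {
		ieee80211_free_txskb(&local->hw, skb);
		goto begin;
	}

	if (tx.key && (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
		pn_offs = ieee80211_hdrlen(hdr->frame_control);

	ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
				   tx.key ? &tx.key->conf : NULL, skb);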

Toke Høiland-Jørgensen (2):
  mac80211: Move ieee802111_tx_dequeue() to later in tx.c
  mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue

 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   8 ++
 net/mac80211/rx.c          |   4 +-
 net/mac80211/sta_info.c    |  10 +-
 net/mac80211/tx.c          | 335 +++++++++++++++++++++++++++++++--------------
 net/mac80211/util.c        |  11 +-
 6 files changed, 256 insertions(+), 114 deletions(-)

-- 
2.9.3

base-commit: c13ed534b8db543e4d8ead3885f4b06585a5771c

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v9 1/2] mac80211: Move ieee80211_tx_dequeue() to later in tx.c
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
                                 ` (2 preceding siblings ...)
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 0/2] mac80211: TXQ dequeue path rework Toke Høiland-Jørgensen
@ 2016-09-22 17:04               ` Toke Høiland-Jørgensen
  2016-09-30 11:13                 ` Johannes Berg
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
  4 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-22 17:04 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ path restructure requires ieee80211_tx_dequeue() to call TX
handlers and parts of the xmit_fast path. Move the function to later in
tx.c in preparation for this.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 net/mac80211/tx.c | 90 +++++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 61d302d..e8c9964 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1476,51 +1476,6 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
 	spin_unlock_bh(&fq->lock);
 }
 
-struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
-				     struct ieee80211_txq *txq)
-{
-	struct ieee80211_local *local = hw_to_local(hw);
-	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb = NULL;
-	struct fq *fq = &local->fq;
-	struct fq_tin *tin = &txqi->tin;
-
-	spin_lock_bh(&fq->lock);
-
-	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
-		goto out;
-
-	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
-	if (!skb)
-		goto out;
-
-	ieee80211_set_skb_vif(skb, txqi);
-
-	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
-		struct sta_info *sta = container_of(txq->sta, struct sta_info,
-						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
-	}
-
-out:
-	spin_unlock_bh(&fq->lock);
-
-	if (skb && skb_has_frag_list(skb) &&
-	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
-		skb_linearize(skb);
-
-	return skb;
-}
-EXPORT_SYMBOL(ieee80211_tx_dequeue);
-
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -3311,6 +3266,51 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	return true;
 }
 
+struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
+				     struct ieee80211_txq *txq)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb = NULL;
+	struct fq *fq = &local->fq;
+	struct fq_tin *tin = &txqi->tin;
+
+	spin_lock_bh(&fq->lock);
+
+	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
+		goto out;
+
+	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
+	if (!skb)
+		goto out;
+
+	ieee80211_set_skb_vif(skb, txqi);
+
+	hdr = (struct ieee80211_hdr *)skb->data;
+	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+		struct sta_info *sta = container_of(txq->sta, struct sta_info,
+						    sta);
+		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
+		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
+			info->flags |= IEEE80211_TX_CTL_AMPDU;
+		else
+			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+	}
+
+out:
+	spin_unlock_bh(&fq->lock);
+
+	if (skb && skb_has_frag_list(skb) &&
+	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+		skb_linearize(skb);
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_tx_dequeue);
+
 void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 				  struct net_device *dev,
 				  u32 info_flags)
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
                                 ` (3 preceding siblings ...)
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 1/2] mac80211: Move ieee80211_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
@ 2016-09-22 17:04               ` Toke Høiland-Jørgensen
  2016-09-30 10:27                 ` Johannes Berg
  2016-09-30 12:49                 ` Johannes Berg
  4 siblings, 2 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-22 17:04 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases all over the place
for when TXQ is enabled, at the cost of making the fast_xmit and TX
handler code slightly more complex.
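
On the dequeue side, the fragment handling amounts to roughly the
following (a simplified sketch based on the v7/v8 revisions above; names
match the code in this series):

	/* Leftover fragments from a previous dequeue go out first, so a
	 * fragment burst is never interleaved with other frames.
	 */
	skb = __skb_dequeue(&txqi->frags);
	if (skb)
		goto out;

begin:
	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
	if (!skb)
		goto out;

	/* ... run the late handlers; the fragmentation handler may have
	 * split the frame into several skbs on tx.skbs ...
	 */
	skb = __skb_dequeue(&tx.skbs);		/* head fragment goes out now */
	if (!skb_queue_empty(&tx.skbs))		/* stash the rest for later */
		skb_queue_splice_tail(&tx.skbs, &txqi->frags);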

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   8 ++
 net/mac80211/rx.c          |   4 +-
 net/mac80211/sta_info.c    |  10 +-
 net/mac80211/tx.c          | 287 +++++++++++++++++++++++++++++++++------------
 net/mac80211/util.c        |  11 +-
 6 files changed, 232 insertions(+), 90 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5296100..9463039 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c71c735..caca265 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,12 +813,14 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
 	struct codel_stats cstats;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
@@ -1481,6 +1483,12 @@ static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
 	return container_of(txq, struct txq_info, txq);
 }
 
+static inline bool txq_has_queue(struct ieee80211_txq *txq)
+{
+	struct txq_info *txqi = to_txq_info(txq);
+	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
+}
+
 static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
 {
 	return ether_addr_equal(raddr, addr) ||
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e796060..ae5786b8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1323,9 +1323,7 @@ static void sta_ps_start(struct sta_info *sta)
 		return;
 
 	for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-		struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
-
-		if (txqi->tin.backlog_packets)
+		if (txq_has_queue(sta->sta.txq[tid]))
 			set_bit(tid, &sta->txq_buffered_tids);
 		else
 			clear_bit(tid, &sta->txq_buffered_tids);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 1b1b28f..167bff0 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1212,12 +1212,10 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 
 	if (sta->sta.txq[0]) {
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
-			struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
-
-			if (!txqi->tin.backlog_packets)
+			if (!txq_has_queue(sta->sta.txq[i]))
 				continue;
 
-			drv_wake_tx_queue(local, txqi);
+			drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
 		}
 	}
 
@@ -1649,9 +1647,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
 			return;
 
 		for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-			struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
-
-			if (!(tids & BIT(tid)) || txqi->tin.backlog_packets)
+			if (!(tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid]))
 				continue;
 
 			sta_info_recalc_tim(sta);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e8c9964..75e6adf 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -853,8 +853,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
 
 	return TX_CONTINUE;
 }
@@ -1404,6 +1403,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
 	codel_stats_init(&txqi->cstats);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1426,6 +1426,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1476,6 +1477,47 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
 	spin_unlock_bh(&fq->lock);
 }
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1483,9 +1525,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1500,21 +1540,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1635,10 +1660,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb = NULL;
@@ -1688,6 +1741,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+
+	if (r)
+		return r;
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1762,7 +1824,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3106,8 +3174,73 @@ out:
 	return ret;
 }
 
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key *key,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	if (key)
+		info->control.hw_key = &key->conf;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) = tid;
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+	/* statistics normally done by ieee80211_tx_h_stats (but that
+	 * has to consider fragmentation, so is more complex)
+	 */
+	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
+
+	if (pn_offs) {
+		u64 pn;
+		u8 *crypto_hdr = skb->data + pn_offs;
+
+		switch (key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_CCMP:
+		case WLAN_CIPHER_SUITE_CCMP_256:
+		case WLAN_CIPHER_SUITE_GCMP:
+		case WLAN_CIPHER_SUITE_GCMP_256:
+			pn = atomic64_inc_return(&key->conf.tx_pn);
+			crypto_hdr[0] = pn;
+			crypto_hdr[1] = pn >> 8;
+			crypto_hdr[4] = pn >> 16;
+			crypto_hdr[5] = pn >> 24;
+			crypto_hdr[6] = pn >> 32;
+			crypto_hdr[7] = pn >> 40;
+			break;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3158,8 +3291,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3188,24 +3319,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number += 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
 
 	__skb_queue_head_init(&tx.skbs);
 
@@ -3215,9 +3329,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	tx.sta = sta;
 	tx.key = fast_tx->key;
 
-	if (fast_tx->key)
-		info->control.hw_key = &fast_tx->key->conf;
-
 	if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
 		tx.skb = skb;
 		r = ieee80211_tx_h_rate_ctrl(&tx);
@@ -3231,31 +3342,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	/* statistics normally done by ieee80211_tx_h_stats (but that
-	 * has to consider fragmentation, so is more complex)
-	 */
-	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
-	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
-
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
 
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] = pn;
-			crypto_hdr[1] = pn >> 8;
-			crypto_hdr[4] = pn >> 16;
-			crypto_hdr[5] = pn >> 24;
-			crypto_hdr[6] = pn >> 32;
-			crypto_hdr[7] = pn >> 40;
-			break;
-		}
-	}
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
+				   fast_tx->key, skb);
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		sdata = container_of(sdata->bss,
@@ -3275,12 +3366,22 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -3288,16 +3389,46 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+
+	memset(&tx, 0, sizeof(tx));
+	__skb_queue_head_init(&tx.skbs);
+	tx.local = local;
+	tx.skb = skb;
+	tx.sdata = vif_to_sdata(info->control.vif);
+
+	if (txq->sta)
+		tx.sta = container_of(txq->sta, struct sta_info, sta);
+
+	/*
+	 * The key can be removed while the packet was queued, so need to call
+	 * this here to get the current key.
+	 */
+	r = ieee80211_tx_h_select_key(&tx);
+	if (r != TX_CONTINUE) {
+		ieee80211_free_txskb(&local->hw, skb);
+		goto begin;
+	}
+
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		u8 pn_offs = 0;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		if (tx.key &&
+		    (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+			pn_offs = ieee80211_hdrlen(hdr->frame_control);
+
+		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
+					   tx.key, skb);
+	} else {
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -3335,7 +3466,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b6865d8..8006f9a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
 			     unsigned long *byte_cnt)
 {
 	struct txq_info *txqi = to_txq_info(txq);
+	u32 frag_cnt = 0, frag_bytes = 0;
+	struct sk_buff *skb;
+
+	skb_queue_walk(&txqi->frags, skb) {
+		frag_cnt++;
+		frag_bytes += skb->len;
+	}
 
 	if (frame_cnt)
-		*frame_cnt = txqi->tin.backlog_packets;
+		*frame_cnt = txqi->tin.backlog_packets + frag_cnt;
 
 	if (byte_cnt)
-		*byte_cnt = txqi->tin.backlog_bytes;
+		*byte_cnt = txqi->tin.backlog_bytes + frag_bytes;
 }
 EXPORT_SYMBOL(ieee80211_txq_get_depth);
-- 
2.9.3

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
@ 2016-09-30 10:27                 ` Johannes Berg
  2016-09-30 12:39                   ` Toke Høiland-Jørgensen
  2016-09-30 12:49                 ` Johannes Berg
  1 sibling, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-30 10:27 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

Hi Toke,

Sorry for the delay reviewing this.

I think I still have a few comments/questions.

> +static inline bool txq_has_queue(struct ieee80211_txq *txq)
> +{
> +	struct txq_info *txqi = to_txq_info(txq);
> +	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
> +}

Tiny nit - there should probably be a blank line between the two lines
here, but I could just fix that when I apply if you don't resend anyway
for some other reason.

[snip helper stuff that looks fine]

> -	if (!tx->sta->sta.txq[0])
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
> +	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);

Just to make sure I get this right - this is because the handler is now
run on dequeue, so the special case is no longer needed?

>  #define CALL_TXH(txh) \
> @@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);

Just for reference - the code block here that's unchanged contains
this:

        CALL_TXH(ieee80211_tx_h_dynamic_ps);
        CALL_TXH(ieee80211_tx_h_check_assoc);
        CALL_TXH(ieee80211_tx_h_ps_buf);
        CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
        CALL_TXH(ieee80211_tx_h_select_key);
        if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
                CALL_TXH(ieee80211_tx_h_rate_ctrl);

> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
> +{
> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
> +	ieee80211_tx_result res = TX_CONTINUE;
> +
>  	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>  		__skb_queue_tail(&tx->skbs, tx->skb);
>  		tx->skb = NULL;

And this code here is also unchanged from the original TX handler
invocation, so contains this:

        if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
                __skb_queue_tail(&tx->skbs, tx->skb);
                tx->skb = NULL;
                goto txh_done;
        }

        CALL_TXH(ieee80211_tx_h_michael_mic_add);
        CALL_TXH(ieee80211_tx_h_sequence);
        CALL_TXH(ieee80211_tx_h_fragment);
        /* handlers after fragment must be aware of tx info fragmentation! */
        CALL_TXH(ieee80211_tx_h_stats);
        CALL_TXH(ieee80211_tx_h_encrypt);
        if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
                CALL_TXH(ieee80211_tx_h_calculate_duration);

But now you have a problem (that you solved) that the key pointer can
be invalidated while you have the packet queued between the two points,
and then the tx_h_michael_mic_add and/or tx_h_encrypt would crash.

You solve this by re-running tx_h_select_key() on dequeue, but it's not
clear to me why you didn't move that to the late handlers instead?

I *think* it should commute with the rate control handler, but even so,
wouldn't it make more sense to have rate control late? Assuming the
packets are queued for some amount of time, having rate control
information queued with them would get stale.

Similarly, it seems to me that checking the control port protocol later
(or perhaps duplicating that?) would be a good idea?


> +/*
> + * Can be called while the sta lock is held. Anything that can cause packets to
> + * be generated will cause deadlock!
> + */
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
> +				       struct sta_info *sta, u8 pn_offs,
> +				       struct ieee80211_key *key,
> +				       struct sk_buff *skb)

That should be a void function now, you never check the return value
and only return true anyway.

> +	struct ieee80211_tx_info *info;
> +	struct ieee80211_tx_data tx;
> +	ieee80211_tx_result r;
> +

nit: extra blank line

>  	spin_lock_bh(&fq->lock);
>  
>  	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>  		goto out;
>  
> +	/* Make sure fragments stay together. */
> +	skb = __skb_dequeue(&txqi->frags);
> +	if (skb)
> +		goto out;
> +
> +begin:

I guess now that you introduced that anyway, we should consider making
the skb_linearize() failure go there. Should be a follow-up patch
though.

> +	/*
> +	 * The key can be removed while the packet was queued, so need to call
> +	 * this here to get the current key.
> +	 */
> +	r = ieee80211_tx_h_select_key(&tx);
> +	if (r != TX_CONTINUE) {
> +		ieee80211_free_txskb(&local->hw, skb);
> +		goto begin;
> +	}
> +
> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {

It's a bit unfortunate that you lose fast-xmit here completely for the
key stuff, but I don't see a good way to avoid that, other than
completely rejiggering all the (possibly affected) queues when keys
change... might be very complex to do that, certainly a follow-up patch
if it's desired.

This check seems a bit weird though - how could fast-xmit be set
without a TXQ station?

> +++ b/net/mac80211/util.c
> @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
>  			     unsigned long *byte_cnt)
>  {
>  	struct txq_info *txqi = to_txq_info(txq);
> +	u32 frag_cnt = 0, frag_bytes = 0;
> +	struct sk_buff *skb;
> +
> +	skb_queue_walk(&txqi->frags, skb) {
> +		frag_cnt++;
> +		frag_bytes += skb->len;
> +	}

I hope this is called infrequently :)

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 1/2] mac80211: Move ieee80211_tx_dequeue() to later in tx.c
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 1/2] mac80211: Move ieee80211_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
@ 2016-09-30 11:13                 ` Johannes Berg
  0 siblings, 0 replies; 77+ messages in thread
From: Johannes Berg @ 2016-09-30 11:13 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On Thu, 2016-09-22 at 19:04 +0200, Toke Høiland-Jørgensen wrote:
> The TXQ path restructure requires ieee80211_tx_dequeue() to call TX
> handlers and parts of the xmit_fast path. Move the function to later
> in tx.c in preparation for this.
> 
Applied.

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 10:27                 ` Johannes Berg
@ 2016-09-30 12:39                   ` Toke Høiland-Jørgensen
  2016-09-30 12:43                     ` Johannes Berg
  0 siblings, 1 reply; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-30 12:39 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

> Hi Toke,
>
> Sorry for the delay reviewing this.
>
> I think I still have a few comments/questions.

No worries. And not terribly surprised ;)

>> +static inline bool txq_has_queue(struct ieee80211_txq *txq)
>> +{
>> +	struct txq_info *txqi = to_txq_info(txq);
>> +	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
>> +}
>
> Tiny nit - there should probably be a blank line between the two lines
> here, but I could just fix that when I apply if you don't resend anyway
> for some other reason.

Noted.

> [snip helper stuff that looks fine]
>
>> -	if (!tx->sta->sta.txq[0])
>> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
>> +	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
>
> Just to make sure I get this right - this is because the handler is now
> run on dequeue, so the special case is no longer needed?

Yup. The same change is made in xmit_fast (but obscured by the moving of
the surrounding code into _finish()).

>>  #define CALL_TXH(txh) \
>> @@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
>
> Just for reference - the code block here that's unchanged contains
> this:
>
>         CALL_TXH(ieee80211_tx_h_dynamic_ps);
>         CALL_TXH(ieee80211_tx_h_check_assoc);
>         CALL_TXH(ieee80211_tx_h_ps_buf);
>         CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
>         CALL_TXH(ieee80211_tx_h_select_key);
>         if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>                 CALL_TXH(ieee80211_tx_h_rate_ctrl);
>
>> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
>> +{
>> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
>> +	ieee80211_tx_result res = TX_CONTINUE;
>> +
>>  	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>>  		__skb_queue_tail(&tx->skbs, tx->skb);
>>  		tx->skb = NULL;
>
> And this code here is also unchanged from the original TX handler
> invocation, so contains this:
>
>         if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>                 __skb_queue_tail(&tx->skbs, tx->skb);
>                 tx->skb = NULL;
>                 goto txh_done;
>         }
>
>         CALL_TXH(ieee80211_tx_h_michael_mic_add);
>         CALL_TXH(ieee80211_tx_h_sequence);
>         CALL_TXH(ieee80211_tx_h_fragment);
>         /* handlers after fragment must be aware of tx info fragmentation! */
>         CALL_TXH(ieee80211_tx_h_stats);
>         CALL_TXH(ieee80211_tx_h_encrypt);
>         if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>                 CALL_TXH(ieee80211_tx_h_calculate_duration);
>
> But now you have a problem (that you solved) that the key pointer can
> be invalidated while you have the packet queued between the two points,
> and then the tx_h_michael_mic_add and/or tx_h_encrypt would crash.
>
> You solve this by re-running tx_h_select_key() on dequeue, but it's not
> clear to me why you didn't move that to the late handlers instead?

Because I need to run it anyway for the xmit_fast path on dequeue. I
thought doing it this way simplifies the code (at the cost of the
handler getting called twice when xmit_fast is not active).
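
(For reference, the dequeue hunk above boils down to roughly the following
-- just a paraphrase of the patch, much condensed, with the pn_offs/sta
setup omitted, not new code:

	r = ieee80211_tx_h_select_key(&tx);	/* refresh tx.key */
	if (r != TX_CONTINUE) {
		ieee80211_free_txskb(&local->hw, skb);
		goto begin;			/* try the next packet */
	}

	if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT)
		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
					   tx.key, skb);
	else
		invoke_tx_handlers_late(&tx);	/* seq/frag/stats/encrypt */

so select_key always runs on dequeue, and only the non-fast case goes
through the late handlers as well.)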

> I *think* it should commute with the rate control handler, but even
> so, wouldn't it make more sense to have rate control late? Assuming
> the packets are queued for some amount of time, having rate control
> information queued with them would get stale.

Yes, having rate control run at dequeue would be good, and that's what I
did initially. However, I found that this would lead to a deadlock
because the rate control handler would send out packets in some cases (I
forget the details but can go back and check if needed). And since the
dequeue function is called with the driver TXQ lock held, that would
lead to a deadlock when those packets made it to the driver TX path.

So I decided to just keep it this way for now; I plan to go poking into
the rate controller later anyway, so moving the handler to later could
be part of that.
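
(From memory the failure mode had roughly this shape -- a sketch of the
call chain only, not the exact path:

	driver dequeues, holding its own TXQ lock
	  -> ieee80211_tx_dequeue()
	      -> invoke_tx_handlers_late()
	          -> ieee80211_tx_h_rate_ctrl()
	              -> in some cases generates/sends a frame
	                  -> re-enters the driver TX path, which tries to
	                     take the same lock again => deadlock.)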

> Similarly, it seems to me that checking the control port protocol later
> (or perhaps duplicating that?) would be a good idea?

But that handler only sets a few flags? Is
tx->sdata->control_port_protocol likely to change while the packet is
queued?

>> +/*
>> + * Can be called while the sta lock is held. Anything that can cause packets to
>> + * be generated will cause deadlock!
>> + */
>> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
>> +				       struct sta_info *sta, u8 pn_offs,
>> +				       struct ieee80211_key *key,
>> +				       struct sk_buff *skb)
>
> That should be a void function now, you never check the return value
> and only return true anyway.

Noted.

>> +	struct ieee80211_tx_info *info;
>> +	struct ieee80211_tx_data tx;
>> +	ieee80211_tx_result r;
>> +
>
> nit: extra blank line

The horror ;) (thought I got rid of all those; ah well, will fix)
>
>>  	spin_lock_bh(&fq->lock);
>>  
>>  	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>>  		goto out;
>>  
>> +	/* Make sure fragments stay together. */
>> +	skb = __skb_dequeue(&txqi->frags);
>> +	if (skb)
>> +		goto out;
>> +
>> +begin:
>
> I guess now that you introduced that anyway, we should consider making
> the skb_linearize() failure go there. Should be a follow-up patch
> though.

Can do.

>
>> +	/*
>> +	 * The key can be removed while the packet was queued, so need to call
>> +	 * this here to get the current key.
>> +	 */
>> +	r = ieee80211_tx_h_select_key(&tx);
>> +	if (r != TX_CONTINUE) {
>> +		ieee80211_free_txskb(&local->hw, skb);
>> +		goto begin;
>> +	}
>> +
>> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>
> It's a bit unfortunate that you lose fast-xmit here completely for the
> key stuff, but I don't see a good way to avoid that, other than
> completely rejiggering all the (possibly affected) queues when keys
> change... might be very complex to do that, certainly a follow-up
> patch if it's desired.

Yeah, figured it was better to have something that's correct and then go
back and change it if the performance hit turns out to be too high.

> This check seems a bit weird though - how could fast-xmit be set
> without a TXQ station?

I think that is probably just left over from before I introduced the
control flag. Should be fine to remove it.

>> +++ b/net/mac80211/util.c
>> @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
>>  			     unsigned long *byte_cnt)
>>  {
>>  	struct txq_info *txqi = to_txq_info(txq);
>> +	u32 frag_cnt = 0, frag_bytes = 0;
>> +	struct sk_buff *skb;
>> +
>> +	skb_queue_walk(&txqi->frags, skb) {
>> +		frag_cnt++;
>> +		frag_bytes += skb->len;
>> +	}
>
> I hope this is called infrequently :)

Well, ath10k is the only user. It does get called on each wake_tx_queue,
though, so not that infrequently. My reasoning was that since the frags
queue is never going to have more than a fairly small number of packets
in it (those produced from a single split packet), counting this way is
acceptable instead of keeping a state variable up to date. Can change it
if you disagree :)


Not sure if you want a v10, or if you're satisfied with the above
comments and will just fix up the nits on merging?

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 12:39                   ` Toke Høiland-Jørgensen
@ 2016-09-30 12:43                     ` Johannes Berg
  2016-09-30 12:45                       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-30 12:43 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

> Because I need to run it anyway for the xmit_fast path on dequeue. I
> thought doing it this way simplifies the code (at the cost of the
> handler getting called twice when xmit_fast is not active).

Ok, that's fair.

> > I *think* it should commute with the rate control handler, but even
> > so, wouldn't it make more sense to have rate control late? Assuming
> > the packets are queued for some amount of time, having rate control
> > information queued with them would get stale.
> 
> Yes, having rate control run at dequeue would be good, and that's
> what I did initially. However, I found that this would lead to a
> deadlock because the rate control handler would send out packets in
> some cases (I forget the details but can go back and check if
> needed). And since the dequeue function is called with the driver TXQ
> lock held, that would lead to a deadlock when those packets made it
> to the driver TX path.

That seems really odd, but I can see how a deadlock happens then.

> So I decided to just keep it this way for now; I plan to go poking
> into the rate controller later anyway, so moving the handler to later
> could be part of that.

Sure, that's fair.

> But that handler only sets a few flags? Is
> tx->sdata->control_port_protocol likely to change while the packet is
> queued?

Oh right, I confused things there. We check the controlled port much
earlier, but anyway that should be OK.

> > It's a bit unfortunate that you lose fast-xmit here completely for
> > the key stuff, but I don't see a good way to avoid that, other than
> > completely rejiggering all the (possibly affected) queues when keys
> > change... might be very complex to do that, certainly a follow-up
> > patch if it's desired.
> 
> Yeah, figured it was better to have something that's correct and then
> go back and change it if the performance hit turns out to be too
> high.

Makes sense.

> > This check seems a bit weird though - how could fast-xmit be set
> > without a TXQ station?
> 
> I think that is probably just left over from before I introduced the
> control flag. Should be fine to remove it.

Ok.

> > 
> > > 
> > > +++ b/net/mac80211/util.c
> > > @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
> > >  			     unsigned long *byte_cnt)
> > >  {
> > >  	struct txq_info *txqi = to_txq_info(txq);
> > > +	u32 frag_cnt = 0, frag_bytes = 0;
> > > +	struct sk_buff *skb;
> > > +
> > > +	skb_queue_walk(&txqi->frags, skb) {
> > > +		frag_cnt++;
> > > +		frag_bytes += skb->len;
> > > +	}
> > 
> > I hope this is called infrequently :)
> 
> Well, ath10k is the only user. It does get called on each
> wake_tx_queue, though, so not that infrequently. My reasoning was
> that since the frags queue is never going to have more than a fairly
> small number of packets in it (those produced from a single split
> packet), counting this way is acceptable instead of keeping a state
> variable up to date. Can change it if you disagree :)

No, I guess you're right, it can't be a long queue.

> Not sure if you want a v10, or if you're satisfied with the above
> comments and will just fix up the nits on merging?
> 

I'll fix it up. Thanks!

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 12:43                     ` Johannes Berg
@ 2016-09-30 12:45                       ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-30 12:45 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> Not sure if you want a v10, or if you're satisfied with the above
>> comments and will just fix up the nits on merging?
>> 
>
> I'll fix it up. Thanks!

Cool, thanks :)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
  2016-09-30 10:27                 ` Johannes Berg
@ 2016-09-30 12:49                 ` Johannes Berg
  2016-09-30 14:01                   ` Toke Høiland-Jørgensen
  1 sibling, 1 reply; 77+ messages in thread
From: Johannes Berg @ 2016-09-30 12:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

Applied, with the nits fixed as discussed.

Come to think of it, if somebody is bored ;-) perhaps a hwsim option to
use TXQs (should be optional I guess) would be nice so we can exercise
this code with the wpa_supplicant hwsim tests. That would have caught
the TKIP issues etc. pretty early on too, I think.

johannes

^ permalink raw reply	[flat|nested] 77+ messages in thread

* Re: [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 12:49                 ` Johannes Berg
@ 2016-09-30 14:01                   ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 77+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-30 14:01 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

> Applied, with the nits fixed as discussed.

Awesome, thanks!

> Come to think of it, if somebody is bored ;-) perhaps a hwsim option
> to use TXQs (should be optional I guess) would be nice so we can
> exercise this code with the wpa_supplicant hwsim tests. That would
> have caught the TKIP issues etc. pretty early on too, I think.

Noted. I'll look into that the next time I'm bored ;)

-Toke

^ permalink raw reply	[flat|nested] 77+ messages in thread

end of thread, other threads:[~2016-09-30 14:01 UTC | newest]

Thread overview: 77+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-17 12:58 [Make-wifi-fast] [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue Toke Høiland-Jørgensen
2016-08-17 13:08 ` Johannes Berg
2016-08-17 13:16   ` Toke Høiland-Jørgensen
2016-08-17 13:18     ` Johannes Berg
2016-08-17 13:23       ` Toke Høiland-Jørgensen
2016-08-17 14:45 ` [Make-wifi-fast] [PATCH v2] " Toke Høiland-Jørgensen
2016-08-17 15:47   ` Noah Causin
2016-08-17 22:33     ` Toke Høiland-Jørgensen
2016-08-19  3:06       ` Noah Causin
2016-08-22 14:24         ` Toke Høiland-Jørgensen
2016-08-23 17:06           ` Noah Causin
2016-08-23 17:51             ` Toke Høiland-Jørgensen
2016-08-17 19:49   ` Johannes Berg
2016-08-17 20:07     ` Dave Taht
2016-08-17 20:43       ` Johannes Berg
2016-08-22 14:47         ` Toke Høiland-Jørgensen
2016-08-26  8:38           ` Johannes Berg
2016-08-26  8:54             ` Toke Høiland-Jørgensen
2016-08-24 16:20   ` [Make-wifi-fast] [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
2016-08-24 22:40     ` Noah Causin
2016-08-25 12:45       ` Toke Høiland-Jørgensen
2016-08-26 14:30         ` Toke Høiland-Jørgensen
2016-08-26 14:51           ` Dave Taht
2016-08-30 13:15     ` [Make-wifi-fast] [PATCH v4] " Toke Høiland-Jørgensen
2016-08-30 13:17       ` Toke Høiland-Jørgensen
2016-08-31 21:06       ` Johannes Berg
2016-09-01  8:23         ` Toke Høiland-Jørgensen
2016-09-01  8:34           ` Johannes Berg
2016-09-01  8:38             ` Toke Høiland-Jørgensen
2016-09-01  9:07               ` Johannes Berg
2016-09-01  9:20                 ` Toke Høiland-Jørgensen
2016-09-01  9:27                   ` Johannes Berg
2016-09-01  9:42                     ` Toke Høiland-Jørgensen
2016-09-01 16:03       ` [Make-wifi-fast] [PATCH v5] " Toke Høiland-Jørgensen
2016-09-01 17:59         ` Johannes Berg
2016-09-01 18:30           ` Toke Høiland-Jørgensen
2016-09-01 18:35             ` Johannes Berg
2016-09-02  2:48         ` Jason Andryuk
2016-09-02  9:27           ` Toke Høiland-Jørgensen
2016-09-02 13:41         ` [Make-wifi-fast] [PATCH v6] " Toke Høiland-Jørgensen
2016-09-02 14:44           ` Toke Høiland-Jørgensen
2016-09-05 11:30           ` [Make-wifi-fast] [PATCH v7] " Toke Høiland-Jørgensen
2016-09-05 16:06             ` Toke Høiland-Jørgensen
2016-09-05 17:00               ` Dave Taht
2016-09-05 17:26                 ` Toke Høiland-Jørgensen
2016-09-05 17:59                   ` Dave Taht
2016-09-05 20:23                     ` Dave Taht
2016-09-05 20:45                       ` Toke Høiland-Jørgensen
2016-09-05 21:02                         ` Dave Taht
2016-09-05 21:25                           ` Toke Høiland-Jørgensen
2016-09-05 21:29                             ` Dave Taht
2016-09-05 21:35                               ` Toke Høiland-Jørgensen
2016-09-05 21:42                                 ` Dave Taht
2016-09-05 22:04                                   ` Dave Taht
2016-09-05 22:01                             ` Toke Høiland-Jørgensen
2016-09-05 22:08                               ` Dave Taht
2016-09-05 22:31                                 ` Dave Taht
2016-09-05 17:49             ` Felix Fietkau
2016-09-05 17:59               ` Toke Høiland-Jørgensen
2016-09-05 18:45                 ` Felix Fietkau
2016-09-06 11:43             ` Toke Høiland-Jørgensen
2016-09-06 11:45               ` Toke Høiland-Jørgensen
2016-09-06 11:44             ` [Make-wifi-fast] [PATCH v8] " Toke Høiland-Jørgensen
2016-09-06 22:04               ` Felix Fietkau
2016-09-12 12:35               ` Johannes Berg
2016-09-12 13:08                 ` Toke Høiland-Jørgensen
2016-09-12 13:19                   ` Johannes Berg
2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 0/2] mac80211: TXQ dequeue path rework Toke Høiland-Jørgensen
2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 1/2] mac80211: Move ieee80211_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
2016-09-30 11:13                 ` Johannes Berg
2016-09-22 17:04               ` [Make-wifi-fast] [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
2016-09-30 10:27                 ` Johannes Berg
2016-09-30 12:39                   ` Toke Høiland-Jørgensen
2016-09-30 12:43                     ` Johannes Berg
2016-09-30 12:45                       ` Toke Høiland-Jørgensen
2016-09-30 12:49                 ` Johannes Berg
2016-09-30 14:01                   ` Toke Høiland-Jørgensen
