[Bloat] Fwd: [PATCH RFC v1 net-next 1/4] net: Introduce Qdisc backpressure infrastructure

Dave Taht dave.taht at gmail.com
Sun May 8 22:22:14 EDT 2022


I am encouraged that this layer of the stack can finally see backpressure for UDP.

---------- Forwarded message ---------
From: Peilin Ye <yepeilin.cs at gmail.com>
Date: Sun, May 8, 2022 at 2:28 AM
Subject: [PATCH RFC v1 net-next 1/4] net: Introduce Qdisc backpressure infrastructure
To: David S. Miller <davem at davemloft.net>, Eric Dumazet
<eric.dumazet at gmail.com>, Jakub Kicinski <kuba at kernel.org>, Paolo
Abeni <pabeni at redhat.com>, Hideaki YOSHIFUJI
<yoshfuji at linux-ipv6.org>, David Ahern <dsahern at kernel.org>, Jamal
Hadi Salim <jhs at mojatatu.com>, Cong Wang <xiyou.wangcong at gmail.com>,
Jiri Pirko <jiri at resnulli.us>
Cc: Peilin Ye <peilin.ye at bytedance.com>, <netdev at vger.kernel.org>,
<linux-kernel at vger.kernel.org>, Cong Wang <cong.wang at bytedance.com>,
Peilin Ye <yepeilin.cs at gmail.com>


From: Peilin Ye <peilin.ye at bytedance.com>

Currently, sockets (especially UDP ones) can drop a lot of traffic at TC
egress when rate limited by shaper Qdiscs like HTB.  Improve this by
implementing the following state machine for sockets (currently only UDP
and TCP sockets are supported):

  ┌────────────────┐  [a]  ┌────────────────┐  [b]  ┌────────────────┐
  │ SK_UNTHROTTLED │─ ─ ─ >│  SK_OVERLIMIT  │─ ─ ─ >│  SK_THROTTLED  │
  └────────────────┘       └────────────────┘       └────────────────┘
           └─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ < ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘
                                   [c]

Take TBF as an example (see the sketch after this list):

  [a] When TBF's inner Qdisc (e.g. bfifo) becomes full, TBF fails to
      enqueue an skb belonging to UNTHROTTLED socket A.  Socket A is
      marked as OVERLIMIT and added to TBF's "backpressure_list";

  [b] When TBF runs out of tokens, it marks all OVERLIMIT sockets
      (including A) on its backpressure_list as THROTTLED, and schedules
      a Qdisc watchdog timer to wait for more tokens;

  [c] After the timer expires, all THROTTLED sockets (including A) are
      removed from TBF's backpressure_list, and marked as UNTHROTTLED.
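
For illustration, here is a rough sketch of where a shaper like TBF could
call these helpers.  The actual sch_tbf changes are in a later patch of
this series and are not quoted here; everything below except the three
qdisc_backpressure_*() helpers is illustrative:

  /* Illustrative sketch only: field names such as q->qdisc and q->watchdog
   * follow TBF's existing structure, and enough_tokens / next_refill are
   * placeholders, none of this is taken verbatim from the series.
   */
  static int tbf_enqueue_sketch(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
  {
          struct tbf_sched_data *q = qdisc_priv(sch);
          int ret;

          ret = qdisc_enqueue(skb, q->qdisc, to_free);
          if (ret != NET_XMIT_SUCCESS) {
                  /* [a] inner Qdisc is full: mark skb->sk as OVERLIMIT
                   * and link it onto sch->backpressure_list */
                  if (net_xmit_drop_count(ret))
                          qdisc_backpressure_overlimit(sch, skb);
                  return ret;
          }
          /* ... existing TBF enqueue accounting ... */
          return NET_XMIT_SUCCESS;
  }

  static struct sk_buff *tbf_dequeue_sketch(struct Qdisc *sch)
  {
          struct tbf_sched_data *q = qdisc_priv(sch);

          /* ... existing token computation ... */
          if (!enough_tokens) {
                  /* [b] out of tokens: move every OVERLIMIT socket on
                   * the list to THROTTLED, then wait for the watchdog */
                  qdisc_backpressure_throttle(sch);
                  qdisc_watchdog_schedule_ns(&q->watchdog, next_refill);
                  return NULL;
          }
          /* ... existing dequeue path ... */
  }

Step [c] needs no shaper-specific code: once the watchdog reschedules the
Qdisc, net_tx_action() calls qdisc_backpressure_unthrottle() under the
root lock (see the net/core/dev.c hunk below).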

UDP and TCP sleep on THROTTLED sockets in sock_wait_for_wmem() and
sk_stream_wait_memory() respectively.  epoll() and friends should not
report EPOLL{OUT,WRNORM} for THROTTLED sockets.  When unthrottling in [c],
call ->sk_write_space() to wake up UDP and/or TCP waiters, and notify
SOCKWQ_ASYNC_NOSPACE subscribers.
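
From user space, the intended effect can be seen with nothing more than a
blocking UDP sender behind the shaper.  A minimal sketch (my illustration,
not part of the patch; destination address and shaper setup are arbitrary):

  #include <arpa/inet.h>
  #include <netinet/in.h>
  #include <poll.h>
  #include <string.h>
  #include <sys/socket.h>

  int main(void)
  {
          int fd = socket(AF_INET, SOCK_DGRAM, 0);
          struct sockaddr_in dst = {
                  .sin_family = AF_INET,
                  .sin_port = htons(9),   /* arbitrary example port */
                  .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
          };
          struct pollfd pfd = { .fd = fd, .events = POLLOUT };
          char buf[1400];

          memset(buf, 0, sizeof(buf));
          connect(fd, (struct sockaddr *)&dst, sizeof(dst));

          for (;;) {
                  /* For a THROTTLED socket, poll()/epoll should not
                   * report POLLOUT; this returns once ->sk_write_space()
                   * runs in [c]. */
                  poll(&pfd, 1, -1);

                  /* A blocking send sleeps in sock_wait_for_wmem() while
                   * the socket is throttled, instead of the skbs being
                   * dropped at TC egress. */
                  send(fd, buf, sizeof(buf), 0);
          }
  }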

For each device, backpressure_list operations are always serialized by
Qdisc root_lock.

When marking a socket as OVERLIMIT in [a], use a cmpxchg() to make sure
that multiple CPUs do not try to add this socket to different
backpressure_lists (on different devices) concurrently.

After removing a THROTTLED socket from backpressure_list in [c], use a
smp_store_release() to make sure changes have been committed to memory
before marking the socket as UNTHROTTLED.
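
To spell out the intended pairing between [a] and [c] (my annotation, not
code from the patch):

  /* CPU 0, unthrottling in [c] under device X's root_lock: */
          list_del_init(&sk->sk_backpressure_node);
          /* release: the unlink above is visible before the status
           * change below */
          smp_store_release(&sk->sk_backpressure_status, SK_UNTHROTTLED);

  /* CPU 1, marking the same socket OVERLIMIT in [a] on device Y: */
          if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED,
                      SK_OVERLIMIT) == SK_UNTHROTTLED) {
                  /* a successful cmpxchg() is fully ordered, so seeing
                   * SK_UNTHROTTLED here also means seeing the completed
                   * list_del_init(); only this CPU may now link the node
                   * onto device Y's backpressure_list */
                  list_add_tail(&sk->sk_backpressure_node,
                                &sch->backpressure_list);
          }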

Suggested-by: Cong Wang <cong.wang at bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye at bytedance.com>
---
 include/net/sch_generic.h | 43 +++++++++++++++++++++++++++++++++++++++
 include/net/sock.h        | 18 +++++++++++++++-
 net/core/dev.c            |  1 +
 net/core/sock.c           |  6 ++++--
 net/ipv4/tcp_ipv4.c       | 11 +++++++---
 net/sched/sch_generic.c   |  4 ++++
 6 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9bab396c1f3b..5ddbe0b65cb6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
+#include <net/sock.h>

 struct Qdisc_ops;
 struct qdisc_walker;
@@ -108,6 +109,7 @@ struct Qdisc {
        struct gnet_stats_queue __percpu *cpu_qstats;
        int                     pad;
        refcount_t              refcnt;
+       struct list_head        backpressure_list;

        /*
         * For performance sake on SMP, we put highly modified fields at the end
@@ -1221,6 +1223,47 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
        return NET_XMIT_DROP;
 }

+static inline void qdisc_backpressure_overlimit(struct Qdisc *sch, struct sk_buff *skb)
+{
+       struct sock *sk = skb->sk;
+
+       if (!sk || !sk_fullsock(sk))
+               return;
+
+       if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED, SK_OVERLIMIT) == SK_UNTHROTTLED) {
+               sock_hold(sk);
+               list_add_tail(&sk->sk_backpressure_node, &sch->backpressure_list);
+       }
+}
+
+static inline void qdisc_backpressure_throttle(struct Qdisc *sch)
+{
+       struct list_head *pos;
+       struct sock *sk;
+
+       list_for_each(pos, &sch->backpressure_list) {
+               sk = list_entry(pos, struct sock, sk_backpressure_node);
+
+               WRITE_ONCE(sk->sk_backpressure_status, SK_THROTTLED);
+       }
+}
+
+static inline void qdisc_backpressure_unthrottle(struct Qdisc *sch)
+{
+       struct list_head *pos, *next;
+       struct sock *sk;
+
+       list_for_each_safe(pos, next, &sch->backpressure_list) {
+               sk = list_entry(pos, struct sock, sk_backpressure_node);
+
+               list_del_init(pos);
+               smp_store_release(&sk->sk_backpressure_status, SK_UNTHROTTLED);
+               sk->sk_write_space(sk);
+
+               sock_put(sk);
+       }
+}
+
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index 73063c88a249..6ed2de43dc98 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -315,6 +315,8 @@ struct sk_filter;
   *    @sk_rcvtimeo: %SO_RCVTIMEO setting
   *    @sk_sndtimeo: %SO_SNDTIMEO setting
   *    @sk_txhash: computed flow hash for use on transmit
+  *    @sk_backpressure_status: Qdisc backpressure status
+  *    @sk_backpressure_node: linkage for Qdisc backpressure list
   *    @sk_txrehash: enable TX hash rethink
   *    @sk_filter: socket filtering instructions
   *    @sk_timer: sock cleanup timer
@@ -468,6 +470,8 @@ struct sock {
        unsigned int            sk_gso_max_size;
        gfp_t                   sk_allocation;
        __u32                   sk_txhash;
+       u32                     sk_backpressure_status; /* see enum sk_backpressure */
+       struct list_head        sk_backpressure_node;

        /*
         * Because of non atomicity rules, all
@@ -548,6 +552,12 @@ enum sk_pacing {
        SK_PACING_FQ            = 2,
 };

+enum sk_backpressure {
+       SK_UNTHROTTLED  = 0,
+       SK_OVERLIMIT    = 1,
+       SK_THROTTLED    = 2,
+};
+
 /* Pointer stored in sk_user_data might not be suitable for copying
  * when cloning the socket. For instance, it can point to a reference
  * counted object. sk_user_data bottom bit is set if pointer must not
@@ -2522,12 +2532,18 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)

 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

+static inline bool sk_is_throttled(const struct sock *sk)
+{
+       return READ_ONCE(sk->sk_backpressure_status) == SK_THROTTLED;
+}
+
 /*
  *     Default write policy as shown to user space via poll/select/SIGIO
  */
 static inline bool sock_writeable(const struct sock *sk)
 {
-       return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
+       return !sk_is_throttled(sk) &&
+              refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
 }

 static inline gfp_t gfp_any(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index c2d73595a7c3..7c3d136725b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5013,6 +5013,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
                        if (!(q->flags & TCQ_F_NOLOCK)) {
                                root_lock = qdisc_lock(q);
                                spin_lock(root_lock);
+                               qdisc_backpressure_unthrottle(q);
                        } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
                                                     &q->state))) {
                                /* There is a synchronize_net() between
diff --git a/net/core/sock.c b/net/core/sock.c
index be20a1af20e5..7ed9d2bd991f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2034,6 +2034,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,

                sock_net_set(sk, net);
                refcount_set(&sk->sk_wmem_alloc, 1);
+               INIT_LIST_HEAD(&sk->sk_backpressure_node);

                mem_cgroup_sk_alloc(sk);
                cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -2589,7 +2590,8 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-               if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
+               if (!sk_is_throttled(sk) &&
+                   refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
@@ -2624,7 +2626,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

-               if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
+               if (!sk_is_throttled(sk) && sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 918816ec5dd4..6e905995bfa2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3006,9 +3006,14 @@ void tcp4_proc_exit(void)
  */
 bool tcp_stream_memory_free(const struct sock *sk, int wake)
 {
-       const struct tcp_sock *tp = tcp_sk(sk);
-       u32 notsent_bytes = READ_ONCE(tp->write_seq) -
-                           READ_ONCE(tp->snd_nxt);
+       const struct tcp_sock *tp;
+       u32 notsent_bytes;
+
+       if (sk_is_throttled(sk))
+               return false;
+
+       tp = tcp_sk(sk);
+       notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);

        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dba0b3e24af5..9ab314b874a7 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -674,6 +674,7 @@ struct Qdisc noop_qdisc = {
                .qlen = 0,
                .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
        },
+       .backpressure_list =    LIST_HEAD_INIT(noop_qdisc.backpressure_list),
 };
 EXPORT_SYMBOL(noop_qdisc);

@@ -947,6 +948,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
        qdisc_skb_head_init(&sch->q);
        gnet_stats_basic_sync_init(&sch->bstats);
        spin_lock_init(&sch->q.lock);
+       INIT_LIST_HEAD(&sch->backpressure_list);

        if (ops->static_flags & TCQ_F_CPUSTATS) {
                sch->cpu_bstats =
@@ -1025,6 +1027,8 @@ void qdisc_reset(struct Qdisc *qdisc)
        if (ops->reset)
                ops->reset(qdisc);

+       qdisc_backpressure_unthrottle(qdisc);
+
        __skb_queue_purge(&qdisc->gso_skb);
        __skb_queue_purge(&qdisc->skb_bad_txq);

--
2.20.1



-- 
FQ World Domination pending: https://blog.cerowrt.org/post/state_of_fq_codel/
Dave Täht CEO, TekLibre, LLC

