[Codel] [RFC PATCH v2] tcp: TCP Small Queues
Yuchung Cheng
ycheng at google.com
Tue Jul 10 13:37:01 EDT 2012
On Tue, Jul 10, 2012 at 8:13 AM, Eric Dumazet <eric.dumazet at gmail.com> wrote:
> This introduce TSQ (TCP Small Queues)
>
> TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
> device queues), to reduce RTT and cwnd bias, part of the bufferbloat
> problem.
>
> sk->sk_wmem_alloc not allowed to grow above a given limit,
> allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
> given time.
>
> TSO packets are sized/capped to half the limit, so that we have two
> TSO packets in flight, allowing better bandwidth use.
>
> As a side effect, setting the limit to 40000 automatically reduces the
> standard gso max limit (65536) to 40000/2 : It can help to reduce
> latencies of high prio packets, having smaller TSO packets.
>
> This means we divert sock_wfree() to a tcp_wfree() handler, to
> queue/send following frames when skb_orphan() [2] is called for the
> already queued skbs.
>
> Results on my dev machine (tg3 nic) are really impressive, using
> standard pfifo_fast, and with or without TSO/GSO. Without reduction of
> nominal bandwidth.
>
> I no longer have 3MBytes backlogged in qdisc by a single netperf
> session, and both side socket autotuning no longer use 4 Mbytes.
>
> As skb destructor cannot restart xmit itself ( as qdisc lock might be
> taken at this point ), we delegate the work to a tasklet. We use one
> tasklest per cpu for performance reasons.
>
>
>
> [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
> [2] skb_orphan() is usually called at TX completion time,
> but some drivers call it in their start_xmit() handler.
> These drivers should at least use BQL, or else a single TCP
> session can still fill the whole NIC TX ring, since TSQ will
> have no effect.
>
> Not-Yet-Signed-off-by: Eric Dumazet <edumazet at google.com>
> ---
> include/linux/tcp.h | 9 ++
> include/net/tcp.h | 3
> net/ipv4/sysctl_net_ipv4.c | 7 +
> net/ipv4/tcp.c | 14 ++-
> net/ipv4/tcp_minisocks.c | 1
> net/ipv4/tcp_output.c | 132 ++++++++++++++++++++++++++++++++++-
> 6 files changed, 160 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 7d3bced..55b8cf9 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -339,6 +339,9 @@ struct tcp_sock {
> u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
> u32 lsndtime; /* timestamp of last sent data packet (for restart window) */
>
> + struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
> + unsigned long tsq_flags;
> +
> /* Data for direct copy to user */
> struct {
> struct sk_buff_head prequeue;
> @@ -494,6 +497,12 @@ struct tcp_sock {
> struct tcp_cookie_values *cookie_values;
> };
>
> +enum tsq_flags {
> + TSQ_THROTTLED,
> + TSQ_QUEUED,
> + TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
> +};
> +
> static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> {
> return (struct tcp_sock *)sk;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 53fb7d8..3a6ed09 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size;
> extern int sysctl_tcp_thin_linear_timeouts;
> extern int sysctl_tcp_thin_dupack;
> extern int sysctl_tcp_early_retrans;
> +extern int sysctl_tcp_limit_output_bytes;
>
> extern atomic_long_t tcp_memory_allocated;
> extern struct percpu_counter tcp_sockets_allocated;
> @@ -321,6 +322,8 @@ extern struct proto tcp_prot;
>
> extern void tcp_init_mem(struct net *net);
>
> +extern void tcp_tasklet_init(void);
> +
> extern void tcp_v4_err(struct sk_buff *skb, u32);
>
> extern void tcp_shutdown (struct sock *sk, int how);
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 12aa0c5..70730f7 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec
> },
> + {
> + .procname = "tcp_limit_output_bytes",
> + .data = &sysctl_tcp_limit_output_bytes,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec
> + },
> #ifdef CONFIG_NET_DMA
> {
> .procname = "tcp_dma_copybreak",
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 3ba605f..8838bd2 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk)
> skb_queue_head_init(&tp->out_of_order_queue);
> tcp_init_xmit_timers(sk);
> tcp_prequeue_init(tp);
> + INIT_LIST_HEAD(&tp->tsq_node);
>
> icsk->icsk_rto = TCP_TIMEOUT_INIT;
> tp->mdev = TCP_TIMEOUT_INIT;
> @@ -786,15 +787,17 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
> int large_allowed)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> - u32 xmit_size_goal, old_size_goal;
> + u32 xmit_size_goal, old_size_goal, gso_max_size;
>
> xmit_size_goal = mss_now;
>
> if (large_allowed && sk_can_gso(sk)) {
> - xmit_size_goal = ((sk->sk_gso_max_size - 1) -
> - inet_csk(sk)->icsk_af_ops->net_header_len -
> - inet_csk(sk)->icsk_ext_hdr_len -
> - tp->tcp_header_len);
> + gso_max_size = min_t(u32, sk->sk_gso_max_size,
> + sysctl_tcp_limit_output_bytes >> 1);
> + xmit_size_goal = (gso_max_size - 1) -
> + inet_csk(sk)->icsk_af_ops->net_header_len -
> + inet_csk(sk)->icsk_ext_hdr_len -
> + tp->tcp_header_len;
>
> xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
>
> @@ -3573,4 +3576,5 @@ void __init tcp_init(void)
> tcp_secret_primary = &tcp_secret_one;
> tcp_secret_retiring = &tcp_secret_two;
> tcp_secret_secondary = &tcp_secret_two;
> + tcp_tasklet_init();
> }
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 72b7c63..83b358f 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
> treq->snt_isn + 1 + tcp_s_data_size(oldtp);
>
> tcp_prequeue_init(newtp);
> + INIT_LIST_HEAD(&newtp->tsq_node);
>
> tcp_init_wl(newtp, treq->rcv_isn);
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index c465d3e..991ae45 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
> */
> int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
>
> +/* Default TSQ limit of two TSO segments */
> +int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
> +
> /* This limits the percentage of the congestion window which we
> * will allow a single TSO frame to consume. Building TSO frames
> * which are too large can cause TCP streams to be bursty.
> @@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
> int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
> EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
>
> +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
> + int push_one, gfp_t gfp);
>
> /* Account for new data that has been sent to the network. */
> static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
> @@ -783,6 +788,118 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
> return size;
> }
>
> +
> +/* TCP SMALL QUEUES (TSQ)
> + *
> + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
> + * to reduce RTT and bufferbloat.
> + * We do this using a special skb destructor (tcp_wfree).
> + *
> + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
> + * needs to be reallocated in a driver.
> + * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
> + *
> + * Since transmit from skb destructor is forbidden, we use a tasklet
> + * to process all sockets that eventually need to send more skbs.
> + * We use one tasklet per cpu, with its own queue of sockets.
> + */
> +struct tsq_tasklet {
> + struct tasklet_struct tasklet;
> + struct list_head head; /* queue of tcp sockets */
> +};
> +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
> +
> +/*
> + * One tasklest per cpu tries to send more skbs.
> + * We run in tasklet context but need to disable irqs when
> + * transfering tsq->head because tcp_wfree() might
> + * interrupt us (non NAPI drivers)
> + */
> +static void tcp_tasklet_func(unsigned long data)
> +{
> + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
> + LIST_HEAD(list);
> + unsigned long flags;
> + struct list_head *q, *n;
> + struct tcp_sock *tp;
> + struct sock *sk;
> +
> + local_irq_save(flags);
> + list_splice_init(&tsq->head, &list);
> + local_irq_restore(flags);
> +
> + list_for_each_safe(q, n, &list) {
> + tp = list_entry(q, struct tcp_sock, tsq_node);
> + list_del(&tp->tsq_node);
> +
> + sk = (struct sock *)tp;
> + bh_lock_sock(sk);
> +
> + if (!sock_owned_by_user(sk)) {
> + if ((1 << sk->sk_state) &
> + (TCPF_CLOSE_WAIT | TCPF_ESTABLISHED))
> + tcp_write_xmit(sk,
> + tcp_current_mss(sk),
> + 0, 0,
> + GFP_ATOMIC);
Is this case possible: app does a large send and immediately closes
the socket. then
the queue is throttled and tcp_write_xmit is called back when state is
in TCP_FIN_WAIT1.
I think tcp_write_xmit should continue regardless of the current state
because the
send maybe throttled/delayed but state change is synchronous.
> + } else {
> + /* TODO:
> + * setup a timer, or check TSQ_OWNED in release_sock()
> + */
> + set_bit(TSQ_OWNED, &tp->tsq_flags);
> + }
> + bh_unlock_sock(sk);
> +
> + clear_bit(TSQ_QUEUED, &tp->tsq_flags);
> + sk_free(sk);
> + }
> +}
> +
> +void __init tcp_tasklet_init(void)
> +{
> + int i;
> +
> + for_each_possible_cpu(i) {
> + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
> +
> + INIT_LIST_HEAD(&tsq->head);
> + tasklet_init(&tsq->tasklet,
> + tcp_tasklet_func,
> + (unsigned long)tsq);
> + }
> +}
> +
> +/*
> + * Write buffer destructor automatically called from kfree_skb.
> + * We cant xmit new skbs from this context, as we might already
> + * hold qdisc lock.
> + */
> +void tcp_wfree(struct sk_buff *skb)
> +{
> + struct sock *sk = skb->sk;
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
> + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
> + unsigned long flags;
> + struct tsq_tasklet *tsq;
> +
> + /* Keep a ref on socket.
> + * This last ref will be released in tcp_tasklet_func()
> + */
> + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
> +
> + /* queue this socket to tasklet queue */
> + local_irq_save(flags);
> + tsq = &__get_cpu_var(tsq_tasklet);
> + list_add(&tp->tsq_node, &tsq->head);
> + tasklet_schedule(&tsq->tasklet);
> + local_irq_restore(flags);
> + } else {
> + sock_wfree(skb);
> + }
> +}
> +
> /* This routine actually transmits TCP packets queued in by
> * tcp_do_sendmsg(). This is used by both the initial
> * transmission and possible later retransmissions.
> @@ -844,7 +961,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>
> skb_push(skb, tcp_header_size);
> skb_reset_transport_header(skb);
> - skb_set_owner_w(skb, sk);
> +
> + skb_orphan(skb);
> + skb->sk = sk;
> + skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
> + tcp_wfree : sock_wfree;
> + atomic_add(skb->truesize, &sk->sk_wmem_alloc);
>
> /* Build TCP header and checksum it. */
> th = tcp_hdr(skb);
> @@ -1780,6 +1902,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
> while ((skb = tcp_send_head(sk))) {
> unsigned int limit;
>
> +
> tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
> BUG_ON(!tso_segs);
>
> @@ -1800,6 +1923,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
> break;
> }
>
> + /* TSQ : sk_wmem_alloc accounts skb truesize,
> + * including skb overhead. But thats OK.
> + */
> + if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
> + set_bit(TSQ_THROTTLED, &tp->tsq_flags);
> + break;
> + }
> limit = mss_now;
> if (tso_segs > 1 && !tcp_urg_mode(tp))
> limit = tcp_mss_split_point(sk, skb, mss_now,
>
>
More information about the Codel
mailing list