[Cake] [PATCH net-next] tcp: add tcp_reset_xmit_timer() helper
Dave Taht
dave at taht.net
Tue Oct 23 15:06:34 EDT 2018
I guess I worry that now we have to redo all the flent tests to date.
On everything.
Eric Dumazet <edumazet at google.com> writes:
> With EDT model, SRTT no longer is inflated by pacing delays.
>
> This means that RTO and some other xmit timers might be set up
> incorrectly. This is particularly visible with either:
>
> - Very small enforced pacing rates (SO_MAX_PACING_RATE)
> - Reduced rto (from the default 200 ms)
>
> This can lead to TCP flow aborts in the worst case,
> or to spurious retransmits in other cases.
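
For context, the small-pacing-rate case is easy to reproduce from userspace. Here is a minimal sketch (not part of the patch; the helper name is made up, and the 10000 bytes/sec value only mirrors the netperf "-q 10000" run below, i.e. roughly 80 kbit/s) of requesting such a rate with SO_MAX_PACING_RATE:

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47	/* value from include/uapi/asm-generic/socket.h */
#endif

/* Cap the socket's pacing rate at "rate" bytes per second. */
static int set_max_pacing_rate(int fd, unsigned int rate)
{
	if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
		       &rate, sizeof(rate)) < 0) {
		perror("setsockopt(SO_MAX_PACING_RATE)");
		return -1;
	}
	return 0;	/* e.g. set_max_pacing_rate(fd, 10000) is ~80 kbit/s */
}
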
>
> For example, this session gets far more throughput
> than the requested 80 kbit/s:
>
> $ netperf -H 127.0.0.2 -l 100 -- -q 10000
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.2 () port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
> 540000 262144 262144    104.00       2.66
>
> With the fix:
>
> $ netperf -H 127.0.0.2 -l 100 -- -q 10000
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.2 () port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
> 540000 262144 262144    104.00       0.12
>
> EDT allows for better control of rtx timers, since TCP has
> a better idea of the earliest departure time of each skb
> in the rtx queue. We only have to add to the timer the
> difference between the EDT and the current time, when that
> difference is positive.
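
To make that last point concrete, here is a standalone userspace sketch (not kernel code; HZ, the timestamps and the 40-jiffy base timeout are made-up values, and nsecs_to_jiffies() below is a simplified stand-in for the kernel helper) of the arithmetic the new tcp_pacing_delay()/tcp_reset_xmit_timer() helpers perform:

#include <stdio.h>

#define HZ 1000ULL			/* assumed tick rate, for illustration */
#define NSEC_PER_SEC 1000000000ULL

/* Simplified stand-in for the kernel's nsecs_to_jiffies(). */
static unsigned long nsecs_to_jiffies(long long ns)
{
	return (unsigned long)(ns * HZ / NSEC_PER_SEC);
}

int main(void)
{
	long long edt_ns = 5700000000LL;	/* skb EDT, or tcp_wstamp_ns */
	long long now_ns = 5695000000LL;	/* tcp_clock_cache ("now")   */
	unsigned long timeout = 40;		/* base RTO, in jiffies      */
	long long pacing_delay = edt_ns - now_ns;	/* 5 ms in the future */

	if (pacing_delay > 0)
		timeout += nsecs_to_jiffies(pacing_delay);	/* 40 + 5 = 45 */

	printf("armed timer: %lu jiffies\n", timeout);
	return 0;
}

Without that adjustment the timer would fire at 40 jiffies, i.e. before the skb is even allowed to leave, which is how the spurious retransmits and aborts described above arise.
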
>
> Signed-off-by: Eric Dumazet <edumazet at google.com>
> ---
> include/net/tcp.h     | 30 +++++++++++++++++++++++++++---
> net/ipv4/tcp_input.c  |  8 ++++----
> net/ipv4/tcp_output.c | 18 ++++++++++--------
> 3 files changed, 41 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 8a61c3e8c15dfaf1095490814288c41c13a629f3..a18914d204864e3016fe8121ec7065da6696f9ef 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1245,8 +1245,31 @@ static inline bool tcp_needs_internal_pacing(const struct sock *sk)
> 	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
> }
>
> +/* Return in jiffies the delay before one skb is sent.
> + * If @skb is NULL, we look at EDT for next packet being sent on the socket.
> + */
> +static inline unsigned long tcp_pacing_delay(const struct sock *sk,
> +					     const struct sk_buff *skb)
> +{
> +	s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns;
> +
> +	pacing_delay -= tcp_sk(sk)->tcp_clock_cache;
> +
> +	return pacing_delay > 0 ? nsecs_to_jiffies(pacing_delay) : 0;
> +}
> +
> +static inline void tcp_reset_xmit_timer(struct sock *sk,
> +					const int what,
> +					unsigned long when,
> +					const unsigned long max_when,
> +					const struct sk_buff *skb)
> +{
> +	inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk, skb),
> +				  max_when);
> +}
> +
> /* Something is really bad, we could not queue an additional packet,
> - * because qdisc is full or receiver sent a 0 window.
> + * because qdisc is full or receiver sent a 0 window, or we are paced.
> * We do not want to add fuel to the fire, or abort too early,
> * so make sure the timer we arm now is at least 200ms in the future,
> * regardless of current icsk_rto value (as it could be ~2ms)
> @@ -1268,8 +1291,9 @@ static inline unsigned long tcp_probe0_when(const struct sock *sk,
> static inline void tcp_check_probe_timer(struct sock *sk)
> {
> 	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
> -		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> -					  tcp_probe0_base(sk), TCP_RTO_MAX);
> +		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> +				     tcp_probe0_base(sk), TCP_RTO_MAX,
> +				     NULL);
> }
>
> static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 188980c58f873ffdc81c018dcab0996f603cd7eb..2868ef28ce52179b3c5874e749b680ffbdc0521a 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2979,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk)
> 			 */
> 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
> 		}
> -		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
> -					  TCP_RTO_MAX);
> +		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
> +				     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
> 	}
> }
>
> @@ -3255,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk)
> 	} else {
> 		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
>
> -		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> -					  when, TCP_RTO_MAX);
> +		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> +				     when, TCP_RTO_MAX, NULL);
> 	}
> }
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index c07990a35ff3bd9438d32c82863ef207c93bdb9e..9c34b97d365d719ff76250bc9fe7fa20495a3ed2 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2455,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
> 	if (rto_delta_us > 0)
> 		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
>
> -	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
> -				  TCP_RTO_MAX);
> +	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
> +			     TCP_RTO_MAX, NULL);
> 	return true;
> }
>
> @@ -3020,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
>
> 		if (skb == rtx_head &&
> 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
> -			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
> -						  inet_csk(sk)->icsk_rto,
> -						  TCP_RTO_MAX);
> +			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
> +					     inet_csk(sk)->icsk_rto,
> +					     TCP_RTO_MAX,
> +					     skb);
> 	}
> }
>
> @@ -3752,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
> 		icsk->icsk_probes_out = 1;
> 		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
> 	}
> -	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> -				  tcp_probe0_when(sk, probe_max),
> -				  TCP_RTO_MAX);
> +	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> +			     tcp_probe0_when(sk, probe_max),
> +			     TCP_RTO_MAX,
> +			     NULL);
> }
>
> int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)