[Cake] [RFC PATCH 2/3] Add cake related includes and source files

Fri Nov 17 14:52:55 EST 2017

And now, some comments. Not a lot. But it is a lot easier to read now
without the compatability cruft.

Dave Taht <dave.taht at gmail.com> writes:

> ---
>  include/net/cobalt.h |  132 +++
>  net/sched/cobalt.c   |  258 ++++++
>  net/sched/sch_cake.c | 2206 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 2596 insertions(+)
>  create mode 100644 include/net/cobalt.h
>  create mode 100644 net/sched/cobalt.c
>  create mode 100644 net/sched/sch_cake.c
>
> diff --git a/include/net/cobalt.h b/include/net/cobalt.h
> new file mode 100644
> index 0000000..618128d
> --- /dev/null
> +++ b/include/net/cobalt.h
> @@ -0,0 +1,132 @@
> +#ifndef __NET_SCHED_COBALT_H
> +#define __NET_SCHED_COBALT_H
> +
> +/* COBALT - Codel-BLUE Alternate AQM algorithm.
> + *
> + *  Copyright (C) 2011-2012 Kathleen Nichols <nichols at pollere.com>
> + *  Copyright (C) 2011-2012 Van Jacobson <van at pollere.net>
> + *  Copyright (C) 2012 Eric Dumazet <edumazet at google.com>
> + *  Copyright (C) 2016 Michael D. Täht <dave.taht at gmail.com>
> + *  Copyright (c) 2015-2016 Jonathan Morton <chromatix99 at gmail.com>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions, and the following disclaimer,
> + *    without modification.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. The names of the authors may not be used to endorse or promote products
> + *    derived from this software without specific prior written permission.
> + *
> + * Alternatively, provided that this notice is retained in full, this
> + * software may be distributed under the terms of the GNU General
> + * Public License ("GPL") version 2, in which case the provisions of the
> + * GPL apply INSTEAD OF those given above.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> + * DAMAGE.
> + */

Arguably all the cobalt stuff could revert to one cobalt.h file.

> +
> +#include <linux/version.h>
> +#include <linux/types.h>
> +#include <linux/ktime.h>
> +#include <linux/skbuff.h>
> +#include <net/pkt_sched.h>
> +#include <net/inet_ecn.h>
> +#include <linux/reciprocal_div.h>
> +
> +typedef u64 cobalt_time_t;
> +typedef s64 cobalt_tdiff_t;
> +
> +#define MS2TIME(a) (a * (u64) NSEC_PER_MSEC)
> +#define US2TIME(a) (a * (u64) NSEC_PER_USEC)
> +
> +#define codel_stats_copy_queue(a, b, c, d) gnet_stats_copy_queue(a, b, c, d)
> +#define codel_watchdog_schedule_ns(a, b, c) qdisc_watchdog_schedule_ns(a, b, c)

I think thus stuff can go. Also a review of current fq_codel and tbf/htb
code would be helpful as changes to locking, and per-cpu stats and a lot
else has happened.

> +
> +static inline cobalt_time_t cobalt_get_time(void)
> +{
> +	return ktime_get_ns();
> +}

These days I'm finding the abstraction of time hard to handle and would
rather just call ktime_get_ns directly.

> +
> +static inline u32 cobalt_time_to_us(cobalt_time_t val)
> +{
> +	do_div(val, NSEC_PER_USEC);
> +	return (u32)val;
> +}

Sometimes I think the api could be revised to just pass nanosec.

> +
> +struct cobalt_skb_cb {
> +	cobalt_time_t enqueue_time;
> +};

The new ack stuff could stash a value here for the "ackstate", possibly,
instead of elsewhere.

> +
> +/**
> + * struct cobalt_params - contains codel and blue parameters
> + * @interval:	codel initial drop rate
> + * @target:     maximum persistent sojourn time & blue update rate
> + * @threshold:	tolerance for product of sojourn time and time above target

Threshold is not here anymore. I will look at some older code.

> + * @p_inc:      increment of blue drop probability (0.32 fxp)
> + * @p_dec:      decrement of blue drop probability (0.32 fxp)
> + */
> +struct cobalt_params {
> +	cobalt_time_t	interval;
> +	cobalt_time_t	target;
> +	u32		p_inc;
> +	u32		p_dec;
> +};
> +
> +/**
> + * struct cobalt_vars - contains codel and blue variables
> + * @count:		  dropping frequency
> + * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
> + * @drop_next:    time to drop next packet, or when we dropped last
> + * @drop_count:	  temp count of dropped packets in dequeue()
> + * @ecn_mark:     number of packets we ECN marked instead of dropping
> + * @p_drop:       BLUE drop probability (0.32 fxp)
> + * @dropping:     set if in dropping state
> + */
> +struct cobalt_vars {
> +	u32		count;
> +	u32		rec_inv_sqrt;
> +	cobalt_time_t	drop_next;
> +	cobalt_time_t	blue_timer;
> +	u32     p_drop;
> +	bool	dropping;
> +	bool    ecn_marked;
> +};
> +
> +/* Initialise visible and internal data. */
> +void cobalt_vars_init(struct cobalt_vars *vars);
> +
> +struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb);
> +cobalt_time_t cobalt_get_enqueue_time(const struct sk_buff *skb);
> +
> +/* Call this when a packet had to be dropped due to queue overflow. */
> +bool cobalt_queue_full(struct cobalt_vars *vars, struct cobalt_params *p,
> +		       cobalt_time_t now);
> +
> +/* Call this when the queue was serviced but turned out to be empty. */
> +bool cobalt_queue_empty(struct cobalt_vars *vars, struct cobalt_params *p,
> +			cobalt_time_t now);
> +
> +/* Call this with a freshly dequeued packet for possible congestion marking.
> + * Returns true as an instruction to drop the packet, false for delivery.
> + */
> +bool cobalt_should_drop(struct cobalt_vars *vars,
> +	struct cobalt_params *p,
> +	cobalt_time_t now,
> +	struct sk_buff *skb);
> +
> +#endif
> diff --git a/net/sched/cobalt.c b/net/sched/cobalt.c
> new file mode 100644
> index 0000000..803cfe1
> --- /dev/null
> +++ b/net/sched/cobalt.c
> @@ -0,0 +1,258 @@
> +/* COBALT - Codel-BLUE Alternate AQM algorithm.
> + *
> + *  Copyright (C) 2011-2012 Kathleen Nichols <nichols at pollere.com>
> + *  Copyright (C) 2011-2012 Van Jacobson <van at pollere.net>
> + *  Copyright (C) 2012 Eric Dumazet <edumazet at google.com>
> + *  Copyright (C) 2016-2017 Täht <dave.taht at gmail.com>
> + *  Copyright (c) 2015-2016 Jonathan Morton <chromatix99 at gmail.com>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions, and the following disclaimer,
> + *    without modification.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. The names of the authors may not be used to endorse or promote products
> + *    derived from this software without specific prior written permission.
> + *
> + * Alternatively, provided that this notice is retained in full, this
> + * software may be distributed under the terms of the GNU General
> + * Public License ("GPL") version 2, in which case the provisions of the
> + * GPL apply INSTEAD OF those given above.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> + * DAMAGE.
> + */
> +
> +#include <linux/version.h>
> +#include <linux/types.h>
> +#include <linux/ktime.h>
> +#include <linux/skbuff.h>
> +#include <net/pkt_sched.h>
> +#include <net/inet_ecn.h>
> +#include <linux/reciprocal_div.h>
> +#include <linux/random.h>
> +#include <net/cobalt.h>
> +
> +/* COBALT operates the Codel and BLUE algorithms in parallel, in order
> + * to obtain the best features of each.  Codel is excellent on flows
> + * which respond to congestion signals in a TCP-like way.  BLUE is far
> + * more effective on unresponsive flows.
> + */

Really remains to be proven. In my longer RTT tests and at higher
bandwidths, at least of as this spring, the cobalt branch
of cake was very agressive. I've been trying to get to where we can
actually look at cobalt's true behaviors with the netns stuff I'm doing,
and thus this patch against the net-next kernel.

> +
> +struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
> +{
> +	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
> +	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
> +}
> +
> +cobalt_time_t cobalt_get_enqueue_time(const struct sk_buff *skb)
> +{
> +	return get_cobalt_cb(skb)->enqueue_time;
> +}
> +
> +void cobalt_set_enqueue_time(struct sk_buff *skb, cobalt_time_t now)
> +{
> +	get_cobalt_cb(skb)->enqueue_time = now;
> +}

All these should be inline.

> +
> +#define REC_INV_SQRT_CACHE (16)

Never been proven that this cache either improves cpu utiliziation, nor
that the added accuracy has any effect. I'd like to be able to
completely compile out the cache.

> +static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
> +
> +/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
> + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
> + *
> + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
> + */
> +static void cobalt_newton_step(struct cobalt_vars *vars)
> +{
> +	u32 invsqrt = vars->rec_inv_sqrt;
> +	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
> +	u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
> +
> +	val >>= 2; /* avoid overflow in following multiply */
> +	val = (val * invsqrt) >> (32 - 2 + 1);
> +
> +	vars->rec_inv_sqrt = val;
> +}
> +
> +static void cobalt_invsqrt(struct cobalt_vars *vars)
> +{
> +	if (vars->count < REC_INV_SQRT_CACHE)
> +		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
> +	else
> +		cobalt_newton_step(vars);
> +}
> +
> +static void cobalt_cache_init(void)
> +{
> +	struct cobalt_vars v;
> +
> +	memset(&v, 0, sizeof(v));
> +	v.rec_inv_sqrt = ~0U;
> +	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
> +
> +	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
> +		cobalt_newton_step(&v);
> +		cobalt_newton_step(&v);
> +		cobalt_newton_step(&v);
> +		cobalt_newton_step(&v);
> +
> +		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
> +	}
> +}
> +
> +void cobalt_vars_init(struct cobalt_vars *vars)
> +{
> +	memset(vars, 0, sizeof(*vars));
> +
> +	if (!cobalt_rec_inv_sqrt_cache[0]) {
> +		cobalt_cache_init();
> +		cobalt_rec_inv_sqrt_cache[0] = ~0;
> +	}
> +}
> +
> +/* CoDel control_law is t + interval/sqrt(count)
> + * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
> + * both sqrt() and divide operation.
> + */
> +static cobalt_time_t cobalt_control_law(cobalt_time_t t,
> +					cobalt_time_t interval,
> +					u32 rec_inv_sqrt)
> +{
> +	return t + reciprocal_scale(interval, rec_inv_sqrt);
> +}
> +
> +/* Call this when a packet had to be dropped due to queue overflow.
> + * True if the BLUE state was quiescent before but active after this call.
> + */
> +bool cobalt_queue_full(struct cobalt_vars *vars, struct cobalt_params *p,
> +		       cobalt_time_t now)
> +{
> +	bool up = false;
> +
> +	if ((now - vars->blue_timer) > p->target) {
> +		up = !vars->p_drop;
> +		vars->p_drop += p->p_inc;
> +		if (vars->p_drop < p->p_inc)
> +			vars->p_drop = ~0;
> +		vars->blue_timer = now;
> +	}
> +	vars->dropping = true;
> +	vars->drop_next = now;
> +	if (!vars->count)
> +		vars->count = 1;
> +
> +	return up;
> +}
> +
> +/* Call this when the queue was serviced but turned out to be empty.
> + * True if the BLUE state was active before but quiescent after this call.
> + */
> +bool cobalt_queue_empty(struct cobalt_vars *vars, struct cobalt_params *p,
> +			cobalt_time_t now)
> +{
> +	bool down = false;
> +
> +	if (vars->p_drop && (now - vars->blue_timer) > p->target) {
> +		if (vars->p_drop < p->p_dec)
> +			vars->p_drop = 0;
> +		else
> +			vars->p_drop -= p->p_dec;
> +		vars->blue_timer = now;
> +		down = !vars->p_drop;
> +	}
> +	vars->dropping = false;
> +
> +	if (vars->count && (now - vars->drop_next) >= 0) {
> +		vars->count--;
> +		cobalt_invsqrt(vars);
> +		vars->drop_next = cobalt_control_law(vars->drop_next,
> +						     p->interval,
> +						     vars->rec_inv_sqrt);
> +	}
> +
> +	return down;
> +}
> +
> +/* Call this with a freshly dequeued packet for possible congestion marking.
> + * Returns true as an instruction to drop the packet, false for delivery.
> + */
> +bool cobalt_should_drop(struct cobalt_vars *vars,
> +			struct cobalt_params *p,
> +	cobalt_time_t now,
> +	struct sk_buff *skb)
> +{
> +	bool drop = false;
> +
> +	/* Simplified Codel implementation */
> +	cobalt_tdiff_t sojourn  = now - cobalt_get_enqueue_time(skb);
> +	cobalt_tdiff_t schedule = now - vars->drop_next;
> +	bool over_target = sojourn > p->target;
> +	bool next_due    = vars->count && schedule >= 0;
> +
> +	vars->ecn_marked = false;
> +
> +	if (over_target) {
> +		if (!vars->dropping) {
> +			vars->dropping = true;
> +			vars->drop_next = cobalt_control_law(now,
> +							     p->interval,
> +							     vars->rec_inv_sqrt);
> +		}
> +
> +		if (!vars->count)
> +			vars->count = 1;
> +	} else if (vars->dropping) {
> +		vars->dropping = false;
> +	}
> +
> +	if (next_due && vars->dropping) {
> +		/* Use ECN mark if possible, otherwise drop */
> +		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
> +
> +		vars->count++;
> +		if (!vars->count)
> +			vars->count--;
> +		cobalt_invsqrt(vars);
> +		vars->drop_next = cobalt_control_law(vars->drop_next,
> +						     p->interval,
> +						     vars->rec_inv_sqrt);
> +		schedule = now - vars->drop_next;

I don't understand the schedule concept that well. Kind of goes with not
understanding the overloading of the drop_next field.

> +	} else {
> +		while (next_due) {
> +			vars->count--;
> +			cobalt_invsqrt(vars);
> +			vars->drop_next = cobalt_control_law(vars->drop_next,
> +							     p->interval,
> +							     vars->rec_inv_sqrt);
> +			schedule = now - vars->drop_next;
> +			next_due = vars->count && schedule >= 0;
> +		}
> +	}
> +
> +	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
> +	if (vars->p_drop)
> +		drop |= (prandom_u32() < vars->p_drop);

I think if (!drop && var->p_drop) would be more effecient here.

> +	/* Overload the drop_next field as an activity timeout */
> +	if (!vars->count)
> +		vars->drop_next = now + p->interval;
> +	else if (schedule > 0 && !drop)
> +		vars->drop_next = now;
> +
> +	return drop;
> +}
> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
> new file mode 100644
> index 0000000..afbff6c
> --- /dev/null
> +++ b/net/sched/sch_cake.c
> @@ -0,0 +1,2206 @@
> +/* COMMON Applications Kept Enhanced (CAKE) discipline - version 5
> + *
> + * Copyright (C) 2014-2017 Jonathan Morton <chromatix99 at gmail.com>
> + * Copyright (C) 2015-2017 Toke Høiland-Jørgensen <toke at toke.dk>
> + * Copyright (C) 2014-2017 Dave Täht <dave.taht at gmail.com>
> + * Copyright (C) 2015-2017 Sebastian Moeller <moeller0 at gmx.de>
> + * Copyright (C) 2015-2017 Kevin Darbyshire-Bryant <kevin at darbyshire-bryant.me.uk>
> + * Copyright (C) 2017 Ryan Mounce <ryan at mounce.com.au>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *	notice, this list of conditions, and the following disclaimer,
> + *	without modification.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *	notice, this list of conditions and the following disclaimer in the
> + *	documentation and/or other materials provided with the distribution.
> + * 3. The names of the authors may not be used to endorse or promote products
> + *	derived from this software without specific prior written permission.
> + *
> + * Alternatively, provided that this notice is retained in full, this
> + * software may be distributed under the terms of the GNU General
> + * Public License ("GPL") version 2, in which case the provisions of the
> + * GPL apply INSTEAD OF those given above.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> + * DAMAGE.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/jiffies.h>
> +#include <linux/string.h>
> +#include <linux/in.h>
> +#include <linux/errno.h>
> +#include <linux/init.h>
> +#include <linux/skbuff.h>
> +#include <linux/jhash.h>
> +#include <linux/slab.h>
> +#include <linux/vmalloc.h>
> +#include <linux/reciprocal_div.h>
> +#include <net/netlink.h>
> +#include <linux/version.h>
> +#include <linux/pkt_sched.h>
> +#include <linux/if_vlan.h>
> +#include <net/flow_dissector.h>
> +#include "cobalt.c"
> +
> +#if IS_ENABLED(NF_CONNTRACK)
> +#include <net/netfilter/nf_conntrack_core.h>
> +#include <net/netfilter/nf_conntrack_zones.h>
> +#include <net/netfilter/nf_conntrack.h>
> +#endif
> +
> +/* The CAKE Principles:
> + *	 (or, how to have your cake and eat it too)
> + *
> + * This is a combination of several shaping, AQM and FQ techniques into one
> + * easy-to-use package:
> + *
> + * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
> + *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
> + *   eliminating the need for any sort of burst parameter (eg. token bucket
> + *   depth).  Burst support is limited to that necessary to overcome scheduling
> + *   latency.
> + *
> + * - A Diffserv-aware priority queue, giving more priority to certain classes,
> + *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
> + *   the priority is reduced to avoid starving other tins.
> + *
> + * - Each priority tin has a separate Flow Queue system, to isolate traffic
> + *   flows from each other.  This prevents a burst on one flow from increasing
> + *   the delay to another.  Flows are distributed to queues using a
> + *   set-associative hash function.
> + *
> + * - Each queue is actively managed by Codel.  This serves flows fairly, and
> + *   signals congestion early via ECN (if available) and/or packet drops, to
> + *   keep latency low.  The codel parameters are auto-tuned based on the
> + *   bandwidth setting, as is necessary at low bandwidths.
> + *
> + * The configuration parameters are kept deliberately simple for ease of use.
> + * Everything has sane defaults.  Complete generality of configuration is *not*
> + * a goal.
> + *
> + * The priority queue operates according to a weighted DRR scheme, combined with
> + * a bandwidth tracker which reuses the shaper logic to detect which side of the
> + * bandwidth sharing threshold the tin is operating.  This determines whether a
> + * priority-based weight (high) or a bandwidth-based weight (low) is used for
> + * that tin in the current pass.
> + *
> + * This qdisc incorporates much of Eric Dumazet's fq_codel code, which he kindly
> + * granted us permission to use, which we customised for use as an integrated
> + * subordinate.  See sch_fq_codel.c for details of operation.
> + */
> +
> +#define CAKE_SET_WAYS (8)
> +#define CAKE_MAX_TINS (8)
> +#define CAKE_QUEUES (1024)

One of my issues with the classification schemes is that we end up with
a lot more CAKE_QUEUES than this, and thus we end up not comparing
apples to apples in terms of numbers of queues.

> +#ifndef CAKE_VERSION
> +#define CAKE_VERSION "unknown"
> +#endif
> +static char *cake_version __attribute__((used)) = "Cake version: "
> +		CAKE_VERSION;
> +
> +enum {
> +	CAKE_SET_NONE = 0,
> +	CAKE_SET_SPARSE,
> +	CAKE_SET_SPARSE_WAIT, // counted in SPARSE, actually in BULK
> +	CAKE_SET_BULK,
> +	CAKE_SET_DECAYING
> +};
> +
> +struct cake_flow {
> +	/* this stuff is all needed per-flow at dequeue time */
> +	struct sk_buff	  *head;
> +	struct sk_buff	  *tail;
> +	struct sk_buff	  *ackcheck;
> +	struct list_head  flowchain;
> +	s32		  deficit;
> +	struct cobalt_vars cvars;
> +	u16		  srchost; /* index into cake_host table */
> +	u16		  dsthost;
> +	u8		  set;
> +}; /* please try to keep this structure <= 64 bytes */

Is it?

> +
> +struct cake_host {
> +	u32 srchost_tag;
> +	u32 dsthost_tag;
> +	u16 srchost_refcnt;
> +	u16 dsthost_refcnt;
> +	u32 pad;
> +};
> +
> +struct cake_heap_entry {
> +	u16 t:3, b:10;
> +};
> +
> +struct cake_tin_data {
> +	struct cake_flow flows[CAKE_QUEUES];
> +	u32	backlogs[CAKE_QUEUES];
> +	u32 tags[CAKE_QUEUES];	/* for set association */
> +	u16 overflow_idx[CAKE_QUEUES];
> +	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
> +	u32	perturbation;
> +	u16	flow_quantum;
> +
> +	struct cobalt_params cparams;
> +	u32	drop_overlimit;
> +	u16	bulk_flow_count;
> +	u16	sparse_flow_count;
> +	u16	decaying_flow_count;
> +	u16	unresponsive_flow_count;
> +
> +	u16	max_skblen;
> +
> +	struct list_head new_flows;
> +	struct list_head old_flows;
> +	struct list_head decaying_flows;
> +
> +	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
> +	u64	tin_time_next_packet;
> +	u32	tin_rate_ns;
> +	u32	tin_rate_bps;
> +	u16	tin_rate_shft;
> +
> +	u16	tin_quantum_prio;
> +	u16	tin_quantum_band;
> +	s32	tin_deficit;
> +	u32	tin_backlog;
> +	u32	tin_dropped;
> +	u32	tin_ecn_mark;
> +
> +	u32	packets;
> +	u64	bytes;
> +
> +	u32	ack_drops;
> +
> +	/* moving averages */
> +	cobalt_time_t avge_delay;
> +	cobalt_time_t peak_delay;
> +	cobalt_time_t base_delay;
> +
> +	/* hash function stats */
> +	u32	way_directs;
> +	u32	way_hits;
> +	u32	way_misses;
> +	u32	way_collisions;
> +}; /* number of tins is small, so size of this struct doesn't matter much */
> +
> +struct cake_sched_data {
> +	struct cake_tin_data *tins;
> +
> +	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
> +	u16		overflow_timeout;
> +
> +	u16		tin_cnt;
> +	u8		tin_mode;
> +	u8		flow_mode;
> +
> +	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
> +	u16		rate_shft;
> +	u64		time_next_packet;
> +	u32		rate_ns;
> +	u32		rate_bps;
> +	u16		rate_flags;
> +	s16		rate_overhead;
> +	u16		rate_mpu;
> +	u32		interval;
> +	u32		target;
> +
> +	/* resource tracking */
> +	u32		buffer_used;
> +	u32		buffer_max_used;
> +	u32		buffer_limit;
> +	u32		buffer_config_limit;
> +
> +	/* indices for dequeue */
> +	u16		cur_tin;
> +	u16		cur_flow;
> +
> +	struct qdisc_watchdog watchdog;
> +	const u8	*tin_index;
> +	const u8	*tin_order;
> +
> +	/* bandwidth capacity estimate */
> +	u64		last_packet_time;
> +	u64		avg_packet_interval;
> +	u64		avg_window_begin;
> +	u32		avg_window_bytes;
> +	u32		avg_peak_bandwidth;
> +	u64		last_reconfig_time;
> +};
> +
> +enum {
> +	CAKE_MODE_BESTEFFORT = 1,
> +	CAKE_MODE_PRECEDENCE,
> +	CAKE_MODE_DIFFSERV8,
> +	CAKE_MODE_DIFFSERV4,
> +	CAKE_MODE_LLT,
> +	CAKE_MODE_DIFFSERV3,
> +	CAKE_MODE_MAX
> +};
> +
> +enum {
> +	CAKE_FLAG_ATM = 0x0001,
> +	CAKE_FLAG_PTM = 0x0002,
> +	CAKE_FLAG_AUTORATE_INGRESS = 0x0010,
> +	CAKE_FLAG_INGRESS = 0x0040,
> +	CAKE_FLAG_WASH = 0x0100,
> +	CAKE_FLAG_ACK_FILTER = 0x0200
> +};
> +
> +enum {
> +	CAKE_FLOW_NONE = 0,
> +	CAKE_FLOW_SRC_IP,
> +	CAKE_FLOW_DST_IP,
> +	CAKE_FLOW_HOSTS,    /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */
> +	CAKE_FLOW_FLOWS,
> +	CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */
> +	CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */
> +	CAKE_FLOW_TRIPLE,   /* = CAKE_FLOW_HOSTS  | CAKE_FLOW_FLOWS */
> +	CAKE_FLOW_MAX,
> +	CAKE_FLOW_NAT_FLAG = 64
> +};
> +
> +static u16 quantum_div[CAKE_QUEUES + 1] = {0};
> +
> +/* Diffserv lookup tables */
> +
> +static const u8 precedence[] = {0, 0, 0, 0, 0, 0, 0, 0,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				2, 2, 2, 2, 2, 2, 2, 2,
> +				3, 3, 3, 3, 3, 3, 3, 3,
> +				4, 4, 4, 4, 4, 4, 4, 4,
> +				5, 5, 5, 5, 5, 5, 5, 5,
> +				6, 6, 6, 6, 6, 6, 6, 6,
> +				7, 7, 7, 7, 7, 7, 7, 7,
> +				};
> +
> +static const u8 diffserv_llt[] = {1, 0, 0, 1, 2, 2, 1, 1,
> +				3, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 2, 1, 2, 1,
> +				4, 1, 1, 1, 1, 1, 1, 1,
> +				4, 1, 1, 1, 1, 1, 1, 1,
> +				};
> +
> +static const u8 diffserv8[] = {2, 5, 1, 2, 4, 2, 2, 2,
> +			       0, 2, 1, 2, 1, 2, 1, 2,
> +			       5, 2, 4, 2, 4, 2, 4, 2,
> +				3, 2, 3, 2, 3, 2, 3, 2,
> +				6, 2, 3, 2, 3, 2, 3, 2,
> +				6, 2, 2, 2, 6, 2, 6, 2,
> +				7, 2, 2, 2, 2, 2, 2, 2,
> +				7, 2, 2, 2, 2, 2, 2, 2,
> +				};
> +
> +static const u8 diffserv4[] = {1, 2, 1, 1, 2, 1, 1, 1,
> +			       0, 1, 1, 1, 1, 1, 1, 1,
> +				2, 1, 2, 1, 2, 1, 2, 1,
> +				2, 1, 2, 1, 2, 1, 2, 1,
> +				3, 1, 2, 1, 2, 1, 2, 1,
> +				3, 1, 1, 1, 3, 1, 3, 1,
> +				3, 1, 1, 1, 1, 1, 1, 1,
> +				3, 1, 1, 1, 1, 1, 1, 1,
> +				};
> +
> +static const u8 diffserv3[] = {1, 1, 1, 1, 2, 1, 1, 1,
> +			       0, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 1, 1, 1, 1,
> +				1, 1, 1, 1, 2, 1, 2, 1,
> +				2, 1, 1, 1, 1, 1, 1, 1,
> +				2, 1, 1, 1, 1, 1, 1, 1,
> +				};
> +
> +static const u8 besteffort[] = {0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				0, 0, 0, 0, 0, 0, 0, 0,
> +				};
> +
> +/* tin priority order, ascending */
> +static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
> +static const u8 bulk_order[] = {1, 0, 2, 3};
> +
> +#if IS_ENABLED(NF_CONNTRACK)
> +
> +static inline void cake_update_flowkeys(struct flow_keys *keys,
> +					const struct sk_buff *skb)
> +{
> +	enum ip_conntrack_info ctinfo;
> +	bool rev = false; /* reversed src/dst */
> +
> +	struct nf_conn *ct;
> +	const struct nf_conntrack_tuple *tuple;
> +
> +	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
> +		return;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if (ct) {
> +		tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
> +	} else {
> +		const struct nf_conntrack_tuple_hash *hash;
> +		struct nf_conntrack_tuple srctuple;
> +
> +		if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
> +				       NFPROTO_IPV4,
> +				       dev_net(skb->dev), &srctuple))
> +			return;
> +
> +		hash = nf_conntrack_find_get(dev_net(skb->dev),
> +					     &nf_ct_zone_dflt, &srctuple);
> +		if (!hash)
> +			return;
> +
> +		rev = true;
> +		ct = nf_ct_tuplehash_to_ctrack(hash);
> +		tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
> +	}
> +
> +	keys->addrs.v4addrs.src = (rev ? tuple->dst.u3.ip : tuple->src.u3.ip);
> +	keys->addrs.v4addrs.dst = (rev ? tuple->src.u3.ip : tuple->dst.u3.ip);
> +
> +	if (keys->ports.ports) {
> +		keys->ports.src = (rev ? tuple->dst.u.all : tuple->src.u.all);
> +		keys->ports.dst = (rev ? tuple->src.u.all : tuple->dst.u.all);
> +	}
> +
> +	if (rev)
> +		nf_ct_put(ct);
> +}
> +#else
> +static inline void cake_update_flowkeys(struct flow_keys *keys,
> +					const struct sk_buff *skb)
> +{
> +	/* There is nothing we can do here without CONNTRACK */
> +}
> +#endif
> +
> +static inline u32
> +cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode)
> +{
> +	struct flow_keys keys, host_keys;
> +	u32 flow_hash = 0, srchost_hash, dsthost_hash;
> +	u16 reduced_hash, srchost_idx, dsthost_idx;
> +
> +	if (unlikely(flow_mode == CAKE_FLOW_NONE))
> +		return 0;
> +
> +	skb_flow_dissect_flow_keys(skb, &keys,
> +				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
> +	if (flow_mode & CAKE_FLOW_NAT_FLAG)
> +		cake_update_flowkeys(&keys, skb);
> +
> +	/* flow_hash_from_keys() sorts the addresses by value, so we have
> +	 * to preserve their order in a separate data structure to treat
> +	 * src and dst host addresses as independently selectable.
> +	 */
> +	host_keys = keys;
> +	host_keys.ports.ports     = 0;
> +	host_keys.basic.ip_proto  = 0;
> +	host_keys.keyid.keyid     = 0;
> +	host_keys.tags.flow_label = 0;
> +
> +	switch (host_keys.control.addr_type) {
> +	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
> +		host_keys.addrs.v4addrs.src = 0;
> +		dsthost_hash = flow_hash_from_keys(&host_keys);
> +		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
> +		host_keys.addrs.v4addrs.dst = 0;
> +		srchost_hash = flow_hash_from_keys(&host_keys);
> +		break;
> +
> +	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
> +		memset(&host_keys.addrs.v6addrs.src, 0,
> +		       sizeof(host_keys.addrs.v6addrs.src));
> +		dsthost_hash = flow_hash_from_keys(&host_keys);
> +		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
> +		memset(&host_keys.addrs.v6addrs.dst, 0,
> +		       sizeof(host_keys.addrs.v6addrs.dst));
> +		srchost_hash = flow_hash_from_keys(&host_keys);
> +		break;
> +
> +	default:
> +		dsthost_hash = 0;
> +		srchost_hash = 0;
> +	};
> +
> +	/* This *must* be after the above switch, since as a
> +	 * side-effect it sorts the src and dst addresses.
> +	 */
> +	if (flow_mode & CAKE_FLOW_FLOWS)
> +		flow_hash = flow_hash_from_keys(&keys);
> +
> +	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
> +		if (flow_mode & CAKE_FLOW_SRC_IP)
> +			flow_hash ^= srchost_hash;
> +
> +		if (flow_mode & CAKE_FLOW_DST_IP)
> +			flow_hash ^= dsthost_hash;
> +	}
> +
> +	reduced_hash = flow_hash    % CAKE_QUEUES;
> +	srchost_idx  = srchost_hash % CAKE_QUEUES;
> +	dsthost_idx  = dsthost_hash % CAKE_QUEUES;
> +
> +	/* set-associative hashing */
> +	/* fast path if no hash collision (direct lookup succeeds) */
> +	if (likely(q->tags[reduced_hash] == flow_hash &&
> +		   q->flows[reduced_hash].set)) {
> +		q->way_directs++;
> +	} else {
> +		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
> +		u32 outer_hash = reduced_hash - inner_hash;
> +		u32 i, k;
> +		bool need_allocate_src = false;
> +		bool need_allocate_dst = false;
> +
> +		/* check if any active queue in the set is reserved for
> +		 * this flow.
> +		 */
> +		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
> +		     i++, k = (k + 1) % CAKE_SET_WAYS) {
> +			if (q->tags[outer_hash + k] == flow_hash) {
> +				if (i)
> +					q->way_hits++;
> +
> +				if (!q->flows[outer_hash + k].set) {
> +					/* need to increment host refcnts */
> +					need_allocate_src = true;
> +					need_allocate_dst = true;
> +				}
> +
> +				goto found;
> +			}
> +		}
> +
> +		/* no queue is reserved for this flow, look for an
> +		 * empty one.
> +		 */
> +		for (i = 0; i < CAKE_SET_WAYS;
> +			 i++, k = (k + 1) % CAKE_SET_WAYS) {
> +			if (!q->flows[outer_hash + k].set) {
> +				q->way_misses++;
> +				need_allocate_src = true;
> +				need_allocate_dst = true;
> +				goto found;
> +			}
> +		}
> +
> +		/* With no empty queues, default to the original
> +		 * queue, accept the collision, update the host tags.
> +		 */
> +		q->way_collisions++;
> +		q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
> +		q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
> +		need_allocate_src = true;
> +		need_allocate_dst = true;
> +
> +		/* reserve queue for future packets in same flow */
> +found:		reduced_hash = outer_hash + k;
> +		q->tags[reduced_hash] = flow_hash;
> +
> +		if (need_allocate_src) {
> +			inner_hash = srchost_idx % CAKE_SET_WAYS;
> +			outer_hash = srchost_idx - inner_hash;
> +			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
> +				i++, k = (k + 1) % CAKE_SET_WAYS) {
> +				if (q->hosts[outer_hash + k].srchost_tag ==
> +				    srchost_hash)
> +					goto found_src;
> +			}
> +			for (i = 0; i < CAKE_SET_WAYS;
> +				i++, k = (k + 1) % CAKE_SET_WAYS) {
> +				if (!q->hosts[outer_hash + k].srchost_refcnt)
> +					break;
> +			}
> +			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
> +
> +found_src:		srchost_idx = outer_hash + k;
> +			q->hosts[srchost_idx].srchost_refcnt++;
> +			q->flows[reduced_hash].srchost = srchost_idx;
> +		}
> +
> +		if (need_allocate_dst) {
> +			inner_hash = dsthost_idx % CAKE_SET_WAYS;
> +			outer_hash = dsthost_idx - inner_hash;
> +			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
> +				i++, k = (k + 1) % CAKE_SET_WAYS) {
> +				if (q->hosts[outer_hash + k].dsthost_tag ==
> +				    dsthost_hash)
> +					goto found_dst;
> +			}
> +			for (i = 0; i < CAKE_SET_WAYS;
> +				i++, k = (k + 1) % CAKE_SET_WAYS) {
> +				if (!q->hosts[outer_hash + k].dsthost_refcnt)
> +					break;
> +			}
> +			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
> +
> +found_dst:		dsthost_idx = outer_hash + k;
> +			q->hosts[dsthost_idx].dsthost_refcnt++;
> +			q->flows[reduced_hash].dsthost = dsthost_idx;
> +		}
> +	}
> +
> +	return reduced_hash;
> +}

What makes me happy nowadays is that I actually understand this.

> +
> +/* helper functions : might be changed when/if skb use a standard list_head */
> +/* remove one skb from head of slot queue */
> +
> +static inline struct sk_buff *dequeue_head(struct cake_flow *flow)
> +{
> +	struct sk_buff *skb = flow->head;
> +
> +	if (skb) {
> +		flow->head = skb->next;
> +		skb->next = NULL;
> +
> +		if (skb == flow->ackcheck)
> +			flow->ackcheck = NULL;

Not sure why this is needed here, can't it be checked only if ack_filter
is on, and much later?

> +	}
> +
> +	return skb;
> +}
> +
> +/* add skb to flow queue (tail add) */
> +
> +static inline void
> +flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
> +{
> +	if (!flow->head)
> +		flow->head = skb;
> +	else
> +		flow->tail->next = skb;
> +	flow->tail = skb;
> +	skb->next = NULL;
> +}
> +
> +static struct sk_buff *ack_filter(struct cake_flow *flow, struct sk_buff *skb)
> +{
> +	int seglen;
> +	struct sk_buff *skb_check, *skb_check_prev, *rogue_ack = NULL;
> +	struct iphdr *iph, *iph_check;
> +	struct ipv6hdr *ipv6h, *ipv6h_check;
> +	struct tcphdr *tcph, *tcph_check;
> +
> +	/* no other possible ACKs to filter */
> +	if (flow->head == skb)
> +		return NULL;
> +
> +	iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
> +	ipv6h = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb);

When I tackled this originally, I tried to leverage the existing
extremly robost flow_keys (which we get as a side-effect of the hashing
arch), but didn't get anywhere. I'll have to look into these macros.

> +
> +	/* check that the innermost network header is v4/v6, and contains TCP */
> +	if (iph->version == 4) {
> +		if (iph->protocol != IPPROTO_TCP)
> +			return NULL;
> +		seglen = ntohs(iph->tot_len) - (4 * iph->ihl);
> +		tcph = (struct tcphdr *)((void *)iph + (4 * iph->ihl));
> +	} else if (ipv6h->version == 6) {
> +		if (ipv6h->nexthdr != IPPROTO_TCP)
> +			return NULL;
> +		seglen = ntohs(ipv6h->payload_len);
> +		tcph = (struct tcphdr *)((void *)ipv6h + sizeof(struct ipv6hdr));
> +	} else {
> +		return NULL;
> +	}
> +
> +	/* the 'triggering' packet need only have the ACK flag set.
> +	 * also check that SYN is not set, as there won't be any previous ACKs.
> +	 */
> +	if ((tcp_flag_word(tcph) &
> +		cpu_to_be32(0x00120000)) != TCP_FLAG_ACK)
> +		return NULL;

Not huge on magic numbers.

> +	/* the 'triggering' ACK is at the end of the queue,
> +	 * we have already returned if it is the only packet in the flow.
> +	 * stop before last packet in queue, don't compare trigger ACK to itself
> +	 * start where we finished last time if recorded in ->ackcheck
> +	 * otherwise start from the the head of the flow queue.
> +	 */
> +	skb_check_prev = flow->ackcheck ?: NULL;
> +	skb_check = flow->ackcheck ?: flow->head;

I think something I did must have corrupted this. I'll go fix.

> +
> +	while (skb_check->next) {
> +		/* don't increment if at head of flow queue (_prev == NULL) */
> +		if (skb_check_prev) {
> +			skb_check_prev = skb_check;
> +			skb_check = skb_check->next;
> +			if (!skb_check->next)
> +				break;
> +		} else {
> +			skb_check_prev = skb_check;
> +		}
> +
> +		iph_check = skb_check->encapsulation ?
> +			inner_ip_hdr(skb_check) : ip_hdr(skb_check);
> +		ipv6h_check = skb_check->encapsulation ?
> +			inner_ipv6_hdr(skb_check) : ipv6_hdr(skb_check);
> +
> +		if (iph_check->version == 4) {
> +			if (iph_check->protocol != IPPROTO_TCP)
> +				continue;
> +			seglen = ntohs(iph_check->tot_len) - (4 * iph_check->ihl);
> +			tcph_check = (struct tcphdr *)((void *)iph_check
> +				+ (4 * iph_check->ihl));
> +
> +		} else if (ipv6h_check->version == 6) {
> +			if (ipv6h_check->nexthdr != IPPROTO_TCP)
> +				continue;
> +			seglen = ntohs(ipv6h_check->payload_len);
> +			tcph_check = (struct tcphdr *)((void *)ipv6h_check
> +				+ sizeof(struct ipv6hdr));
> +
> +		} else {
> +			continue;
> +		}
> +
> +		/* stricter criteria apply to ACKs that we may filter
> +		 * 3 reserved flags must be unset to avoid future breakage
> +		 * ECE/CWR/NS can be safely ignored
> +		 * ACK must be set
> +		 * All other flags URG/PSH/RST/SYN/FIN must be unset
> +		 * must be 'pure' ACK, contain zero bytes of segment data
> +		 * options are ignored
> +		 */
> +		if (((tcp_flag_word(tcph_check) &
> +			cpu_to_be32(0x0E3F0000)) != TCP_FLAG_ACK) ||
Magic number
> +		    ((seglen - 4 * tcph_check->doff) != 0)) {
> +			continue;
> +		}
> +
> +		/* if the hosts or ports don't match, we have found a 'rogue'
> +		 * ACK in this flow belonging to a different connection.
> +		 * continue checking for other ACKs this round however
> +		 * restart checking from the 'rogue' next time.
> +		 */
> +		if (tcph_check->source != tcph->source ||
> +		    tcph_check->dest != tcph->dest ||
> +		    (iph_check->version == 4 && iph->version == 4 &&
> +			(iph_check->saddr != iph->saddr ||
> +			 iph_check->daddr != iph->daddr)) ||
> +		    (ipv6h_check->version == 6 && ipv6h->version == 6 &&
> +			(ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
> +			 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)))) {

I think this always checks ipv6, even when we have ipv4.

> +		/* very minor issue: if a 'rogue' ACK is seen at the head of
> +		 * this flow queue it can never be filtered.
> +		 * this is unlikely, and harmless.
> +		 * solveable by assigning this case a sentinel rogue_ack value
> +		 * not worth any extra effort or cpu cycles
> +		 */
> +			if (!rogue_ack && skb_check != flow->head)
> +				rogue_ack = skb_check_prev;
> +			continue;
> +		}
> +
> +		/* new ack sequence must be greater
> +		 * equal DupACKs won't be filtered, would break fast retransmit
> +		 * SACKs won't be filtered as they look like DupACKs
> +		 * they won't be dropped either, safely reverts to unfiltered
> +		 * specific handling and filtering of SACKs is possible
> +		 * this is left as an exercise for the reader :)
> +		 */
> +		if (ntohl(tcph_check->ack_seq) >= ntohl(tcph->ack_seq))
> +			continue;
> +
> +		if (skb_check == flow->head) {
> +			flow->head = skb_check->next;
> +			flow->ackcheck = NULL;
> +		} else {
> +			skb_check_prev->next = skb_check->next;
> +			flow->ackcheck = rogue_ack ?: skb_check_prev;
> +		}
> +
> +		return skb_check;
> +	}
> +
> +	flow->ackcheck = rogue_ack ?: skb_check_prev;
> +	return NULL;
> +}

I like it.

> +static inline u32 cake_overhead(struct cake_sched_data *q, u32 in)
> +{
> +	u32 out = in + q->rate_overhead;
> +
> +	if (q->rate_mpu && out < q->rate_mpu)
> +		out = q->rate_mpu;
> +
> +	if (q->rate_flags & CAKE_FLAG_ATM) {
> +		out += 47;
> +		out /= 48;
> +		out *= 53;
> +	} else if (q->rate_flags & CAKE_FLAG_PTM) {
> +		/* the following adds one byte per 64 bytes or part thereof.
> +		 * this is conservative and easier to calculate than the
> +		 * precise value.
> +		 */
> +		out += (out / 64) + !!(out % 64);

This comment I guess is talking about the estimate, not the cpu
overhead? The cpu overhead is actually worse than ATM.

> +	}
> +
> +	return out;
> +}
> +
> +static inline cobalt_time_t cake_ewma(cobalt_time_t avg, cobalt_time_t sample,
> +				      u32 shift)
> +{
> +	avg -= avg >> shift;
> +	avg += sample >> shift;
> +	return avg;
> +}
> +
> +static inline void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
> +{
> +	struct cake_heap_entry ii = q->overflow_heap[i];
> +	struct cake_heap_entry jj = q->overflow_heap[j];
> +
> +	q->overflow_heap[i] = jj;
> +	q->overflow_heap[j] = ii;
> +
> +	q->tins[ii.t].overflow_idx[ii.b] = j;
> +	q->tins[jj.t].overflow_idx[jj.b] = i;
> +}
> +
> +static inline u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
> +{
> +	struct cake_heap_entry ii = q->overflow_heap[i];
> +
> +	return q->tins[ii.t].backlogs[ii.b];
> +}
> +
> +static void cake_heapify(struct cake_sched_data *q, u16 i)
> +{
> +	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
> +	u32 m = i;
> +	u32 mb = cake_heap_get_backlog(q, m);
> +
> +	while (m < a) {
> +		u32 l = m + m + 1;
> +		u32 r = l + 1;
> +
> +		if (l < a) {
> +			u32 lb = cake_heap_get_backlog(q, l);
> +
> +			if (lb > mb) {
> +				m  = l;
> +				mb = lb;
> +			}
> +		}
> +
> +		if (r < a) {
> +			u32 rb = cake_heap_get_backlog(q, r);
> +
> +			if (rb > mb) {
> +				m  = r;
> +				mb = rb;
> +			}
> +		}
> +
> +		if (m != i) {
> +			cake_heap_swap(q, i, m);
> +			i = m;
> +		} else {
> +			break;
> +		}
> +	}
> +}
> +
> +static void cake_heapify_up(struct cake_sched_data *q, u16 i)
> +{
> +	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
> +		u16 p = (i - 1) >> 1;
> +		u32 ib = cake_heap_get_backlog(q, i);
> +		u32 pb = cake_heap_get_backlog(q, p);
> +
> +		if (ib > pb) {
> +			cake_heap_swap(q, i, p);
> +			i = p;
> +		} else {
> +			break;
> +		}
> +	}
> +}
> +
> +static void cake_advance_shaper(struct cake_sched_data *q,
> +				struct cake_tin_data *b, u32 len, u64 now);
> +
> +static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)

Actually using **to_free sanely (here, in qdisc_reduce_backlog, etc)
would be good. I like very much how fq_codel does bulk dropping now.

I think, but am unsure, that the whole heap concept would just "go",
in that case.

> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb;
> +	u32 idx = 0, tin = 0, len;
> +	struct cake_tin_data *b;
> +	struct cake_flow *flow;
> +	struct cake_heap_entry qq;
> +	u64 now = cobalt_get_time();
> +
> +	if (!q->overflow_timeout) {
> +		int i;
> +		/* Build fresh max-heap */
> +		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
> +			cake_heapify(q, i);
> +	}
> +	q->overflow_timeout = 65535;
> +
> +	/* select longest queue for pruning */
> +	qq  = q->overflow_heap[0];
> +	tin = qq.t;
> +	idx = qq.b;
> +
> +	b = &q->tins[tin];
> +	flow = &b->flows[idx];
> +	skb = dequeue_head(flow);
> +	if (unlikely(!skb)) {
> +		/* heap has gone wrong, rebuild it next time */
> +		q->overflow_timeout = 0;
> +		return idx + (tin << 16);
> +	}
> +
> +	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
> +		b->unresponsive_flow_count++;
> +
> +	len = qdisc_pkt_len(skb);
> +	q->buffer_used      -= skb->truesize;
> +	b->backlogs[idx]    -= len;
> +	b->tin_backlog      -= len;
> +	sch->qstats.backlog -= len;
> +	qdisc_tree_reduce_backlog(sch, 1, len);
> +
> +	b->tin_dropped++;
> +	sch->qstats.drops++;
> +
> +	if (q->rate_flags & CAKE_FLAG_INGRESS)
> +		cake_advance_shaper(q, b, cake_overhead(q, len), now);
> +
> +	__qdisc_drop(skb, to_free);

We drop packets in several different places, several different ways.

> +	sch->q.qlen--;
> +
> +	cake_heapify(q, 0);
> +
> +	return idx + (tin << 16);
> +}
> +
> +static inline void cake_wash_diffserv(struct sk_buff *skb)
> +{
> +	switch (skb->protocol) {
> +	case htons(ETH_P_IP):
> +		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
> +		break;
> +	case htons(ETH_P_IPV6):
> +		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
> +		break;
> +	default:
> +		break;
> +	};
> +}
> +
> +static inline u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
> +{
> +	u8 dscp;
> +
> +	switch (skb->protocol) {
> +	case htons(ETH_P_IP):
> +		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
> +		if (wash && dscp)
> +			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
> +		return dscp;
> +
> +	case htons(ETH_P_IPV6):
> +		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
> +		if (wash && dscp)
> +			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
> +		return dscp;
> +
> +	case htons(ETH_P_ARP):
> +		return 0x38;  /* CS7 - Net Control */

Would this be where we can try to find DSL LLC packets?

> +
> +	default:
> +		/* If there is no Diffserv field, treat as best-effort */
> +		return 0;
> +	};
> +}
> +
> +static void cake_reconfigure(struct Qdisc *sch);
> +
> +static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> +			struct sk_buff **to_free)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	u32 idx, tin;
> +	struct cake_tin_data *b;
> +	struct cake_flow *flow;
> +	/* signed len to handle corner case filtered ACK larger than trigger */
> +	int len = qdisc_pkt_len(skb);
> +	u64 now = cobalt_get_time();
> +	struct sk_buff *skb_filtered_ack = NULL;
> +
> +	/* extract the Diffserv Precedence field, if it exists */
> +	/* and clear DSCP bits if washing */
> +	if (q->tin_mode != CAKE_MODE_BESTEFFORT) {
> +		tin = q->tin_index[cake_handle_diffserv(skb,
> +				q->rate_flags & CAKE_FLAG_WASH)];
> +		if (unlikely(tin >= q->tin_cnt))
> +			tin = 0;
> +	} else {
> +		tin = 0;
> +		if (q->rate_flags & CAKE_FLAG_WASH)
> +			cake_wash_diffserv(skb);
> +	}
> +
> +	b = &q->tins[tin];
> +
> +	/* choose flow to insert into */
> +	idx = cake_hash(b, skb, q->flow_mode);
> +	flow = &b->flows[idx];
> +
> +	/* ensure shaper state isn't stale */
> +	if (!b->tin_backlog) {
> +		if (b->tin_time_next_packet < now)
> +			b->tin_time_next_packet = now;
> +
> +		if (!sch->q.qlen) {
> +			if (q->time_next_packet < now) {
> +				q->time_next_packet = now;
> +			} else if (q->time_next_packet > now) {
> +				sch->qstats.overlimits++;
> +				qdisc_watchdog_schedule_ns(&q->watchdog,
> +							   q->time_next_packet);
> +			}
> +		}
> +	}
> +
> +	if (unlikely(len > b->max_skblen))
> +		b->max_skblen = len;
> +
> +	/* Split GSO aggregates if they're likely to impair flow isolation
> +	 * or if we need to know individual packet sizes for framing overhead.
> +	 */

Long ago, we argued over the need to peel, and we ended up just peeling
always to make sure we had accurate packet counts.

Either we change the comment (preferred!), or we go back to
experimenting with not peeling below a quantum except with ack mode or
framing enabled.

> +	if (skb_is_gso(skb)) {
> +		struct sk_buff *segs, *nskb;
> +		netdev_features_t features = netif_skb_features(skb);
> +		/* signed slen to handle corner case
> +		 * suppressed ACK larger than trigger
> +		 */
> +		int slen = 0;
> +
> +		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
> +
> +		if (IS_ERR_OR_NULL(segs))
> +			return qdisc_drop(skb, sch, to_free);
> +
> +		while (segs) {
> +			nskb = segs->next;
> +			segs->next = NULL;
> +			qdisc_skb_cb(segs)->pkt_len = segs->len;
> +			cobalt_set_enqueue_time(segs, now);
> +			flow_queue_add(flow, segs);
> +
> +			if (q->rate_flags & CAKE_FLAG_ACK_FILTER)
> +				skb_filtered_ack = ack_filter(flow, segs);
> +			if (skb_filtered_ack) {
> +				b->ack_drops++;
> +				slen += segs->len - skb_filtered_ack->len;
> +				q->buffer_used += segs->truesize
> +					- skb_filtered_ack->truesize;
> +				qdisc_tree_reduce_backlog(sch, 1,
> +							  qdisc_pkt_len(skb_filtered_ack));

We should try to do this in bulk, a bit later.

> +				consume_skb(skb_filtered_ack);
> +			} else {
> +				sch->q.qlen++;
> +				b->packets++;
> +				slen += segs->len;
> +				q->buffer_used += segs->truesize;
> +			}
> +			segs = nskb;
> +		}
> +		/* stats */
> +		b->bytes	    += slen;
> +		b->backlogs[idx]    += slen;
> +		b->tin_backlog      += slen;
> +		sch->qstats.backlog += slen;
> +		q->avg_window_bytes += slen;
> +
> +		qdisc_tree_reduce_backlog(sch, 1, len);

And I worry that we may be doing it twice

> +		consume_skb(skb);
> +	} else {
> +		/* not splitting */
> +		cobalt_set_enqueue_time(skb, now);
> +		flow_queue_add(flow, skb);
> +
> +		if (q->rate_flags & CAKE_FLAG_ACK_FILTER)
> +			skb_filtered_ack = ack_filter(flow, skb);
> +		if (skb_filtered_ack) {
> +			b->ack_drops++;
> +			len -= qdisc_pkt_len(skb_filtered_ack);
> +			q->buffer_used += skb->truesize
> +				- skb_filtered_ack->truesize;
> +			qdisc_tree_reduce_backlog(sch, 1,
> +						  qdisc_pkt_len(skb_filtered_ack));
> +			consume_skb(skb_filtered_ack);
> +		} else {
> +			sch->q.qlen++;
> +			b->packets++;
> +			q->buffer_used      += skb->truesize;
> +		}
> +		/* stats */
> +		b->bytes	    += len;
> +		b->backlogs[idx]    += len;
> +		b->tin_backlog      += len;
> +		sch->qstats.backlog += len;
> +		q->avg_window_bytes += len;
> +	}
> +
> +	if (q->overflow_timeout)
> +		cake_heapify_up(q, b->overflow_idx[idx]);
> +
> +	/* incoming bandwidth capacity estimate */
> +	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)	{
> +		u64 packet_interval = now - q->last_packet_time;
> +
> +		if (packet_interval > NSEC_PER_SEC)
> +			packet_interval = NSEC_PER_SEC;

I don't see how smoothing this interval to 1sec can possibly work.

> +
> +		/* filter out short-term bursts, eg. wifi aggregation */
> +		q->avg_packet_interval = cake_ewma(q->avg_packet_interval,
> +						   packet_interval,
> +			packet_interval > q->avg_packet_interval ? 2 : 8);
> +
> +		q->last_packet_time = now;
> +
> +		if (packet_interval > q->avg_packet_interval) {
> +			u64 window_interval = now - q->avg_window_begin;
> +			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
> +
> +			do_div(b, window_interval);
> +			q->avg_peak_bandwidth =
> +				cake_ewma(q->avg_peak_bandwidth, b,
> +					  b > q->avg_peak_bandwidth ? 2 : 8);
> +			q->avg_window_bytes = 0;
> +			q->avg_window_begin = now;
> +
> +			if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS &&
> +			    now - q->last_reconfig_time >
> +				(NSEC_PER_SEC / 4)) {
> +				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
> +				cake_reconfigure(sch);
> +			}
> +		}
> +	} else {
> +		q->avg_window_bytes = 0;
> +		q->last_packet_time = now;
> +	}
> +
> +	/* flowchain */
> +	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
> +		struct cake_host *srchost = &b->hosts[flow->srchost];
> +		struct cake_host *dsthost = &b->hosts[flow->dsthost];
> +		u16 host_load = 1;
> +
> +		if (!flow->set) {
> +			list_add_tail(&flow->flowchain, &b->new_flows);
> +		} else {
> +			b->decaying_flow_count--;
> +			list_move_tail(&flow->flowchain, &b->new_flows);
> +		}
> +		flow->set = CAKE_SET_SPARSE;
> +		b->sparse_flow_count++;
> +
> +		if ((q->flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC)
> +			host_load = max(host_load, srchost->srchost_refcnt);
> +
> +		if ((q->flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST)
> +			host_load = max(host_load, dsthost->dsthost_refcnt);
> +
> +		flow->deficit = (b->flow_quantum *
> +				 quantum_div[host_load]) >> 16;
> +
> +	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
> +		/* this flow was empty, accounted as a sparse flow, but actually
> +		 * in the bulk rotation
> +		 */
> +		flow->set = CAKE_SET_BULK;
> +		b->sparse_flow_count--;
> +		b->bulk_flow_count++;
> +	}
> +
> +	if (q->buffer_used > q->buffer_max_used)
> +		q->buffer_max_used = q->buffer_used;
> +
> +	if (q->buffer_used > q->buffer_limit) {
> +		u32 dropped = 0;
> +
> +		while (q->buffer_used > q->buffer_limit) {
> +			dropped++;
> +			cake_drop(sch, to_free);
> +		}
> +		b->drop_overlimit += dropped;
> +	}
> +	return NET_XMIT_SUCCESS;
> +}
> +
> +static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct cake_tin_data *b = &q->tins[q->cur_tin];
> +	struct cake_flow *flow = &b->flows[q->cur_flow];
> +	struct sk_buff *skb = NULL;
> +	u32 len;
> +
> +	if (flow->head) {
> +		skb = dequeue_head(flow);
> +		len = qdisc_pkt_len(skb);
> +		b->backlogs[q->cur_flow] -= len;
> +		b->tin_backlog		 -= len;
> +		sch->qstats.backlog      -= len;
> +		q->buffer_used		 -= skb->truesize;
> +		sch->q.qlen--;
> +
> +		if (q->overflow_timeout)
> +			cake_heapify(q, b->overflow_idx[q->cur_flow]);
> +	}
> +	return skb;
> +}
> +
> +/* Discard leftover packets from a tin no longer in use. */
> +static void cake_clear_tin(struct Qdisc *sch, u16 tin)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb;
> +
> +	q->cur_tin = tin;
> +	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
> +		while (!!(skb = cake_dequeue_one(sch)))
> +			kfree_skb(skb);
> +}
> +
> +static struct sk_buff *cake_dequeue(struct Qdisc *sch)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb;
> +	struct cake_tin_data *b = &q->tins[q->cur_tin];
> +	struct cake_flow *flow;
> +	struct cake_host *srchost, *dsthost;
> +	struct list_head *head;
> +	u32 len;
> +	u16 host_load;
> +	cobalt_time_t now = ktime_get_ns();
> +	cobalt_time_t delay;
> +	bool first_flow = true;
> +
> +begin:
> +	if (!sch->q.qlen)
> +		return NULL;
> +
> +	/* global hard shaper */
> +	if (q->time_next_packet > now) {
> +		sch->qstats.overlimits++;
> +		qdisc_watchdog_schedule_ns(&q->watchdog, q->time_next_packet);
> +		return NULL;
> +	}
> +
> +	/* Choose a class to work on. */
> +	if (!q->rate_ns) {
> +		/* in unlimited mode, can't rely on shaper timings, just balance
> +		 * with DRR
> +		 */
> +		while (b->tin_deficit < 0 ||
> +		       !(b->sparse_flow_count + b->bulk_flow_count)) {
> +			if (b->tin_deficit <= 0)
> +				b->tin_deficit += b->tin_quantum_band;
> +
> +			q->cur_tin++;
> +			b++;
> +			if (q->cur_tin >= q->tin_cnt) {
> +				q->cur_tin = 0;
> +				b = q->tins;
> +			}
> +		}
> +	} else {
> +		/* in shaped mode, choose:
> +		 * - highest-priority tin with queue meeting schedule, if any
> +		 * - earliest-scheduled tin with queue, otherwise
> +		 */
> +		int oi, best_tin = 0;
> +		s64 best_time = 0xFFFFFFFFFFFFUL;
> +
> +		for (oi = 0; oi < q->tin_cnt; oi++) {
> +			int tin = q->tin_order[oi];
> +
> +			b = q->tins + tin;
> +			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
> +				s64 tdiff = b->tin_time_next_packet - now;
> +
> +				if (tdiff <= 0 || tdiff <= best_time) {
> +					best_time = tdiff;
> +					best_tin = tin;
> +				}
> +			}
> +		}
> +
> +		q->cur_tin = best_tin;
> +		b = q->tins + best_tin;
> +	}
> +
> +retry:
> +	/* service this class */
> +	head = &b->decaying_flows;
> +	if (!first_flow || list_empty(head)) {
> +		head = &b->new_flows;
> +		if (list_empty(head)) {
> +			head = &b->old_flows;
> +			if (unlikely(list_empty(head))) {
> +				head = &b->decaying_flows;
> +				if (unlikely(list_empty(head)))
> +					goto begin;
> +			}
> +		}
> +	}
> +	flow = list_first_entry(head, struct cake_flow, flowchain);
> +	q->cur_flow = flow - b->flows;
> +	first_flow = false;
> +
> +	/* triple isolation (modified DRR++) */
> +	srchost = &b->hosts[flow->srchost];
> +	dsthost = &b->hosts[flow->dsthost];
> +	host_load = 1;
> +
> +	if ((q->flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC)
> +		host_load = max(host_load, srchost->srchost_refcnt);
> +
> +	if ((q->flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST)
> +		host_load = max(host_load, dsthost->dsthost_refcnt);
> +
> +	WARN_ON(host_load > CAKE_QUEUES);
> +
> +	/* flow isolation (DRR++) */
> +	if (flow->deficit <= 0) {
> +		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
> +				  (prandom_u32() >> 16)) >> 16;

So that's the magic. Boy. Just this little bit, I have to go an think
about that hard. A comment about how and why this works would be useful.

> +		list_move_tail(&flow->flowchain, &b->old_flows);
> +
> +		/* here we keep all flows with deficits out of the sparse and
> +		 * decaying rotations no non-empty flow can go into the decaying
> +		 * rotation, so they can't get deficits
> +		 */
> +		if (flow->set == CAKE_SET_SPARSE) {
> +			if (flow->head) {
> +				b->sparse_flow_count--;
> +				b->bulk_flow_count++;
> +				flow->set = CAKE_SET_BULK;
> +			} else {
> +				/* we've moved it to the bulk rotation for
> +				 * correct deficit accounting but we still want
> +				 * to count it as a sparse flow, not a bulk one
> +				 */
> +				flow->set = CAKE_SET_SPARSE_WAIT;
> +			}
> +		}
> +		goto retry;
> +	}
> +
> +	/* Retrieve a packet via the AQM */
> +	while (1) {
> +		skb = cake_dequeue_one(sch);
> +		if (!skb) {
> +			/* this queue was actually empty */
> +			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
> +				b->unresponsive_flow_count--;
> +
> +			if (flow->cvars.p_drop || flow->cvars.count ||
> +			    (now - flow->cvars.drop_next) < 0) {

Of these, btw, I think flow->cvars.count would hit first.

> +				/* keep in the flowchain until the state has
> +				 * decayed to rest
> +				 */
> +				list_move_tail(&flow->flowchain,
> +					       &b->decaying_flows);
> +				if (flow->set == CAKE_SET_BULK) {
> +					b->bulk_flow_count--;
> +					b->decaying_flow_count++;
> +				} else if (flow->set == CAKE_SET_SPARSE ||
> +					flow->set == CAKE_SET_SPARSE_WAIT) {
> +					b->sparse_flow_count--;
> +					b->decaying_flow_count++;
> +				}
> +				flow->set = CAKE_SET_DECAYING;
> +			} else {
> +				/* remove empty queue from the flowchain */
> +				list_del_init(&flow->flowchain);
> +				if (flow->set == CAKE_SET_SPARSE ||
> +				    flow->set == CAKE_SET_SPARSE_WAIT)
> +					b->sparse_flow_count--;
> +				else if (flow->set == CAKE_SET_BULK)
> +					b->bulk_flow_count--;
> +				else
> +					b->decaying_flow_count--;
> +
> +				flow->set = CAKE_SET_NONE;
> +				srchost->srchost_refcnt--;
> +				dsthost->dsthost_refcnt--;
> +			}
> +			goto begin;
> +		}
> +
> +		/* Last packet in queue may be marked, shouldn't be dropped */
> +		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb) ||
> +		    !flow->head)
> +			break;
> +
> +		/* drop this packet, get another one */
> +		if (q->rate_flags & CAKE_FLAG_INGRESS) {
> +			len = cake_overhead(q, qdisc_pkt_len(skb));
> +			cake_advance_shaper(q, b, len, now);
> +			flow->deficit -= len;
> +			b->tin_deficit -= len;
> +		}
> +		b->tin_dropped++;
> +		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
> +		qdisc_qstats_drop(sch);
> +		kfree_skb(skb);
> +		if (q->rate_flags & CAKE_FLAG_INGRESS)
> +			goto retry;
> +	}
> +
> +	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
> +	qdisc_bstats_update(sch, skb);
> +
> +	len = cake_overhead(q, qdisc_pkt_len(skb));
> +	flow->deficit -= len;
> +	b->tin_deficit -= len;
> +
> +	/* collect delay stats */
> +	delay = now - cobalt_get_enqueue_time(skb);
> +	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
> +	b->peak_delay = cake_ewma(b->peak_delay, delay,
> +				  delay > b->peak_delay ? 2 : 8);
> +	b->base_delay = cake_ewma(b->base_delay, delay,
> +				  delay < b->base_delay ? 2 : 8);
> +
> +	cake_advance_shaper(q, b, len, now);
> +	if (q->time_next_packet > now && sch->q.qlen) {
> +		qdisc_watchdog_schedule_ns(&q->watchdog, q->time_next_packet);
> +	} else if (!sch->q.qlen) {
> +		int i;
> +
> +		for (i = 0; i < q->tin_cnt; i++) {
> +			if (q->tins[i].decaying_flow_count) {
> +				qdisc_watchdog_schedule_ns(&q->watchdog,
> +							   now +
> +							   q->tins[i].cparams.target);
> +				break;
> +			}
> +		}
> +	}
> +
> +	if (q->overflow_timeout)
> +		q->overflow_timeout--;
> +
> +	return skb;
> +}
> +
> +static void cake_advance_shaper(struct cake_sched_data *q,
> +				struct cake_tin_data *b, u32 len, u64 now)
> +{
> +	/* charge packet bandwidth to this tin, lower tins,
> +	 * and to the global shaper.
> +	 */
> +	if (q->rate_ns) {
> +		s64 tdiff1 = b->tin_time_next_packet - now;
> +		s64 tdiff2 = (len * (u64)b->tin_rate_ns) >> b->tin_rate_shft;
> +		s64 tdiff3 = (len * (u64)q->rate_ns) >> q->rate_shft;
> +
> +		if (tdiff1 < 0)
> +			b->tin_time_next_packet += tdiff2;
> +		else if (tdiff1 < tdiff2)
> +			b->tin_time_next_packet = now + tdiff2;
> +
> +		q->time_next_packet += tdiff3;
> +	}
> +}
> +
> +static void cake_reset(struct Qdisc *sch)
> +{
> +	u32 c;
> +
> +	for (c = 0; c < CAKE_MAX_TINS; c++)
> +		cake_clear_tin(sch, c);
> +}
> +
> +static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
> +	[TCA_CAKE_BASE_RATE]     = { .type = NLA_U32 },
> +	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
> +	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
> +	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
> +	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
> +	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
> +	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
> +	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
> +	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
> +	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
> +	[TCA_CAKE_ETHERNET]      = { .type = NLA_U32 },
> +	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
> +	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
> +	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
> +	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
> +};
> +
> +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
> +			  cobalt_time_t ns_target, cobalt_time_t rtt_est_ns)
> +{
> +	/* convert byte-rate into time-per-byte
> +	 * so it will always unwedge in reasonable time.
> +	 */
> +	static const u64 MIN_RATE = 64;
> +	u64 rate_ns = 0;
> +	u8  rate_shft = 0;
> +	cobalt_time_t byte_target_ns;
> +	u32 byte_target = mtu + (mtu >> 1);
> +
> +	b->flow_quantum = 1514;
> +	if (rate) {
> +		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
> +		rate_shft = 32;
> +		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
> +		do_div(rate_ns, max(MIN_RATE, rate));
> +		while (!!(rate_ns >> 32)) {
> +			rate_ns >>= 1;
> +			rate_shft--;
> +		}
> +	} /* else unlimited, ie. zero delay */
> +
> +	b->tin_rate_bps  = rate;
> +	b->tin_rate_ns   = rate_ns;
> +	b->tin_rate_shft = rate_shft;
> +
> +	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
> +
> +	b->cparams.target = max(byte_target_ns, ns_target);
> +	b->cparams.interval = max(rtt_est_ns +
> +				     b->cparams.target - ns_target,
> +				     b->cparams.target * 2);
> +	b->cparams.p_inc = 1 << 24; /* 1/256 */
> +	b->cparams.p_dec = 1 << 20; /* 1/4096 */
> +}
> +
> +static int cake_config_besteffort(struct Qdisc *sch)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct cake_tin_data *b = &q->tins[0];
> +	u32 rate = q->rate_bps;
> +	u32 mtu = psched_mtu(qdisc_dev(sch));
> +
> +	q->tin_cnt = 1;
> +
> +	q->tin_index = besteffort;
> +	q->tin_order = normal_order;
> +
> +	cake_set_rate(b, rate, mtu, US2TIME(q->target), US2TIME(q->interval));
> +	b->tin_quantum_band = 65535;
> +	b->tin_quantum_prio = 65535;
> +
> +	return 0;
> +}
> +
> +static int cake_config_precedence(struct Qdisc *sch)
> +{
> +	/* convert high-level (user visible) parameters into internal format */
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	u32 rate = q->rate_bps;
> +	u32 mtu = psched_mtu(qdisc_dev(sch));
> +	u32 quantum1 = 256;
> +	u32 quantum2 = 256;

magic numbers

> +	u32 i;
> +
> +	q->tin_cnt = 8;
> +	q->tin_index = precedence;
> +	q->tin_order = normal_order;
> +
> +	for (i = 0; i < q->tin_cnt; i++) {
> +		struct cake_tin_data *b = &q->tins[i];
> +
> +		cake_set_rate(b, rate, mtu, US2TIME(q->target),
> +			      US2TIME(q->interval));
> +
> +		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
> +		b->tin_quantum_band = max_t(u16, 1U, quantum2);
> +
> +		/* calculate next class's parameters */
> +		rate  *= 7;
> +		rate >>= 3;
> +
> +		quantum1  *= 3;
> +		quantum1 >>= 1;
> +
> +		quantum2  *= 7;
> +		quantum2 >>= 3;
> +	}
> +
> +	return 0;
> +}
> +
> +/*	List of known Diffserv codepoints:
> + *
> + *	Least Effort (CS1)
> + *	Best Effort (CS0)
> + *	Max Reliability & LLT "Lo" (TOS1)
> + *	Max Throughput (TOS2)
> + *	Min Delay (TOS4)
> + *  LLT "La" (TOS5)
> + *	Assured Forwarding 1 (AF1x) - x3
> + *	Assured Forwarding 2 (AF2x) - x3
> + *	Assured Forwarding 3 (AF3x) - x3
> + *	Assured Forwarding 4 (AF4x) - x3
> + *	Precedence Class 2 (CS2)
> + *	Precedence Class 3 (CS3)
> + *	Precedence Class 4 (CS4)
> + *	Precedence Class 5 (CS5)
> + *	Precedence Class 6 (CS6)
> + *	Precedence Class 7 (CS7)
> + *	Voice Admit (VA)
> + *	Expedited Forwarding (EF)
> +
> + *	Total 25 codepoints.
> + */
> +
> +/*	List of traffic classes in RFC 4594:
> + *		(roughly descending order of contended priority)
> + *		(roughly ascending order of uncontended throughput)
> + *
> + *	Network Control (CS6,CS7)      - routing traffic
> + *	Telephony (EF,VA)         - aka. VoIP streams
> + *	Signalling (CS5)               - VoIP setup
> + *	Multimedia Conferencing (AF4x) - aka. video calls
> + *	Realtime Interactive (CS4)     - eg. games
> + *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
> + *	Broadcast Video (CS3)
> + *	Low Latency Data (AF2x,TOS4)      - eg. database
> + *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
> + *	Standard Service (CS0 & unrecognised codepoints)
> + *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
> + *	Low Priority Data (CS1)           - eg. BitTorrent
> +
> + *	Total 12 traffic classes.
> + */
> +
> +static int cake_config_diffserv8(struct Qdisc *sch)
> +{
> +/*	Pruned list of traffic classes for typical applications:
> + *
> + *		Network Control          (CS6, CS7)
> + *		Minimum Latency          (EF, VA, CS5, CS4)
> + *		Interactive Shell        (CS2, TOS1)
> + *		Low Latency Transactions (AF2x, TOS4)
> + *		Video Streaming          (AF4x, AF3x, CS3)
> + *		Bog Standard             (CS0 etc.)
> + *		High Throughput          (AF1x, TOS2)
> + *		Background Traffic       (CS1)
> + *
> + *		Total 8 traffic classes.
> + */
> +
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	u32 rate = q->rate_bps;
> +	u32 mtu = psched_mtu(qdisc_dev(sch));
> +	u32 quantum1 = 256;
> +	u32 quantum2 = 256;
> +	u32 i;
> +
> +	q->tin_cnt = 8;
> +
> +	/* codepoint to class mapping */
> +	q->tin_index = diffserv8;
> +	q->tin_order = normal_order;
> +
> +	/* class characteristics */
> +	for (i = 0; i < q->tin_cnt; i++) {
> +		struct cake_tin_data *b = &q->tins[i];
> +
> +		cake_set_rate(b, rate, mtu, US2TIME(q->target),
> +			      US2TIME(q->interval));
> +
> +		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
> +		b->tin_quantum_band = max_t(u16, 1U, quantum2);
> +
> +		/* calculate next class's parameters */
> +		rate  *= 7;
> +		rate >>= 3;
> +
> +		quantum1  *= 3;
> +		quantum1 >>= 1;
> +
> +		quantum2  *= 7;
> +		quantum2 >>= 3;
> +	}
> +
> +	return 0;
> +}
> +
> +static int cake_config_diffserv4(struct Qdisc *sch)
> +{
> +/*  Further pruned list of traffic classes for four-class system:
> + *
> + *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
> + *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
> + *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
> + *	    Background Traffic (CS1)
> + *
> + *		Total 4 traffic classes.
> + */
> +
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	u32 rate = q->rate_bps;
> +	u32 mtu = psched_mtu(qdisc_dev(sch));
> +	u32 quantum = 1024;
> +
> +	q->tin_cnt = 4;
> +
> +	/* codepoint to class mapping */
> +	q->tin_index = diffserv4;
> +	q->tin_order = bulk_order;
> +
> +	/* class characteristics */
> +	cake_set_rate(&q->tins[0], rate >> 4, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[1], rate, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[2], rate >> 1, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[3], rate >> 2, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +
> +	/* priority weights */
> +	q->tins[0].tin_quantum_prio = quantum >> 4;
> +	q->tins[1].tin_quantum_prio = quantum;
> +	q->tins[2].tin_quantum_prio = quantum << 2;
> +	q->tins[3].tin_quantum_prio = quantum << 4;
> +
> +	/* bandwidth-sharing weights */
> +	q->tins[0].tin_quantum_band = quantum >> 4;
> +	q->tins[1].tin_quantum_band = quantum;
> +	q->tins[2].tin_quantum_band = quantum >> 1;
> +	q->tins[3].tin_quantum_band = quantum >> 2;
> +
> +	/* tin 0 is not 100% rate, but tin 1 is */
> +	return 1;
> +}
> +
> +static int cake_config_diffserv3(struct Qdisc *sch)
> +{
> +/*  Simplified Diffserv structure with 3 tins.
> + *		Low Priority		(CS1)
> + *		Best Effort
> + *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
> + */
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	u32 rate = q->rate_bps;
> +	u32 mtu = psched_mtu(qdisc_dev(sch));
> +	u32 quantum = 1024;
> +
> +	q->tin_cnt = 3;
> +
> +	/* codepoint to class mapping */
> +	q->tin_index = diffserv3;
> +	q->tin_order = bulk_order;
> +
> +	/* class characteristics */
> +	cake_set_rate(&q->tins[0], rate >> 4, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[1], rate, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[2], rate >> 2, mtu,
> +		      US2TIME(q->target), US2TIME(q->target));
> +
> +	/* priority weights */
> +	q->tins[0].tin_quantum_prio = quantum >> 4;
> +	q->tins[1].tin_quantum_prio = quantum;
> +	q->tins[2].tin_quantum_prio = quantum << 4;
> +
> +	/* bandwidth-sharing weights */
> +	q->tins[0].tin_quantum_band = quantum >> 4;
> +	q->tins[1].tin_quantum_band = quantum;
> +	q->tins[2].tin_quantum_band = quantum >> 2;
> +
> +	/* tin 0 is not 100% rate, but tin 1 is */
> +	return 1;
> +}
> +
> +static int cake_config_diffserv_llt(struct Qdisc *sch)
> +{
> +/*  Diffserv structure specialised for Latency-Loss-Tradeoff spec.
> + *		Loss Sensitive		(TOS1, TOS2)
> + *		Best Effort
> + *		Latency Sensitive	(TOS4, TOS5, VA, EF)
> + *		Low Priority		(CS1)
> + *		Network Control		(CS6, CS7)
> + */
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	u32 rate = q->rate_bps;
> +	u32 mtu = psched_mtu(qdisc_dev(sch));
> +
> +	q->tin_cnt = 5;
> +
> +	/* codepoint to class mapping */
> +	q->tin_index = diffserv_llt;
> +	q->tin_order = normal_order;
> +
> +	/* class characteristics */
> +	cake_set_rate(&q->tins[5], rate, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +
> +	cake_set_rate(&q->tins[0], rate / 3, mtu,
> +		      US2TIME(q->target * 4), US2TIME(q->interval * 4));
> +	cake_set_rate(&q->tins[1], rate / 3, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[2], rate / 3, mtu,
> +		      US2TIME(q->target), US2TIME(q->target));
> +	cake_set_rate(&q->tins[3], rate >> 4, mtu,
> +		      US2TIME(q->target), US2TIME(q->interval));
> +	cake_set_rate(&q->tins[4], rate >> 4, mtu,
> +		      US2TIME(q->target * 4), US2TIME(q->interval * 4));
> +
> +	/* priority weights */
> +	q->tins[0].tin_quantum_prio = 2048;
> +	q->tins[1].tin_quantum_prio = 2048;
> +	q->tins[2].tin_quantum_prio = 2048;
> +	q->tins[3].tin_quantum_prio = 16384;
> +	q->tins[4].tin_quantum_prio = 32768;
> +
> +	/* bandwidth-sharing weights */
> +	q->tins[0].tin_quantum_band = 2048;
> +	q->tins[1].tin_quantum_band = 2048;
> +	q->tins[2].tin_quantum_band = 2048;
> +	q->tins[3].tin_quantum_band = 256;
> +	q->tins[4].tin_quantum_band = 16;
> +
> +	return 5;
> +}
> +
> +static void cake_reconfigure(struct Qdisc *sch)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	int c, ft;
> +
> +	switch (q->tin_mode) {
> +	case CAKE_MODE_BESTEFFORT:
> +		ft = cake_config_besteffort(sch);
> +		break;
> +
> +	case CAKE_MODE_PRECEDENCE:
> +		ft = cake_config_precedence(sch);
> +		break;
> +
> +	case CAKE_MODE_DIFFSERV8:
> +		ft = cake_config_diffserv8(sch);
> +		break;
> +
> +	case CAKE_MODE_DIFFSERV4:
> +		ft = cake_config_diffserv4(sch);
> +		break;
> +
> +	case CAKE_MODE_LLT:
> +		ft = cake_config_diffserv_llt(sch);
> +		break;
> +
> +	case CAKE_MODE_DIFFSERV3:
> +	default:
> +		ft = cake_config_diffserv3(sch);
> +		break;
> +	};
> +
> +	BUG_ON(q->tin_cnt > CAKE_MAX_TINS);
> +	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++)
> +		cake_clear_tin(sch, c);
> +
> +	q->rate_ns   = q->tins[ft].tin_rate_ns;
> +	q->rate_shft = q->tins[ft].tin_rate_shft;
> +
> +	if (q->buffer_config_limit) {
> +		q->buffer_limit = q->buffer_config_limit;
> +	} else if (q->rate_bps) {
> +		u64 t = (u64)q->rate_bps * q->interval;
> +
> +		do_div(t, USEC_PER_SEC / 4);
> +		q->buffer_limit = max_t(u32, t, 4U << 20);
> +	} else {
> +		q->buffer_limit = ~0;
> +	}
> +
> +	if (1 || q->rate_bps)
> +		sch->flags &= ~TCQ_F_CAN_BYPASS;
> +	else
> +		sch->flags |= TCQ_F_CAN_BYPASS;
> +
> +	q->buffer_limit = min(q->buffer_limit,
> +			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
> +				  q->buffer_config_limit));
> +}
> +
> +static int cake_change(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *tb[TCA_CAKE_MAX + 1];
> +	int err;
> +
> +	if (!opt)
> +		return -EINVAL;
> +
> +	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, NULL);
> +	if (err < 0)
> +		return err;
> +
> +	if (tb[TCA_CAKE_BASE_RATE])
> +		q->rate_bps = nla_get_u32(tb[TCA_CAKE_BASE_RATE]);
> +
> +	if (tb[TCA_CAKE_DIFFSERV_MODE])
> +		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
> +
> +	if (tb[TCA_CAKE_ATM]) {
> +		q->rate_flags &= ~(CAKE_FLAG_ATM | CAKE_FLAG_PTM);
> +		q->rate_flags |= nla_get_u32(tb[TCA_CAKE_ATM]) &
> +					    (CAKE_FLAG_ATM | CAKE_FLAG_PTM);
> +	}
> +
> +	if (tb[TCA_CAKE_WASH]) {
> +		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
> +			q->rate_flags |= CAKE_FLAG_WASH;
> +		else
> +			q->rate_flags &= ~CAKE_FLAG_WASH;
> +	}
> +
> +	if (tb[TCA_CAKE_FLOW_MODE])
> +		q->flow_mode = nla_get_u32(tb[TCA_CAKE_FLOW_MODE]);
> +
> +	if (tb[TCA_CAKE_NAT]) {
> +		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
> +		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
> +				!!nla_get_u32(tb[TCA_CAKE_NAT]);
> +	}
> +
> +	if (tb[TCA_CAKE_OVERHEAD]) {
> +		if (tb[TCA_CAKE_ETHERNET])
> +			q->rate_overhead = -(nla_get_s32(tb[TCA_CAKE_ETHERNET]));
> +		else
> +			q->rate_overhead = -(qdisc_dev(sch)->hard_header_len);
> +		q->rate_overhead += nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
> +	}
> +
> +	if (tb[TCA_CAKE_MPU]) {
> +		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
> +	}
> +
> +	if (tb[TCA_CAKE_RTT]) {
> +		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
> +
> +		if (!q->interval)
> +			q->interval = 1;
> +	}
> +
> +	if (tb[TCA_CAKE_TARGET]) {
> +		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
> +
> +		if (!q->target)
> +			q->target = 1;
> +	}
> +
> +	if (tb[TCA_CAKE_AUTORATE]) {
> +		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
> +			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
> +		else
> +			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
> +	}
> +
> +	if (tb[TCA_CAKE_INGRESS]) {
> +		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
> +			q->rate_flags |= CAKE_FLAG_INGRESS;
> +		else
> +			q->rate_flags &= ~CAKE_FLAG_INGRESS;
> +	}
> +
> +	if (tb[TCA_CAKE_ACK_FILTER]) {
> +		if (!!nla_get_u32(tb[TCA_CAKE_ACK_FILTER]))
> +			q->rate_flags |= CAKE_FLAG_ACK_FILTER;
> +		else
> +			q->rate_flags &= ~CAKE_FLAG_ACK_FILTER;
> +	}
> +
> +	if (tb[TCA_CAKE_MEMORY])
> +		q->buffer_config_limit = nla_get_s32(tb[TCA_CAKE_MEMORY]);
> +
> +	if (q->tins) {
> +		sch_tree_lock(sch);
> +		cake_reconfigure(sch);
> +		sch_tree_unlock(sch);
> +	}
> +
> +	return 0;
> +}
> +
> +static void *cake_zalloc(size_t sz)
> +{
> +	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
> +
> +	if (!ptr)
> +		ptr = vzalloc(sz);
> +	return ptr;
> +}
> +
> +static void cake_free(void *addr)
> +{
> +	if (addr)
> +		kvfree(addr);
> +}
> +
> +static void cake_destroy(struct Qdisc *sch)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +
> +	qdisc_watchdog_cancel(&q->watchdog);
> +
> +	if (q->tins)
> +		cake_free(q->tins);
> +}
> +
> +static int cake_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	int i, j;
> +
> +	/* codel_cache_init(); */
> +	sch->limit = 10240;
> +	q->tin_mode = CAKE_MODE_DIFFSERV3;
> +	q->flow_mode  = CAKE_FLOW_TRIPLE;
> +
> +	q->rate_bps = 0; /* unlimited by default */
> +
> +	q->interval = 100000; /* 100ms default */
> +	q->target   =   5000; /* 5ms: codel RFC argues
> +			       * for 5 to 10% of interval
> +			       */
> +
> +	q->cur_tin = 0;
> +	q->cur_flow  = 0;
> +
> +	if (opt) {
> +		int err = cake_change(sch, opt);
> +
> +		if (err)
> +			return err;
> +	}
> +
> +	qdisc_watchdog_init(&q->watchdog, sch);
> +
> +	quantum_div[0] = ~0;
> +	for (i = 1; i <= CAKE_QUEUES; i++)
> +		quantum_div[i] = 65535 / i;
> +
> +	q->tins = cake_zalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data));
> +	if (!q->tins)
> +		goto nomem;
> +
> +	for (i = 0; i < CAKE_MAX_TINS; i++) {
> +		struct cake_tin_data *b = q->tins + i;
> +
> +		b->perturbation = prandom_u32();

This doesn't actually *have* to be different per tin.

> +		INIT_LIST_HEAD(&b->new_flows);
> +		INIT_LIST_HEAD(&b->old_flows);
> +		INIT_LIST_HEAD(&b->decaying_flows);
> +		b->sparse_flow_count = 0;
> +		b->bulk_flow_count = 0;
> +		b->decaying_flow_count = 0;
> +		/* codel_params_init(&b->cparams); */
> +
> +		for (j = 0; j < CAKE_QUEUES; j++) {
> +			struct cake_flow *flow = b->flows + j;
> +			u32 k = j * CAKE_MAX_TINS + i;
> +
> +			INIT_LIST_HEAD(&flow->flowchain);
> +			cobalt_vars_init(&flow->cvars);
> +
> +			q->overflow_heap[k].t = i;
> +			q->overflow_heap[k].b = j;
> +			b->overflow_idx[j] = k;
> +		}
> +	}
> +
> +	cake_reconfigure(sch);
> +	q->avg_peak_bandwidth = q->rate_bps;
> +	return 0;
> +
> +nomem:
> +	cake_destroy(sch);
> +	return -ENOMEM;
> +}
> +
> +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *opts;
> +
> +	opts = nla_nest_start(skb, TCA_OPTIONS);
> +	if (!opts)
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_BASE_RATE, q->rate_bps))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_ATM, (q->rate_flags &
> +					    (CAKE_FLAG_ATM | CAKE_FLAG_PTM))))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, q->flow_mode))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_WASH,
> +			!!(q->rate_flags & CAKE_FLAG_WASH)))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead +
> +						qdisc_dev(sch)->hard_header_len))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_ETHERNET, qdisc_dev(sch)->hard_header_len))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
> +			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
> +			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER,
> +			!!(q->rate_flags & CAKE_FLAG_ACK_FILTER)))
> +		goto nla_put_failure;
> +
> +	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
> +		goto nla_put_failure;
> +
> +	return nla_nest_end(skb, opts);
> +
> +nla_put_failure:
> +	return -1;
> +}
> +
> +static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
> +{
> +	/* reuse fq_codel stats format */
> +	struct cake_sched_data *q = qdisc_priv(sch);
> +	struct tc_cake_xstats *st = cake_zalloc(sizeof(*st));
> +	int i;
> +
> +	if (!st)
> +		return -1;
> +
> +	BUG_ON(q->tin_cnt > TC_CAKE_MAX_TINS);
> +
> +	st->version = 5;
> +	st->max_tins = TC_CAKE_MAX_TINS;
> +	st->tin_cnt = q->tin_cnt;
> +
> +	for (i = 0; i < q->tin_cnt; i++) {
> +		struct cake_tin_data *b = &q->tins[i];
> +
> +		st->threshold_rate[i] = b->tin_rate_bps;
> +		st->target_us[i]      = cobalt_time_to_us(b->cparams.target);
> +		st->interval_us[i]    = cobalt_time_to_us(b->cparams.interval);
> +
> +		/* TODO FIXME: add missing aspects of these composite stats */
> +		st->sent[i].packets       = b->packets;
> +		st->sent[i].bytes	  = b->bytes;
> +		st->dropped[i].packets    = b->tin_dropped;
> +		st->ecn_marked[i].packets = b->tin_ecn_mark;
> +		st->backlog[i].bytes      = b->tin_backlog;
> +		st->ack_drops[i].packets  = b->ack_drops;
> +
> +		st->peak_delay_us[i] = cobalt_time_to_us(b->peak_delay);
> +		st->avge_delay_us[i] = cobalt_time_to_us(b->avge_delay);
> +		st->base_delay_us[i] = cobalt_time_to_us(b->base_delay);
> +
> +		st->way_indirect_hits[i] = b->way_hits;
> +		st->way_misses[i]	 = b->way_misses;
> +		st->way_collisions[i]    = b->way_collisions;
> +
> +		st->sparse_flows[i]      = b->sparse_flow_count +
> +					   b->decaying_flow_count;
> +		st->bulk_flows[i]	 = b->bulk_flow_count;
> +		st->unresponse_flows[i]  = b->unresponsive_flow_count;
> +		st->spare[i]		 = 0;
> +		st->max_skblen[i]	 = b->max_skblen;
> +	}
> +	st->capacity_estimate = q->avg_peak_bandwidth;
> +	st->memory_limit      = q->buffer_limit;
> +	st->memory_used       = q->buffer_max_used;
> +
> +	i = gnet_stats_copy_app(d, st, sizeof(*st));
> +	cake_free(st);
> +	return i;
> +}
> +
> +static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
> +	.id		=	"cake",
> +	.priv_size	=	sizeof(struct cake_sched_data),
> +	.enqueue	=	cake_enqueue,
> +	.dequeue	=	cake_dequeue,
> +	.peek		=	qdisc_peek_dequeued,
> +	.init		=	cake_init,
> +	.reset		=	cake_reset,
> +	.destroy	=	cake_destroy,
> +	.change		=	cake_change,
> +	.dump		=	cake_dump,
> +	.dump_stats	=	cake_dump_stats,
> +	.owner		=	THIS_MODULE,
> +};
> +
> +static int __init cake_module_init(void)
> +{
> +	return register_qdisc(&cake_qdisc_ops);
> +}
> +
> +static void __exit cake_module_exit(void)
> +{
> +	unregister_qdisc(&cake_qdisc_ops);
> +}
> +
> +module_init(cake_module_init)
> +module_exit(cake_module_exit)
> +MODULE_AUTHOR("Jonathan Morton");
> +MODULE_LICENSE("Dual BSD/GPL");
> +MODULE_DESCRIPTION("The Cake shaper. Version: " CAKE_VERSION);