[Codel] [PATCH] Preliminary codel implementation

Eric Dumazet eric.dumazet at gmail.com
Thu May 3 14:17:10 EDT 2012


On Thu, 2012-05-03 at 10:52 -0700, Dave Täht wrote:
> ---
>  include/linux/pkt_sched.h |   29 +++++
>  net/sched/Kconfig         |   11 ++
>  net/sched/Makefile        |    1 +
>  net/sched/sch_codel.c     |  296 +++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 337 insertions(+)
>  create mode 100644 net/sched/sch_codel.c
> 


Hi Dave.

Was this tested, or is it just a draft?

> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 0d5b793..c21b720 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -633,4 +633,33 @@ struct tc_qfq_stats {
>  	__u32 lmax;
>  };
>  
> +/* CODEL */
> +
> +enum {
> +	TCA_CODEL_UNSPEC,
> +	TCA_CODEL_PARMS,
> +	TCA_CODEL_TARGET,
> +	TCA_CODEL_DEPTH,
> +	TCA_CODEL_MINBYTES,
> +	TCA_CODEL_INTERVAL,
> +	__TCA_CODEL_MAX
> +};
> +
> +#define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
> +#define TC_CODEL_ECN 1
> +
> +struct tc_codel_qopt {
> +        __u32 flags;	/* flags (e.g. ecn) */
> +	__u32 target;	/* max delay, in us */
> +        __u32 depth;	/* queue depth in packets */
> +        __u32 minbytes;	/* MTU (usually) */
> +        __u32 interval;	/* Sliding min time window width (us) */
> +};
> +
> +struct tc_codel_stats {
> +	__u64 drops;
> +	__u64 marks;
> +};
> +
> +
>  #endif
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 2590e91..8106c42 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -250,6 +250,17 @@ config NET_SCH_QFQ
>  
>  	  If unsure, say N.
>  
> +config NET_SCH_CODEL
> +	tristate "Controlled Delay AQM (CODEL)"
> +	help
> +	  Say Y here if you want to use the Controlled Delay (CODEL)
> +	  packet scheduling algorithm.
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called sch_codel.
> +
> +	  If unsure, say N.
> +
>  config NET_SCH_INGRESS
>  	tristate "Ingress Qdisc"
>  	depends on NET_CLS_ACT
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index dc5889c..41130b5 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
>  obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
>  obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
>  obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
> +obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
>  
>  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
> diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
> new file mode 100644
> index 0000000..159df47
> --- /dev/null
> +++ b/net/sched/sch_codel.c
> @@ -0,0 +1,296 @@
> +/*
> + * net/sched/sch_codel.c	A Codel implementation
> + *
> + *		This program is free software; you can redistribute it and/or
> + *		modify it under the terms of the GNU General Public License
> + *		as published by the Free Software Foundation; either version
> + *		2 of the License, or (at your option) any later version.
> + *
> + * Based on ns2 simulation code presented by Kathie Nichols
> + * Authors:	Dave Täht <d at taht.net>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/errno.h>
> +#include <linux/ktime.h>
> +#include <linux/skbuff.h>
> +#include <net/pkt_sched.h>
> +
> +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000))
> +#define DEFAULT_CODEL_DEPTH 1000
> +
> +/* Per-queue state (codel_queue_t instance variables) */
> +
> +struct codel_sched_data {
> +	u32 flags;
> +	u32 minbytes;
> +	u32 count; /* packets dropped since we went into drop state */
> +	bool dropping;	/* 1 if in drop state, might just add to flags */
> +	ktime_t target;
> +	ktime_t interval;
> +        /* time to declare above q->target (0 if below)*/
> +	ktime_t first_above_time;
> +	ktime_t drop_next; /* time to drop next packet */
> +};
> +
> +struct codel_skb_cb {
> +	ktime_t enqueue_time;
> +	char data[16];
> +};
> +
> +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
> +{
> +	return (struct codel_skb_cb *)skb->cb;
> +}

You can't do that. You MUST keep the generic qdisc cb unchanged, e.g.:



{
	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb)); 
	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
}




> +
> +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) {
> +	struct codel_skb_cb *cb = get_codel_cb(skb);

	const struct codel_skb_cb *cb = get_codel_cb(skb);
	
> +	return cb->enqueue_time;

	or : return get_codel_cb(skb)->enqueue_time

> +}
> +
> +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) {
> +	struct codel_skb_cb *cb = get_codel_cb(skb);
> +	cb->enqueue_time = t;

	get_codel_cb(skb)->enqueue_time = t;

> +	return t;
> +}
> +
> +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
> +{
> +	t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count);

that's a killer for a lot of machines... (a divide plus the int_sqrt())

> +	return t;
> +}
> +
> +/*
> +static int codel_prob_mark(const struct codel_sched_data *q)
> +{
> +	return q->flags & TC_CODEL_ECN;

Not clear if ECN plays well with CoDel at all.

> +}
> +*/
> +
> + /* wrappers for ultimate statistics collection */
> +
> +static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) {
> +	return qdisc_drop(skb,sch);
> +}
> +
> +/*
> +static int codel_queue_drop(struct Qdisc *sch) {
> +	return qdisc_drop(skb,sch);
> +}
> +*/
> +
> +struct sk_buff *codel_dequeue_head(struct Qdisc *sch) {
> +	return(qdisc_dequeue_head(sch));
> +}
> +
> +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
> +{
> +	struct codel_sched_data *q = qdisc_priv(sch);
> +	bool drop = 0;

	bool drop = false;

> +	if (skb == NULL) {
> +		q->first_above_time.tv64 = 0;
> +	} else {
> +		ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb));
> +		if (sojourn_time.tv64 < 
> +		    q->target.tv64 || sch->qstats.backlog < q->minbytes) {
> +/* went below so we’ll stay below for at least q->interval */
> +			q->first_above_time.tv64 = 0;
> +		} else {
> +			if (q->first_above_time.tv64 == 0) {
> +
> +/* just went above from below. If we stay above
> + * for at least q->interval we’ll say it’s ok to drop 
> + */
> +				q->first_above_time = 
> +					ktime_add(now,q->interval);
> +			} else if (now.tv64 >= q->first_above_time.tv64) {
> +				drop = 1;
> +			}
> +		}
> +	}
> +	return drop;
> +}
> +
> +static struct sk_buff *codel_dequeue(struct Qdisc *sch)
> +{
> +	struct codel_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb = codel_dequeue_head(sch);
> +	ktime_t now;
> +	bool drop;

missing newline 

> +	if (skb == NULL) {
> +		q->dropping = 0;
> +		q->first_above_time.tv64 = 0;
> +		return skb;
> +	}
> +	now = ktime_get();
> +	drop = should_drop(skb, sch, now);
> +	if (q->dropping) {
> +		if (drop) {
> +/* sojourn time below target - leave dropping state */
> +			q->dropping = 0;
> +		} else if (now.tv64 >= q->drop_next.tv64) {
> +/*
> + * It’s time for the next drop. Drop the current packet and dequeue the next. 
> + * The dequeue might take us out of dropping state. If not, schedule the 
> + * next drop. A large backlog might result in drop rates so high that the next
> + * drop should happen now, hence the ‘while’ loop.
> + */  
> +			while (now.tv64 >= q->drop_next.tv64 && q->dropping) {
> +				codel_drop(skb, sch);
> +				q->count++;
> +				skb = codel_dequeue_head(sch);
> +				if (should_drop(skb,sch,now)) {
> +/* leave dropping state */
> +					q->dropping = 0;
> +				} else {
> +/* and schedule the next drop */
> +					q->drop_next = 
> +						control_law(q,q->drop_next);
> +				}
> +			}
> +		} else if (drop &&
> +			   ((now.tv64 - q->drop_next.tv64 < 
> +				16 * q->interval.tv64) ||
> +			    (now.tv64 - q->first_above_time.tv64 >= 
> +			     2 * q->interval.tv64))) {
> +			codel_drop(skb, sch);
> +			skb = codel_dequeue_head(sch);
> +			/* engage state machine */
> +			drop = should_drop(skb,sch,now); 
> +			q->dropping = 1;
> +			
> +/* if min went above target close to when we last went below it
> + * assume that the drop rate that controlled the queue on the
> + * last cycle is a good starting point to control it now.
> + */
> +			if (now.tv64 - q->drop_next.tv64 < 
> +				16 * q->interval.tv64) {

Doing ktime_t compares is tricky. Here it's not done properly.

Are you sure we need 64-bit and ns resolution?

IMHO 'unsigned long' values are the best, for 32-bit arches.

CoDel paper mentions ms resolution...

Even if you use us resolution, we can have about 2000 secs of window in
an unsigned long.

> +				int c = q->count - 1;
> +				q->count = c < 1 ? 1 : c;
> +			} else {
> +				q->count = 1;
> +			}
> +			q->drop_next = control_law(q,now);
> +		}
> +	}
> +	return skb;
> +}
> +
> +
> +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{

I am worried about the latency spike in dequeue(), if we need to drop
hundreds of packets (while the device is idle... If BQL is in use... it's
really unfortunate)

Could we try to check whether we can drop one or two packets right now that
were queued in previous ->enqueue() calls, to smooth the drop processing?

> +	if (likely(skb_queue_len(&sch->q) < sch->limit)) {
> +		set_enqueue_time(skb,ktime_get());
> +		return qdisc_enqueue_tail(skb, sch);
> +	}
> +	return qdisc_reshape_fail(skb, sch);
> +}
> +
> +static int codel_change(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct codel_sched_data *q = qdisc_priv(sch);
> +	struct tc_codel_qopt *ctl = nla_data(opt);
> +	unsigned int qlen;
> +
> +	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
> +		return -EINVAL;
> +	if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536))
> +		return -EINVAL;
> +	if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
> +		return -EINVAL;
> +	sch_tree_lock(sch);
> +	if (ctl->minbytes) q->minbytes = ctl->minbytes;

	if (cond)    // NEWLINE
		statement;

> +	if (ctl->flags) q->flags = ctl->flags;
> +	if (ctl->target) q->target = ns_to_ktime((u64) ctl->target * 1000); 
> +	if (ctl->interval) q->interval = 
> +			ns_to_ktime((u64) ctl->interval * 1000); 
> +
> +	/* something saner than this for depth is probably needed */
> +
> +	if (ctl->depth) sch->limit = ctl->depth;
> +	qlen = sch->q.qlen;
> +//	while (sch->q.qlen > ctl->depth)
> +//		codel_drop(skb,sch);
> +//	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //?
> +	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
> +	q->dropping = 0; /* exit dropping state */
> +	sch_tree_unlock(sch);
> +	return 0;
> +}
> +
> +static int codel_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct codel_sched_data *q = qdisc_priv(sch);
> +	q->target = MS2TIME(5);
> +	sch->limit =  DEFAULT_CODEL_DEPTH;
> +	q->minbytes = 1500;

	Please dont hard code this value, use ETH_DATA_LEN, or better : 

psched_mtu(qdisc_dev(sch));


> +	q->interval = MS2TIME(100);
> +	if (opt) {
> +		int err = codel_change(sch, opt);
> +		if (err)
> +			return err;
> +	}
> +
> +/*	if (sch->limit >= 1)
> +		sch->flags |= TCQ_F_CAN_BYPASS;
> +	else
> +		sch->flags &= ~TCQ_F_CAN_BYPASS;
> +*/
> +	return 0;
> +}
> +
> +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct codel_sched_data *q = qdisc_priv(sch);
> +	struct tc_codel_qopt opt;
> +	opt.target = (u32) ktime_to_us(q->target);
> +	opt.interval  = (u32) ktime_to_us(q->interval);
> +	opt.depth  = sch->limit;
> +	opt.flags  = q->flags;
> +	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

NLA_PUT() doesn't exist anymore in the net-next tree; please rebase on
net-next

> +	return skb->len;
> +
> +nla_put_failure:
> +//	nlmsg_trim(skb, b);
> +	return -1;
> +}
> +
> +static void
> +codel_reset(struct Qdisc *sch)
> +{
> +	struct sk_buff *skb;
> +
> +	while ((skb = codel_dequeue(sch)) != NULL)
> +		kfree_skb(skb);


> +}
> +
> +struct Qdisc_ops codel_qdisc_ops __read_mostly = {
> +	.id		=	"codel",
> +	.priv_size	=	sizeof(struct codel_sched_data),
> +	.enqueue	=	codel_enqueue,
> +	.dequeue	=	codel_dequeue,
> +	.peek		=	qdisc_peek_head,
> +/*	.drop		=	codel_queue_drop, */
> +	.init		=	codel_init,
> +	.reset		=	codel_reset,
> +	.change		=	codel_change,
> +	.dump		=	codel_dump,
> +	.owner		=	THIS_MODULE,
> +};
> +EXPORT_SYMBOL(codel_qdisc_ops);
> +
> +static int __init codel_module_init(void)
> +{
> +        return register_qdisc(&codel_qdisc_ops);
> +}
> +static void __exit codel_module_exit(void)
> +{
> +        unregister_qdisc(&codel_qdisc_ops);
> +}
> +module_init(codel_module_init)
> +module_exit(codel_module_exit)
> +MODULE_LICENSE("GPL");
> +
> _______________________________________________
> Codel mailing list
> Codel at lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/codel





More information about the Codel mailing list