[Codel] [PATCH] codel take 2

Fri May 4 03:29:49 EDT 2012

I took most of the suggestions from the code review.

This attempthas some debugging code left in it. It looks logically correct,
and does exercise the various substates, does drop packets, and
does control delay. On a 2mbit htb setup like this:

tc qdisc add dev eth0 root handle 1: est 1sec 8sec htb default 1
tc class add dev eth0 parent 1: classid 1:1 est 1sec 8sec htb \
rate 2000kibit mtu 1500 quantum 1514
tc qdisc add dev eth0 parent 1:1 handle 10: est 1sec 4sec
        codel target 5ms interval 100ms depth 1000

talking to some servers a few ms away via netperf,

I see it holding delays below 30ms.

This does not mean that I have the constants or units perfect.

Also for giggles I setup codel as a sub qdisc of qfq, and it worked.

Lastly I ran several dozen GB through it, and pfifo fast on a fast
86_64 box and saw roughly comparable throughput for both.

I get mildly better results with TSO and GSO off.

BUGS:

I gave up on figuring out how to interface tc netlink to it, and
hard coded the default constants in the paper. ???

Still some questions outstanding from the first code review.

htb complains about it not being work conserving.

I'll take another pass at this tomorrow afternoon.
---
 include/linux/pkt_sched.h |   29 ++++
 net/sched/Kconfig         |   11 ++
 net/sched/Makefile        |    1 +
 net/sched/sch_codel.c     |  352 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 393 insertions(+)
 create mode 100644 net/sched/sch_codel.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 0d5b793..c21b720 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -633,4 +633,33 @@ struct tc_qfq_stats {
 	__u32 lmax;
 };
 
+/* CODEL */
+
+enum {
+	TCA_CODEL_UNSPEC,
+	TCA_CODEL_PARMS,
+	TCA_CODEL_TARGET,
+	TCA_CODEL_DEPTH,
+	TCA_CODEL_MINBYTES,
+	TCA_CODEL_INTERVAL,
+	__TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
+#define TC_CODEL_ECN 1
+
+struct tc_codel_qopt {
+        __u32 flags;	/* flags (e.g. ecn) */
+	__u32 target;	/* max delay, in us */
+        __u32 depth;	/* queue depth in packets */
+        __u32 minbytes;	/* MTU (usually) */
+        __u32 interval;	/* Sliding min time window width (us) */
+};
+
+struct tc_codel_stats {
+	__u64 drops;
+	__u64 marks;
+};
+
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91..8106c42 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
 
 	  If unsure, say N.
 
+config NET_SCH_CODEL
+	tristate "Controlled Delay AQM (CODEL)"
+	help
+	  Say Y here if you want to use the Controlled Delay (CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_codel.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c..41130b5 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 0000000..3b74090
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,352 @@
+/*
+ * net/sched/sch_codel.c	A Codel implementation
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Based on ns2 simulation code presented by Kathie Nichols
+ * Authors:	Dave Täht <d at taht.net>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000))
+#define DEFAULT_CODEL_DEPTH 1000
+
+/* 
+ * Via patch found at:
+ * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html 
+ * I don't know why this isn't in ktime.h as it seemed sane...
+*/
+
+/**
+ * ktime_compare - Compares two ktime_t variables
+ *
+ * Return val:
+ * lhs < rhs: < 0
+ * lhs == rhs: 0
+ * lhs > rhs: > 0
+ */
+
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+	if (lhs.tv64 < rhs.tv64)
+		return -1;
+	if (lhs.tv64 > rhs.tv64)
+		return 1;
+	return 0;
+}
+#else
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+	if (lhs.tv.sec < rhs.tv.sec)
+		return -1;
+	if (lhs.tv.sec > rhs.tv.sec)
+		return 1;
+	return lhs.tv.nsec - rhs.tv.nsec;
+}
+#endif
+
+/* Per-queue state (codel_queue_t instance variables) */
+
+struct codel_sched_data {
+	u32 flags;
+	u32 minbytes;
+	u32 count; /* packets dropped since we went into drop state */
+	bool dropping;	/* 1 if in drop state, might just add to flags */
+	ktime_t target;
+        /* time to declare above q->target (0 if below)*/
+	ktime_t first_above_time;
+	ktime_t drop_next; /* time to drop next packet */
+	s64 interval; 
+};
+
+struct codel_skb_cb {
+	ktime_t enqueue_time;
+	char data[16];
+};
+
+static int debug = 1;
+static int state1 = 0;
+static int state2 = 0;
+static int state3 = 0;
+static int state4 = 0;
+static int states = 0;
+
+static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline ktime_t get_enqueue_time(const struct sk_buff *skb) {
+	return get_codel_cb(skb)->enqueue_time;
+}
+
+static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t) {
+	get_codel_cb(skb)->enqueue_time = t;
+	return t;
+}
+
+static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
+{
+	return ktime_add_ns(t, q->interval / int_sqrt(q->count));
+}
+
+/*
+static int codel_prob_mark(const struct codel_sched_data *q)
+{
+	return q->flags & TC_CODEL_ECN;
+}
+*/
+
+/* wrappers for ultimate statistics collection */
+
+static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) {
+  printk (KERN_CRIT "droppped packet\n");
+
+	return qdisc_drop(skb,sch);
+}
+
+/*
+static int codel_queue_drop(struct Qdisc *sch) {
+	return qdisc_drop(skb,sch);
+}
+*/
+
+struct sk_buff *codel_dequeue_head(struct Qdisc *sch) {
+	return(qdisc_dequeue_head(sch));
+}
+
+bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	bool drop = false;
+
+	if (skb == NULL) {
+		q->first_above_time.tv64 = 0;
+	} else {
+		ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb));
+		if (ktime_compare(sojourn_time, q->target) < 0 || 
+		    sch->qstats.backlog < q->minbytes) {
+		/* went below so we’ll stay below for at least q->interval */
+			q->first_above_time.tv64 = 0;
+		} else {
+			if (q->first_above_time.tv64 == 0) {
+		/* 
+		 * just went above from below. If we stay above
+		 * for at least q->interval we’ll say it’s ok to drop 
+		 */
+				q->first_above_time = 
+					ktime_add_ns(now,q->interval);
+			} else if (
+				ktime_compare(now,q->first_above_time) >=0) {
+				drop = true;
+				state1++;
+			}
+		}
+	}
+	return drop;
+}
+
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = codel_dequeue_head(sch);
+	ktime_t now;
+	bool drop;
+
+	if (skb == NULL) {
+		q->dropping = false;
+		return skb;
+	}
+	now = ktime_get();
+	drop = should_drop(skb, sch, now);
+	if (q->dropping) {
+		if (! drop) {
+/* sojourn time below target - leave dropping state */
+			q->dropping = false;
+		} else if (ktime_compare(now,q->drop_next) >=0) {
+			state2++;
+/*
+ * It’s time for the next drop. Drop the current packet and dequeue the next. 
+ * The dequeue might take us out of dropping state. If not, schedule the 
+ * next drop. A large backlog might result in drop rates so high that the next
+ * drop should happen now, hence the ‘while’ loop.
+ */  
+			while(q->dropping && 
+			      (ktime_compare(now,q->drop_next) >= 0)) {
+				codel_drop(skb, sch);
+				q->count++;
+				skb = codel_dequeue_head(sch);
+				if (! should_drop(skb,sch,now)) {
+				/* leave dropping state */
+					q->dropping = false;
+				} else {
+				/* and schedule the next drop */
+					q->drop_next = 
+						control_law(q,q->drop_next);
+				}
+			}
+		}
+	} else if (drop &&
+		   ((ktime_compare(ktime_sub(now,q->drop_next),
+				   ns_to_ktime(16 * q->interval)) < 0) ||
+		   (ktime_compare(ktime_sub(now,q->first_above_time),
+				  ns_to_ktime(2 * q->interval)) >= 0 ))) {
+		codel_drop(skb, sch);
+		skb = codel_dequeue_head(sch);
+		drop = should_drop(skb,sch,now);
+		q->dropping = true;
+		state3++;
+		/* 
+		 * if min went above target close to when we last went below it
+		 * assume that the drop rate that controlled the queue on the
+		 * last cycle is a good starting point to control it now.
+		 */
+		if (ktime_compare(ktime_sub(now,q->drop_next),
+				  ns_to_ktime(16 * q->interval)) < 0) {
+			q->count = q->count > 1 ? q->count - 1 : 1;
+		} else {
+			q->count = 1;
+		}
+		q->drop_next = control_law(q,now);
+	}
+/* if(states++ % 64 == 0 ) { 
+	printk (KERN_CRIT "s1: %d, s2: %d, s3: %d, d4: %d\n",state1,state2,state3,state4); 
+}
+*/
+	return skb;
+}
+
+
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+		set_enqueue_time(skb,ktime_get());
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	return qdisc_reshape_fail(skb, sch);
+}
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct tc_codel_qopt *ctl = nla_data(opt);
+	unsigned int qlen;
+
+	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+		return -EINVAL;
+	if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536))
+		return -EINVAL;
+	if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
+		return -EINVAL;
+	sch_tree_lock(sch);
+	if (ctl->minbytes)
+		q->minbytes = ctl->minbytes;
+	if (ctl->flags)
+		q->flags = ctl->flags;
+	if (ctl->target)
+		q->target = ns_to_ktime((u64) ctl->target * 1000); 
+	if (ctl->interval)
+		q->interval = (s64) ctl->interval * 1000; 
+
+	/* something saner than this for depth is probably needed */
+
+	if (ctl->depth) 
+		sch->limit = ctl->depth;
+	qlen = sch->q.qlen;
+//	while (sch->q.qlen > ctl->depth)
+//		codel_drop(skb,sch);
+//	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //?
+	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+	q->dropping = 0; /* exit dropping state */
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	q->target = MS2TIME(5);
+	sch->limit =  DEFAULT_CODEL_DEPTH;
+	q->minbytes = psched_mtu(qdisc_dev(sch));
+	q->interval = 100 * 1000000;
+	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+	q->dropping = 0; /* exit dropping state */
+	/* if (!opt) { */
+	/* 	int err = codel_change(sch, opt); */
+	/* 	if (err) */
+	/* 		return err; */
+	/* } */
+
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct tc_codel_qopt opt;
+	opt.target = (u32) ktime_to_us(q->target);
+	opt.interval  = (u32) (q->interval/1000);
+	opt.depth  = sch->limit;
+	opt.flags  = q->flags;
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	return skb->len;
+
+nla_put_failure:
+//	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static void
+codel_reset(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	while ((skb = codel_dequeue(sch)) != NULL)
+		kfree_skb(skb);
+}
+
+struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+	.id		=	"codel",
+	.priv_size	=	sizeof(struct codel_sched_data),
+	.enqueue	=	codel_enqueue,
+	.dequeue	=	codel_dequeue,
+	.peek		=	qdisc_peek_head,
+/*	.drop		=	codel_queue_drop, */
+	.init		=	codel_init,
+	.reset		=	codel_reset,
+	.change		=	codel_change,
+	.dump		=	codel_dump,
+	.owner		=	THIS_MODULE,
+};
+EXPORT_SYMBOL(codel_qdisc_ops);
+
+static int __init codel_module_init(void)
+{
+        return register_qdisc(&codel_qdisc_ops);
+}
+static void __exit codel_module_exit(void)
+{
+        unregister_qdisc(&codel_qdisc_ops);
+}
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+MODULE_LICENSE("GPL");
+
-- 
1.7.9.5