* [Codel] [PATCH] codel take 2
@ 2012-05-04 7:29 Dave Täht
2012-05-04 8:56 ` Eric Dumazet
0 siblings, 1 reply; 3+ messages in thread
From: Dave Täht @ 2012-05-04 7:29 UTC (permalink / raw)
To: codel; +Cc: Dave Täht
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain, Size: 12817 bytes --]
I took most of the suggestions from the code review.
This attempt has some debugging code left in it. It looks logically correct,
and does exercise the various substates, does drop packets, and
does control delay. On a 2mbit htb setup like this:
tc qdisc add dev eth0 root handle 1: est 1sec 8sec htb default 1
tc class add dev eth0 parent 1: classid 1:1 est 1sec 8sec htb \
rate 2000kibit mtu 1500 quantum 1514
tc qdisc add dev eth0 parent 1:1 handle 10: est 1sec 4sec
codel target 5ms interval 100ms depth 1000
talking to some servers a few ms away via netperf,
I see it holding delays below 30ms.
This does not mean that I have the constants or units perfect.
Also for giggles I setup codel as a sub qdisc of qfq, and it worked.
Lastly I ran several dozen GB through it, and pfifo fast on a fast
x86_64 box and saw roughly comparable throughput for both.
I get mildly better results with TSO and GSO off.
BUGS:
I gave up on figuring out how to interface tc netlink to it, and
hard coded the default constants in the paper. ???
Still some questions outstanding from the first code review.
htb complains about it not being work conserving.
I'll take another pass at this tomorrow afternoon.
---
include/linux/pkt_sched.h | 29 ++++
net/sched/Kconfig | 11 ++
net/sched/Makefile | 1 +
net/sched/sch_codel.c | 352 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 393 insertions(+)
create mode 100644 net/sched/sch_codel.c
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 0d5b793..c21b720 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -633,4 +633,33 @@ struct tc_qfq_stats {
__u32 lmax;
};
+/* CODEL */
+
+enum {
+ TCA_CODEL_UNSPEC,
+ TCA_CODEL_PARMS,
+ TCA_CODEL_TARGET,
+ TCA_CODEL_DEPTH,
+ TCA_CODEL_MINBYTES,
+ TCA_CODEL_INTERVAL,
+ __TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1)
+#define TC_CODEL_ECN 1
+
+struct tc_codel_qopt {
+ __u32 flags; /* flags (e.g. ecn) */
+ __u32 target; /* max delay, in us */
+ __u32 depth; /* queue depth in packets */
+ __u32 minbytes; /* MTU (usually) */
+ __u32 interval; /* Sliding min time window width (us) */
+};
+
+struct tc_codel_stats {
+ __u64 drops;
+ __u64 marks;
+};
+
+
#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91..8106c42 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
If unsure, say N.
+config NET_SCH_CODEL
+ tristate "Controlled Delay AQM (CODEL)"
+ help
+ Say Y here if you want to use the Controlled Delay (CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_codel.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c..41130b5 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 0000000..3b74090
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,352 @@
+/*
+ * net/sched/sch_codel.c A Codel implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ns2 simulation code presented by Kathie Nichols
+ * Authors: Dave Täht <d@taht.net>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000))
+#define DEFAULT_CODEL_DEPTH 1000
+
+/*
+ * Via patch found at:
+ * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html
+ * I don't know why this isn't in ktime.h as it seemed sane...
+*/
+
+/**
+ * ktime_compare - Compares two ktime_t variables
+ *
+ * Return val:
+ * lhs < rhs: < 0
+ * lhs == rhs: 0
+ * lhs > rhs: > 0
+ */
+
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+ if (lhs.tv64 < rhs.tv64)
+ return -1;
+ if (lhs.tv64 > rhs.tv64)
+ return 1;
+ return 0;
+}
+#else
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+ if (lhs.tv.sec < rhs.tv.sec)
+ return -1;
+ if (lhs.tv.sec > rhs.tv.sec)
+ return 1;
+ return lhs.tv.nsec - rhs.tv.nsec;
+}
+#endif
+
+/* Per-queue state (codel_queue_t instance variables) */
+
+struct codel_sched_data {
+ u32 flags;
+ u32 minbytes;
+ u32 count; /* packets dropped since we went into drop state */
+ bool dropping; /* 1 if in drop state, might just add to flags */
+ ktime_t target;
+ /* time to declare above q->target (0 if below)*/
+ ktime_t first_above_time;
+ ktime_t drop_next; /* time to drop next packet */
+ s64 interval;
+};
+
+struct codel_skb_cb {
+ ktime_t enqueue_time;
+ char data[16];
+};
+
+static int debug = 1;
+static int state1 = 0;
+static int state2 = 0;
+static int state3 = 0;
+static int state4 = 0;
+static int states = 0;
+
+static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+ return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline ktime_t get_enqueue_time(const struct sk_buff *skb) {
+ return get_codel_cb(skb)->enqueue_time;
+}
+
+static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t) {
+ get_codel_cb(skb)->enqueue_time = t;
+ return t;
+}
+
+static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
+{
+ return ktime_add_ns(t, q->interval / int_sqrt(q->count));
+}
+
+/*
+static int codel_prob_mark(const struct codel_sched_data *q)
+{
+ return q->flags & TC_CODEL_ECN;
+}
+*/
+
+/* wrappers for ultimate statistics collection */
+
+static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) {
+ printk (KERN_CRIT "droppped packet\n");
+
+ return qdisc_drop(skb,sch);
+}
+
+/*
+static int codel_queue_drop(struct Qdisc *sch) {
+ return qdisc_drop(skb,sch);
+}
+*/
+
+struct sk_buff *codel_dequeue_head(struct Qdisc *sch) {
+ return(qdisc_dequeue_head(sch));
+}
+
+bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ bool drop = false;
+
+ if (skb == NULL) {
+ q->first_above_time.tv64 = 0;
+ } else {
+ ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb));
+ if (ktime_compare(sojourn_time, q->target) < 0 ||
+ sch->qstats.backlog < q->minbytes) {
+ /* went below so we’ll stay below for at least q->interval */
+ q->first_above_time.tv64 = 0;
+ } else {
+ if (q->first_above_time.tv64 == 0) {
+ /*
+ * just went above from below. If we stay above
+ * for at least q->interval we’ll say it’s ok to drop
+ */
+ q->first_above_time =
+ ktime_add_ns(now,q->interval);
+ } else if (
+ ktime_compare(now,q->first_above_time) >=0) {
+ drop = true;
+ state1++;
+ }
+ }
+ }
+ return drop;
+}
+
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = codel_dequeue_head(sch);
+ ktime_t now;
+ bool drop;
+
+ if (skb == NULL) {
+ q->dropping = false;
+ return skb;
+ }
+ now = ktime_get();
+ drop = should_drop(skb, sch, now);
+ if (q->dropping) {
+ if (! drop) {
+/* sojourn time below target - leave dropping state */
+ q->dropping = false;
+ } else if (ktime_compare(now,q->drop_next) >=0) {
+ state2++;
+/*
+ * It’s time for the next drop. Drop the current packet and dequeue the next.
+ * The dequeue might take us out of dropping state. If not, schedule the
+ * next drop. A large backlog might result in drop rates so high that the next
+ * drop should happen now, hence the ‘while’ loop.
+ */
+ while(q->dropping &&
+ (ktime_compare(now,q->drop_next) >= 0)) {
+ codel_drop(skb, sch);
+ q->count++;
+ skb = codel_dequeue_head(sch);
+ if (! should_drop(skb,sch,now)) {
+ /* leave dropping state */
+ q->dropping = false;
+ } else {
+ /* and schedule the next drop */
+ q->drop_next =
+ control_law(q,q->drop_next);
+ }
+ }
+ }
+ } else if (drop &&
+ ((ktime_compare(ktime_sub(now,q->drop_next),
+ ns_to_ktime(16 * q->interval)) < 0) ||
+ (ktime_compare(ktime_sub(now,q->first_above_time),
+ ns_to_ktime(2 * q->interval)) >= 0 ))) {
+ codel_drop(skb, sch);
+ skb = codel_dequeue_head(sch);
+ drop = should_drop(skb,sch,now);
+ q->dropping = true;
+ state3++;
+ /*
+ * if min went above target close to when we last went below it
+ * assume that the drop rate that controlled the queue on the
+ * last cycle is a good starting point to control it now.
+ */
+ if (ktime_compare(ktime_sub(now,q->drop_next),
+ ns_to_ktime(16 * q->interval)) < 0) {
+ q->count = q->count > 1 ? q->count - 1 : 1;
+ } else {
+ q->count = 1;
+ }
+ q->drop_next = control_law(q,now);
+ }
+/* if(states++ % 64 == 0 ) {
+ printk (KERN_CRIT "s1: %d, s2: %d, s3: %d, d4: %d\n",state1,state2,state3,state4);
+}
+*/
+ return skb;
+}
+
+
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+ set_enqueue_time(skb,ktime_get());
+ return qdisc_enqueue_tail(skb, sch);
+ }
+ return qdisc_reshape_fail(skb, sch);
+}
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_qopt *ctl = nla_data(opt);
+ unsigned int qlen;
+
+ if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+ if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536))
+ return -EINVAL;
+ if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
+ return -EINVAL;
+ sch_tree_lock(sch);
+ if (ctl->minbytes)
+ q->minbytes = ctl->minbytes;
+ if (ctl->flags)
+ q->flags = ctl->flags;
+ if (ctl->target)
+ q->target = ns_to_ktime((u64) ctl->target * 1000);
+ if (ctl->interval)
+ q->interval = (s64) ctl->interval * 1000;
+
+ /* something saner than this for depth is probably needed */
+
+ if (ctl->depth)
+ sch->limit = ctl->depth;
+ qlen = sch->q.qlen;
+// while (sch->q.qlen > ctl->depth)
+// codel_drop(skb,sch);
+// qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //?
+ q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+ q->dropping = 0; /* exit dropping state */
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ q->target = MS2TIME(5);
+ sch->limit = DEFAULT_CODEL_DEPTH;
+ q->minbytes = psched_mtu(qdisc_dev(sch));
+ q->interval = 100 * 1000000;
+ q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+ q->dropping = 0; /* exit dropping state */
+ /* if (!opt) { */
+ /* int err = codel_change(sch, opt); */
+ /* if (err) */
+ /* return err; */
+ /* } */
+
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_qopt opt;
+ opt.target = (u32) ktime_to_us(q->target);
+ opt.interval = (u32) (q->interval/1000);
+ opt.depth = sch->limit;
+ opt.flags = q->flags;
+ NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ return skb->len;
+
+nla_put_failure:
+// nlmsg_trim(skb, b);
+ return -1;
+}
+
+static void
+codel_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = codel_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+ .id = "codel",
+ .priv_size = sizeof(struct codel_sched_data),
+ .enqueue = codel_enqueue,
+ .dequeue = codel_dequeue,
+ .peek = qdisc_peek_head,
+/* .drop = codel_queue_drop, */
+ .init = codel_init,
+ .reset = codel_reset,
+ .change = codel_change,
+ .dump = codel_dump,
+ .owner = THIS_MODULE,
+};
+EXPORT_SYMBOL(codel_qdisc_ops);
+
+static int __init codel_module_init(void)
+{
+ return register_qdisc(&codel_qdisc_ops);
+}
+static void __exit codel_module_exit(void)
+{
+ unregister_qdisc(&codel_qdisc_ops);
+}
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+MODULE_LICENSE("GPL");
+
--
1.7.9.5
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [Codel] [PATCH] codel take 2
2012-05-04 7:29 [Codel] [PATCH] codel take 2 Dave Täht
@ 2012-05-04 8:56 ` Eric Dumazet
2012-05-04 10:27 ` Eric Dumazet
0 siblings, 1 reply; 3+ messages in thread
From: Eric Dumazet @ 2012-05-04 8:56 UTC (permalink / raw)
To: Dave Täht; +Cc: codel
On Fri, 2012-05-04 at 00:29 -0700, Dave Täht wrote:
> I took most of the suggestions from the code review.
>
> This attempt has some debugging code left in it. It looks logically correct,
> and does exercise the various substates, does drop packets, and
> does control delay. On a 2mbit htb setup like this:
>
> tc qdisc add dev eth0 root handle 1: est 1sec 8sec htb default 1
> tc class add dev eth0 parent 1: classid 1:1 est 1sec 8sec htb \
> rate 2000kibit mtu 1500 quantum 1514
> tc qdisc add dev eth0 parent 1:1 handle 10: est 1sec 4sec
> codel target 5ms interval 100ms depth 1000
>
The usual keyword for tc qdisc is "limit 1000", not "depth 1000"
(CoDel has no depth mentioned at all, so I suspect this can be changed)
> talking to some servers a few ms away via netperf,
>
> I see it holding delays below 30ms.
>
> This does not mean that I have the constants or units perfect.
>
> Also for giggles I setup codel as a sub qdisc of qfq, and it worked.
>
> Lastly I ran several dozen GB through it, and pfifo fast on a fast
> x86_64 box and saw roughly comparable throughput for both.
>
> I get mildly better results with TSO and GSO off.
Since this code doesn't compile on 32bit arch as is and is not based
on net-next, it's a bit convoluted for me right now.
NLA_PUT: It's not that hard to get rid of it.
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
and the 64 bit divide doesnt work on 32bit, you need do_div() helper.
ERROR: "__divdi3" [net/sched/sch_codel.ko] undefined!
Are you sure interval should be an s64 and not a u64?
(But my personal choice would be an unsigned long)
u64 interval = q->interval;
do_div(interval, int_sqrt(q->count));
return ktime_add_ns(t, interval);
But a divide is expensive, and do_div() is _really_ expensive on 32bit.
So please prefer an "unsigned long" (max 4 second interval) and avoid
the do_div() game. If 4 sec limit is too low, change ns resolution to
us. Again it would be my personal choice.
same 64bit divide problem in codel_dump()
u64 interval = q->interval;
do_div(interval, NSEC_PER_USEC);
opt.interval = interval;
1000000 in codel_init() should be replaced by NSEC_PER_MSEC
(neat constants from include/linux/time.h)
The "char data[16];" in struct codel_skb_cb is of no use, please remove
it.
You have non ASCII chars to code the ' char in comments ...
Please remove them.
Last: about the non-work-conserving stuff, this is because you must call
qdisc_tree_decrease_qlen(sch, number_of_packets_dropped) in
codel_dequeue(), or else your parents have no idea of what happened.
Note: Do that once (batched), not for each packet, because
qdisc_tree_decrease_qlen() is really expensive since it does a linear
search to find parents (we don't have pointers to parents). If your
machine has 1000 qdiscs, it's slow as hell.
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [Codel] [PATCH] codel take 2
2012-05-04 8:56 ` Eric Dumazet
@ 2012-05-04 10:27 ` Eric Dumazet
0 siblings, 0 replies; 3+ messages in thread
From: Eric Dumazet @ 2012-05-04 10:27 UTC (permalink / raw)
To: Dave Täht; +Cc: codel
On Fri, 2012-05-04 at 10:56 +0200, Eric Dumazet wrote:
> Last: about the non-work-conserving stuff, this is because you must call
> qdisc_tree_decrease_qlen(sch, number_of_packets_dropped) in
> codel_dequeue(), or else your parents have no idea of what happened.
>
> Note: Do that once (batched), not for each packet, because
> qdisc_tree_decrease_qlen() is really expensive since it does a linear
> search to find parents (we don't have pointers to parents). If your
> machine has 1000 qdiscs, it's slow as hell.
>
>
Here is my cleaned up version of the patch, compiled on x86_32, net-next
tree. Feel free to use/amend it.
Some work is needed to get the TCA_CODEL_xxx things right.
include/linux/pkt_sched.h | 29 ++
net/sched/Kconfig | 11 +
net/sched/Makefile | 1
net/sched/sch_codel.c | 360 ++++++++++++++++++++++++++++++++++++
4 files changed, 401 insertions(+)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index ffe975c..a4a7d04 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -655,4 +655,33 @@ struct tc_qfq_stats {
__u32 lmax;
};
+/* CODEL */
+
+enum {
+ TCA_CODEL_UNSPEC,
+ TCA_CODEL_PARMS,
+ TCA_CODEL_TARGET,
+ TCA_CODEL_LIMIT,
+ TCA_CODEL_MINBYTES,
+ TCA_CODEL_INTERVAL,
+ __TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1)
+#define TC_CODEL_ECN 1
+
+struct tc_codel_qopt {
+ __u32 flags; /* flags (e.g. ecn) */
+ __u32 target; /* max delay, in us */
+ __u32 limit; /* queue limit in packets */
+ __u32 minbytes; /* MTU (usually) */
+ __u32 interval; /* Sliding min time window width (us) */
+};
+
+struct tc_codel_stats {
+ __u64 drops;
+ __u64 marks;
+};
+
+
#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 75b58f8..fadd252 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
If unsure, say N.
+config NET_SCH_CODEL
+ tristate "Controlled Delay AQM (CODEL)"
+ help
+ Say Y here if you want to use the Controlled Delay (CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_codel.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8cdf4e2..30fab03 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index e69de29..91e7798 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,360 @@
+/*
+ * net/sched/sch_codel.c A Codel implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ns2 simulation code presented by Kathie Nichols
+ * Authors: Dave Täht <d@taht.net>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+#define MS2TIME(a) (ns_to_ktime( (u64) a * NSEC_PER_MSEC))
+#define DEFAULT_CODEL_LIMIT 1000
+
+/*
+ * Via patch found at:
+ * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html
+ * I don't know why this isn't in ktime.h as it seemed sane...
+*/
+
+/**
+ * ktime_compare - Compares two ktime_t variables
+ *
+ * Return val:
+ * lhs < rhs: < 0
+ * lhs == rhs: 0
+ * lhs > rhs: > 0
+ */
+
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+ if (lhs.tv64 < rhs.tv64)
+ return -1;
+ if (lhs.tv64 > rhs.tv64)
+ return 1;
+ return 0;
+}
+#else
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+ if (lhs.tv.sec < rhs.tv.sec)
+ return -1;
+ if (lhs.tv.sec > rhs.tv.sec)
+ return 1;
+ return lhs.tv.nsec - rhs.tv.nsec;
+}
+#endif
+
+/* Per-queue state (codel_queue_t instance variables) */
+
+struct codel_sched_data {
+ u32 flags;
+ u32 minbytes;
+ u32 count; /* packets dropped since we went into drop state */
+ bool dropping;
+ ktime_t target;
+ /* time to declare above q->target (0 if below)*/
+ ktime_t first_above_time;
+ ktime_t drop_next; /* time to drop next packet */
+ unsigned long interval;
+};
+
+struct codel_skb_cb {
+ ktime_t enqueue_time;
+};
+
+static unsigned int state1;
+static unsigned int state2;
+static unsigned int state3;
+static unsigned int state4;
+static unsigned int states;
+
+static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+ return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t get_enqueue_time(const struct sk_buff *skb)
+{
+ return get_codel_cb(skb)->enqueue_time;
+}
+
+static void set_enqueue_time(struct sk_buff *skb)
+{
+ get_codel_cb(skb)->enqueue_time = ktime_get();
+}
+
+static ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
+{
+ return ktime_add_ns(t, q->interval / int_sqrt(q->count));
+}
+
+/*
+static int codel_prob_mark(const struct codel_sched_data *q)
+{
+ return q->flags & TC_CODEL_ECN;
+}
+*/
+
+/* wrappers for ultimate statistics collection */
+
+static int codel_drop(struct sk_buff *skb, struct Qdisc *sch)
+{
+ pr_err("droppped packet\n");
+ return qdisc_drop(skb, sch);
+}
+
+/*
+static int codel_queue_drop(struct Qdisc *sch)
+{
+ return qdisc_drop(skb, sch);
+}
+*/
+
+struct sk_buff *codel_dequeue_head(struct Qdisc *sch)
+{
+ return qdisc_dequeue_head(sch);
+}
+
+bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ ktime_t sojourn_time;
+ bool drop = false;
+
+ if (!skb) {
+ q->first_above_time.tv64 = 0;
+ return false;
+ }
+ sojourn_time = ktime_sub(now, get_enqueue_time(skb));
+
+ if (ktime_compare(sojourn_time, q->target) < 0 ||
+ sch->qstats.backlog < q->minbytes) {
+ /* went below so we'll stay below for at least q->interval */
+ q->first_above_time.tv64 = 0;
+ } else {
+ if (q->first_above_time.tv64 == 0) {
+ /* just went above from below. If we stay above
+ * for at least q->interval we'll say it's ok to drop
+ */
+ q->first_above_time = ktime_add_ns(now, q->interval);
+ } else if (ktime_compare(now, q->first_above_time) >= 0) {
+ drop = true;
+ state1++;
+ }
+ }
+ return drop;
+}
+
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = codel_dequeue_head(sch);
+ ktime_t now;
+ unsigned int drop_count = 0;
+ bool drop;
+
+ if (!skb) {
+ q->dropping = false;
+ return skb;
+ }
+ now = ktime_get();
+ drop = should_drop(skb, sch, now);
+ if (q->dropping) {
+ if (!drop) {
+ /* sojourn time below target - leave dropping state */
+ q->dropping = false;
+ } else if (ktime_compare(now, q->drop_next) >=0) {
+ state2++;
+ /* It's time for the next drop. Drop the current packet
+ * and dequeue the next. The dequeue might take us
+ * out of dropping state. If not, schedule the next drop.
+ * A large backlog might result in drop rates so high
+ * that the next drop should happen now, hence the while loop.
+ */
+ while (q->dropping &&
+ (ktime_compare(now, q->drop_next) >= 0)) {
+ codel_drop(skb, sch);
+ drop_count++;
+ q->count++;
+ skb = codel_dequeue_head(sch);
+ if (!should_drop(skb, sch, now)) {
+ /* leave dropping state */
+ q->dropping = false;
+ } else {
+ /* and schedule the next drop */
+ q->drop_next =
+ control_law(q, q->drop_next);
+ }
+ }
+ }
+ } else if (drop &&
+ ((ktime_compare(ktime_sub(now, q->drop_next),
+ ns_to_ktime(16 * q->interval)) < 0) ||
+ (ktime_compare(ktime_sub(now, q->first_above_time),
+ ns_to_ktime(2 * q->interval)) >= 0 ))) {
+ codel_drop(skb, sch);
+ drop_count++;
+ skb = codel_dequeue_head(sch);
+ drop = should_drop(skb, sch, now);
+ q->dropping = true;
+ state3++;
+ /*
+ * if min went above target close to when we last went below it
+ * assume that the drop rate that controlled the queue on the
+ * last cycle is a good starting point to control it now.
+ */
+ if (ktime_compare(ktime_sub(now, q->drop_next),
+ ns_to_ktime(16 * q->interval)) < 0) {
+ q->count = q->count > 1 ? q->count - 1 : 1;
+ } else {
+ q->count = 1;
+ }
+ q->drop_next = control_law(q, now);
+ }
+ if ((states++ % 64) == 0) {
+ pr_debug("s1: %u, s2: %u, s3: %u, d4: %u\n", state1, state2, state3, state4);
+ }
+ if (drop_count)
+ qdisc_tree_decrease_qlen(sch, drop_count);
+ return skb;
+}
+
+
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+ set_enqueue_time(skb);
+ return qdisc_enqueue_tail(skb, sch);
+ }
+ return qdisc_reshape_fail(skb, sch);
+}
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_qopt *ctl = nla_data(opt);
+ unsigned int qlen;
+
+ if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+ if (ctl->limit && (ctl->limit < 2 || ctl->limit > 65536))
+ return -EINVAL;
+ if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
+ return -EINVAL;
+ sch_tree_lock(sch);
+ if (ctl->minbytes)
+ q->minbytes = ctl->minbytes;
+ if (ctl->flags)
+ q->flags = ctl->flags;
+ if (ctl->target)
+ q->target = ns_to_ktime((u64) ctl->target * 1000);
+ if (ctl->interval) {
+ u32 interval = min_t(u32, ~0U / NSEC_PER_USEC, ctl->interval);
+
+ q->interval = interval * NSEC_PER_USEC;
+ }
+ /* something saner than this for limit is probably needed */
+
+ if (ctl->limit)
+ sch->limit = ctl->limit;
+ qlen = sch->q.qlen;
+// while (sch->q.qlen > ctl->limit)
+// codel_drop(skb, sch);
+// qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //?
+ q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+ q->dropping = false;
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ q->target = MS2TIME(5);
+ sch->limit = DEFAULT_CODEL_LIMIT;
+ q->minbytes = psched_mtu(qdisc_dev(sch));
+ q->interval = 100 * NSEC_PER_MSEC;
+ q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+ q->dropping = false; /* exit dropping state */
+ /* if (!opt) { */
+ /* int err = codel_change(sch, opt); */
+ /* if (err) */
+ /* return err; */
+ /* } */
+
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_qopt opt;
+
+ opt.target = (u32) ktime_to_us(q->target);
+ opt.interval = q->interval / NSEC_PER_USEC;
+ opt.limit = sch->limit;
+ opt.minbytes = q->minbytes;
+ opt.flags = q->flags;
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = codel_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+ .id = "codel",
+ .priv_size = sizeof(struct codel_sched_data),
+ .enqueue = codel_enqueue,
+ .dequeue = codel_dequeue,
+ .peek = qdisc_peek_head,
+/* .drop = codel_queue_drop, */
+ .init = codel_init,
+ .reset = codel_reset,
+ .change = codel_change,
+ .dump = codel_dump,
+ .owner = THIS_MODULE,
+};
+EXPORT_SYMBOL(codel_qdisc_ops);
+
+static int __init codel_module_init(void)
+{
+ return register_qdisc(&codel_qdisc_ops);
+}
+static void __exit codel_module_exit(void)
+{
+ unregister_qdisc(&codel_qdisc_ops);
+}
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+MODULE_LICENSE("GPL");
+
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2012-05-04 10:28 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-04 7:29 [Codel] [PATCH] codel take 2 Dave Täht
2012-05-04 8:56 ` Eric Dumazet
2012-05-04 10:27 ` Eric Dumazet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox