scaled for 16 bits of precision and added a cache

I've been testing with ethernet scaled down to 100Mbit so I can get
packet drops easily...

ethtool -K eth0 tso off
ethtool -K eth0 gso off
ethtool -s eth0 advertise 0x008

It's too early to benchmark seriously, but with a single netperf stream
ping RTTs are ~8-11ms, versus the 40-netperf results below. With
pfifo_fast and a txqueuelen of 1000, ping RTT can hit 600ms and TCP_RR
falls below 2 transactions/s with two competing streams. With this
codel implementation the ping average stays good (2.5ms is typical) and
TCP_RR is ~25 with two competing streams. Sharing among multiple
streams is excellent:

100 packets transmitted, 97 received, 3% packet loss, time 99159ms
rtt min/avg/max/mdev = 2.321/25.837/86.491/8.383 ms

Feel free to substitute your own servers and ping targets:

SERVER=huchra.bufferbloat.net

for i in `seq 1 10`
do
	netperf -4 -l 120 -H $SERVER -t TCP_MAERTS &
	netperf -4 -l 120 -H $SERVER -t TCP_STREAM &
	netperf -6 -l 120 -H $SERVER -t TCP_MAERTS &
	netperf -6 -l 120 -H $SERVER -t TCP_STREAM &
done

netperf -4 -l 60 -H $SERVER -t TCP_RR &
netperf -6 -l 60 -H $SERVER -t TCP_RR &
ping -c 100 8.8.8.8
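(To actually attach this for testing you need an iproute2 tc patched to
fill in struct tc_codel_qopt - that patch is not included here - after
which something like "tc qdisc replace dev eth0 root codel" should do,
using the defaults below of a 5ms target and 100ms interval. A
user-space sanity check of the control-law arithmetic follows the
patch.)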
---
 include/linux/pkt_sched.h |   29 ++++
 net/sched/Kconfig         |   11 ++
 net/sched/Makefile        |    1 +
 net/sched/sch_codel.c     |  398 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 439 insertions(+)
 create mode 100644 net/sched/sch_codel.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 0d5b793..7d50687 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -633,4 +633,33 @@ struct tc_qfq_stats {
 	__u32 lmax;
 };
 
+/* CODEL */
+
+enum {
+	TCA_CODEL_UNSPEC,
+	TCA_CODEL_PARMS,
+	TCA_CODEL_TARGET,
+	TCA_CODEL_LIMIT,
+	TCA_CODEL_MINBYTES,
+	TCA_CODEL_INTERVAL,
+	__TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
+#define TC_CODEL_ECN	1
+
+struct tc_codel_qopt {
+	__u32 flags;	/* flags (e.g. ecn) */
+	__u32 target;	/* max delay, in us */
+	__u32 limit;	/* queue limit in packets */
+	__u32 minbytes;	/* MTU (usually) */
+	__u32 interval;	/* sliding min time window width (us) */
+};
+
+struct tc_codel_stats {
+	__u64 drops;
+	__u64 marks;
+};
+
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91..8106c42 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
 
 	  If unsure, say N.
 
+config NET_SCH_CODEL
+	tristate "Controlled Delay AQM (CODEL)"
+	help
+	  Say Y here if you want to use the Controlled Delay (CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_codel.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c..41130b5 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 0000000..636f505
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,398 @@
+/*
+ * net/sched/sch_codel.c	A Codel implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Codel, the COntrolled DELay Queueing discipline
+ * Based on ns2 simulation code presented by Kathie Nichols
+ * Authors:	Dave Täht, Eric Dumazet
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/ktime.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#define MS2TIME(a)	ns_to_ktime((u64)(a) * NSEC_PER_MSEC)
+#define DEFAULT_CODEL_LIMIT 1000
+#define PRECALC_MAX 64
+
+/*
+ * Via patch found at:
+ * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html
+ * I don't know why this isn't in ktime.h, as it seems sane...
+ */
+
+/*
+ * ktime_compare - compare two ktime_t variables
+ *
+ * Return val:
+ *	lhs  < rhs: < 0
+ *	lhs == rhs: 0
+ *	lhs  > rhs: > 0
+ */
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+	if (lhs.tv64 < rhs.tv64)
+		return -1;
+	if (lhs.tv64 > rhs.tv64)
+		return 1;
+	return 0;
+}
+#else
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+	if (lhs.tv.sec < rhs.tv.sec)
+		return -1;
+	if (lhs.tv.sec > rhs.tv.sec)
+		return 1;
+	return lhs.tv.nsec - rhs.tv.nsec;
+}
+#endif
+
+/* Per-queue state (codel_queue_t instance variables) */
+struct codel_sched_data {
+	u32 flags;
+	u32 minbytes;
+	u32 count;	/* packets dropped since we went into drop state */
+	bool dropping;
+	ktime_t target;
+	/* time to declare above q->target (0 if below) */
+	ktime_t first_above_time;
+	ktime_t drop_next;	/* time to drop next packet */
+	unsigned long interval;	/* sliding min window width (ns) */
+	u32 precalc[PRECALC_MAX];	/* interval / sqrt(count) cache */
+};
+
+struct codel_skb_cb {
+	ktime_t enqueue_time;
+};
+
+/* temporary debug instrumentation: drop-state transition counters */
+static unsigned int state1;
+static unsigned int state2;
+static unsigned int state3;
+static unsigned int states;
+
+/*
+ * return interval/sqrt(x) with good precision
+ */
+static u32 calc(u64 interval, unsigned long x)
+{
+	/* scale x up for 16 bits of precision in int_sqrt(); each
+	 * x <<= 2 doubles sqrt(x), so interval <<= 1 compensates
+	 */
+	while (x < (1UL << 30)) {
+		x <<= 2;
+		interval <<= 1;
+	}
+	do_div(interval, int_sqrt(x));
+	return (u32)interval;
+}
+
+static void codel_fill_cache(struct codel_sched_data *q)
+{
+	int i;
+
+	q->precalc[0] = q->interval;
+	for (i = 2; i <= PRECALC_MAX; i++)
+		q->precalc[i - 1] = calc(q->interval, i);
+}
+
+static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t get_enqueue_time(const struct sk_buff *skb)
+{
+	return get_codel_cb(skb)->enqueue_time;
+}
+
+static void set_enqueue_time(struct sk_buff *skb)
+{
+	get_codel_cb(skb)->enqueue_time = ktime_get();
+}
+
+/*
+ * The original control_law required floating point.
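+ * To avoid that in the kernel, interval/sqrt(count) is precomputed
+ * for count = 1..PRECALC_MAX by codel_fill_cache(), and calc()
+ * handles the (rare) larger counts with integer math only.
+ * The original expression was: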
+ *
+ *	return ktime_add_ns(t, q->interval / sqrt(q->count));
+ *
+ */
+static ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
+{
+	if (unlikely(q->count > PRECALC_MAX))
+		return ktime_add_ns(t, calc(q->interval, q->count));
+	return ktime_add_ns(t, q->precalc[q->count - 1]);
+}
+
+/*
+static int codel_prob_mark(const struct codel_sched_data *q)
+{
+	return q->flags & TC_CODEL_ECN;
+}
+*/
+
+/* wrappers for ultimate statistics collection */
+static int codel_drop(struct sk_buff *skb, struct Qdisc *sch)
+{
+	return qdisc_drop(skb, sch);
+}
+
+/*
+static int codel_queue_drop(struct Qdisc *sch)
+{
+	return qdisc_drop(skb, sch);
+}
+*/
+
+static struct sk_buff *codel_dequeue_head(struct Qdisc *sch)
+{
+	return qdisc_dequeue_head(sch);
+}
+
+static bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	ktime_t sojourn_time;
+	bool drop = false;
+
+	if (!skb) {
+		q->first_above_time.tv64 = 0;
+		return false;
+	}
+	sojourn_time = ktime_sub(now, get_enqueue_time(skb));
+
+	if (ktime_compare(sojourn_time, q->target) < 0 ||
+	    sch->qstats.backlog < q->minbytes) {
+		/* went below, so we'll stay below for at least q->interval */
+		q->first_above_time.tv64 = 0;
+	} else {
+		if (q->first_above_time.tv64 == 0) {
+			/* just went above from below. If we stay above
+			 * for at least q->interval we'll say it's ok to drop
+			 */
+			q->first_above_time = ktime_add_ns(now, q->interval);
+		} else if (ktime_compare(now, q->first_above_time) >= 0) {
+			drop = true;
+			state1++;
+		}
+	}
+	return drop;
+}
+
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = codel_dequeue_head(sch);
+	ktime_t now;
+	unsigned int drop_count = 0;
+	bool drop;
+
+	if (!skb) {
+		q->dropping = false;
+		return skb;
+	}
+	now = ktime_get();
+	drop = should_drop(skb, sch, now);
+	if (q->dropping) {
+		if (!drop) {
+			/* sojourn time below target - leave dropping state */
+			q->dropping = false;
+		} else if (ktime_compare(now, q->drop_next) >= 0) {
+			state2++;
+			/* It's time for the next drop. Drop the current
+			 * packet and dequeue the next. The dequeue might
+			 * take us out of dropping state.
+			 * If not, schedule the next drop.
+			 * A large backlog might result in drop rates so high
+			 * that the next drop should happen now,
+			 * hence the while loop.
+			 */
+			while (q->dropping &&
+			       (ktime_compare(now, q->drop_next) >= 0)) {
+				codel_drop(skb, sch);
+				drop_count++;
+				q->count++;
+				skb = codel_dequeue_head(sch);
+				if (!should_drop(skb, sch, now)) {
+					/* leave dropping state */
+					q->dropping = false;
+				} else {
+					/* and schedule the next drop */
+					q->drop_next =
+						control_law(q, q->drop_next);
+				}
+			}
+		}
+	} else if (drop &&
+		   ((ktime_compare(ktime_sub(now, q->drop_next),
+				   ns_to_ktime(16 * q->interval)) < 0) ||
+		    (ktime_compare(ktime_sub(now, q->first_above_time),
+				   ns_to_ktime(2 * q->interval)) >= 0))) {
+		codel_drop(skb, sch);
+		drop_count++;
+		skb = codel_dequeue_head(sch);
+		drop = should_drop(skb, sch, now);
+		q->dropping = true;
+		state3++;
+		/*
+		 * If the min went above target close to when it last went
+		 * below, assume that the drop rate that controlled the
+		 * queue on the last cycle is a good starting point to
+		 * control it now.
+		 */
+		if (ktime_compare(ktime_sub(now, q->drop_next),
+				  ns_to_ktime(16 * q->interval)) < 0) {
+			q->count = q->count > 1 ? q->count - 1 : 1;
+		} else {
+			q->count = 1;
+		}
+		q->drop_next = control_law(q, now);
+	}
+	if ((states++ % 64) == 0) {
+		pr_debug("s1: %u, s2: %u, s3: %u\n",
+			 state1, state2, state3);
+	}
+	if (drop_count)
+		qdisc_tree_decrease_qlen(sch, drop_count);
+	return skb;
+}
+
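+/*
+ * Tail-enqueue, timestamping each packet on arrival; the stamp is
+ * what lets codel_dequeue() compute per-packet sojourn time later.
+ */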
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+		set_enqueue_time(skb);
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	return qdisc_reshape_fail(skb, sch);
+}
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct tc_codel_qopt *ctl;
+	unsigned int qlen;
+
+	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+		return -EINVAL;
+	ctl = nla_data(opt);
+	pr_debug("minbytes: %u, target: %u, limit: %u, flags: %u\n",
+		 ctl->minbytes, ctl->target, ctl->limit, ctl->flags);
+	if (ctl->limit && (ctl->limit < 2 || ctl->limit > 65536))
+		return -EINVAL;
+	if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
+		return -EINVAL;
+	sch_tree_lock(sch);
+	if (ctl->minbytes)
+		q->minbytes = ctl->minbytes;
+	if (ctl->flags)
+		q->flags = ctl->flags;
+	if (ctl->target)
+		q->target = ns_to_ktime((u64)ctl->target * NSEC_PER_USEC);
+	if (ctl->interval) {
+		u32 interval = min_t(u32, ~0U / NSEC_PER_USEC, ctl->interval);
+
+		q->interval = interval * NSEC_PER_USEC;
+		codel_fill_cache(q);
+	}
+	if (ctl->limit)
+		sch->limit = ctl->limit;
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit)
+		qdisc_drop(qdisc_dequeue_head(sch), sch);
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+	q->dropping = false;	/* exit dropping state */
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	q->target = MS2TIME(5);
+	/* It should be possible to run with no limit, actually */
+	sch->limit = DEFAULT_CODEL_LIMIT;
+	q->minbytes = psched_mtu(qdisc_dev(sch));
+	q->interval = 100 * NSEC_PER_MSEC;
+	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+	q->dropping = false;	/* exit dropping state */
+	codel_fill_cache(q);
+
+	if (opt) {
+		int err = codel_change(sch, opt);
+
+		if (err)
+			return err;
+	}
+
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct tc_codel_qopt opt;
+
+	opt.target = (u32)ktime_to_us(q->target);
+	opt.interval = q->interval / NSEC_PER_USEC;
+	opt.limit = sch->limit;
+	opt.minbytes = q->minbytes;
+	opt.flags = q->flags;
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	while ((skb = codel_dequeue(sch)) != NULL)
+		kfree_skb(skb);
+}
+
+struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+	.id		=	"codel",
+	.priv_size	=	sizeof(struct codel_sched_data),
+	.enqueue	=	codel_enqueue,
+	.dequeue	=	codel_dequeue,
+	.peek		=	qdisc_peek_head,
+/*	.drop		=	codel_queue_drop, */
+	.init		=	codel_init,
+	.reset		=	codel_reset,
+	.change		=	codel_change,
+	.dump		=	codel_dump,
+	.owner		=	THIS_MODULE,
+};
+EXPORT_SYMBOL(codel_qdisc_ops);
+
+static int __init codel_module_init(void)
+{
+	return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+	unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init);
+module_exit(codel_module_exit);
+MODULE_LICENSE("GPL");
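Appendix, for the curious: the control-law arithmetic above is easy to
sanity-check from user space. Here's a throwaway sketch (call it
codel_calc_check.c; it is not part of the patch, and its int_sqrt() is
modeled on the kernel's lib/int_sqrt.c) comparing the scaled integer
interval/sqrt(count) approximation against plain floating point. For a
100ms interval the two should agree to within a few microseconds across
the whole precalc range:

/* codel_calc_check.c: compare the integer interval/sqrt(count)
 * approximation used by sch_codel's calc() with floating point.
 * Build: gcc -O2 -o codel_calc_check codel_calc_check.c -lm
 */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* modeled on the kernel's lib/int_sqrt.c */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long op = x, res = 0, one;

	one = 1UL << (BITS_PER_LONG - 2);
	while (one > op)
		one >>= 2;

	while (one != 0) {
		if (op >= res + one) {
			op -= res + one;
			res += one;
		}
		res >>= 1;
		one >>= 2;
	}
	return res;
}

/* same logic as calc() in the patch: every x <<= 2 doubles sqrt(x),
 * so interval <<= 1 keeps the quotient unchanged while int_sqrt()
 * operates on a value with at least 16 significant bits
 */
static uint32_t calc(uint64_t interval, unsigned long x)
{
	while (x < (1UL << 30)) {
		x <<= 2;
		interval <<= 1;
	}
	return (uint32_t)(interval / int_sqrt(x));
}

int main(void)
{
	uint64_t interval = 100ULL * 1000 * 1000; /* 100ms, in ns */
	unsigned long count;

	for (count = 1; count <= 70; count++)
		printf("count=%2lu integer=%u float=%.0f\n", count,
		       calc(interval, count),
		       (double)interval / sqrt((double)count));
	return 0;
}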
-- 
1.7.9.5