From mboxrd@z Thu Jan 1 00:00:00 1970
Received: by huchra.bufferbloat.net (Postfix, from userid 1000)
	id B1FCA202213; Sun, 6 May 2012 22:35:58 -0700 (PDT)
From: =?UTF-8?q?Dave=20T=C3=A4ht?=
To: codel@lists.bufferbloat.net
Date: Sun, 6 May 2012 22:35:57 -0700
Message-Id: <1336368957-17586-1-git-send-email-dave.taht@bufferbloat.net>
X-Mailer: git-send-email 1.7.1
Cc: =?UTF-8?q?Dave=20T=C3=A4ht?=
Subject: [Codel] [PATCH v9] codel: Controlled Delay AQM
X-BeenThere: codel@lists.bufferbloat.net
X-Mailman-Version: 2.1.13
Precedence: list
List-Id: CoDel AQM discussions
X-List-Received-Date: Mon, 07 May 2012 05:35:59 -0000

This version (9) adds support for several forms of count decrease in
s3 (re-entry into the dropping state), selectable via the module
parameters gentle and decrease_method. It defaults to the algorithm as
described in the original presentation.

v1: Original implementation - Dave Taht
v2: Working code - Corrections for ktime - Dave Taht
v3: 32 bit support and port to net-next - Eric Dumazet
v4: 16 bit precision for inv sqrt and cache - Dave Taht
v5: Kernel cleanup and full precision - Eric Dumazet
v6: Dump Stats support added - Eric Dumazet
v7: Complete rewrite for u32 values - Eric Dumazet
v8: Stats and timing added, 64 bit prescale improved - Eric Dumazet
v9: debated functionality moved to isolated routine - Dave Taht
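For anyone comparing the variants, here is a throwaway userspace sketch
(not part of the patch; rescale() below merely mirrors the kernel's
count_rescale() for methods 0-2, and method 3 currently behaves like the
default). It prints the control-law spacing interval/sqrt(count) and what
each decrease_method does to a count of 32. Link with -lm. The non-default
behaviours can then be picked at load time, e.g.
"modprobe sch_codel gentle=1 decrease_method=2".

/* toy harness, illustrative only */
#include <stdio.h>
#include <math.h>

static unsigned int rescale(unsigned int count, int method)
{
	unsigned int c;

	if (count < 2)
		return 1;
	switch (method) {
	case 2:		/* Dumazet 2: halve the count */
		c = count >> 1;
		break;
	case 1:		/* Dumazet 1: shave off ~1/16th, at least 1 */
		c = count - (count >> 4);
		if (c > count - 1)
			c = count - 1;
		break;
	case 0:		/* CoDel paper default: decrement by one */
	default:
		c = count - 1;
	}
	return c > 1 ? c : 1;
}

int main(void)
{
	const double interval_ms = 100.0;	/* default interval */
	unsigned int count;
	int m;

	for (count = 1; count <= 64; count *= 4)
		printf("count=%2u -> next drop in %5.1f ms\n",
		       count, interval_ms / sqrt(count));
	for (m = 0; m <= 2; m++)
		printf("decrease_method=%d rescales count 32 to %u\n",
		       m, rescale(32, m));
	return 0;
}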
---
 include/linux/pkt_sched.h |   25 +++
 net/sched/Kconfig         |   11 ++
 net/sched/Makefile        |    1 +
 net/sched/sch_codel.c     |  463 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 500 insertions(+)
 create mode 100644 net/sched/sch_codel.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 0d5b793..f62141e 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -633,4 +633,29 @@ struct tc_qfq_stats {
 	__u32 lmax;
 };
 
+/* CODEL */
+
+enum {
+	TCA_CODEL_UNSPEC,
+	TCA_CODEL_TARGET,
+	TCA_CODEL_LIMIT,
+	TCA_CODEL_MINBYTES,
+	TCA_CODEL_INTERVAL,
+	__TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
+
+struct tc_codel_xstats {
+	__u32	count;
+	__u32	delay;		/* time elapsed since next packet was queued (in us) */
+	__u32	drop_next;
+	__u32	drop_overlimit;
+	__u32	dropping;
+	__u32	state1;
+	__u32	state2;
+	__u32	state3;
+	__u32	states;
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91..8106c42 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
 
 	  If unsure, say N.
 
+config NET_SCH_CODEL
+	tristate "Controlled Delay AQM (CODEL)"
+	help
+	  Say Y here if you want to use the Controlled Delay (CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_codel.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c..41130b5 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 0000000..a9e6383
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,463 @@
+/*
+ * net/sched/sch_codel.c	A Codel implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Codel, the COntrolled DELay Queueing discipline
+ * Based on ns2 simulation code presented by Kathie Nichols
+ *
+ * Authors:	Dave Täht
+ *		Eric Dumazet
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+/*
+ * codel uses a 1024 nsec clock, encoded in u32
+ */
+typedef u32 codel_time_t;
+#define CODEL_SHIFT 10
+
+static u32 gentle = 0;
+static u32 decrease_method = 0;
+module_param(gentle, uint, 0644);
+module_param(decrease_method, uint, 0644);
+MODULE_PARM_DESC(gentle, "Gently increment count in massive drop state");
+MODULE_PARM_DESC(decrease_method, "Various means of decreasing count");
+
+static codel_time_t codel_get_time(void)
+{
+	u64 ns = ktime_to_ns(ktime_get());
+
+	return ns >> CODEL_SHIFT;
+}
+
+#define codel_time_after(a, b)		((int)(a) - (int)(b) > 0)
+#define codel_time_after_eq(a, b)	((int)(a) - (int)(b) >= 0)
+#define codel_time_before(a, b)		((int)(a) - (int)(b) < 0)
+#define codel_time_before_eq(a, b)	((int)(a) - (int)(b) <= 0)
+
+#define MS2TIME(a) (((a) * NSEC_PER_MSEC) >> CODEL_SHIFT)
+
+#define DEFAULT_CODEL_LIMIT 1000
+
+/* Per-queue state (codel_queue_t instance variables) */
+
+struct codel_sched_data {
+	u32		minbytes;
+	u32		interval;
+	codel_time_t	target;
+
+	u32		count;	/* packets dropped since entering drop state */
+	u32		drop_count;
+	bool		dropping;
+	/* time to declare above q->target (0 if below) */
+	codel_time_t	first_above_time;
+	codel_time_t	drop_next;	/* time to drop next packet */
+
+	u32		state1;
+	u32		state2;
+	u32		state3;
+	u32		states;
+	u32		drop_overlimit;
+};
+
+struct codel_skb_cb {
+	codel_time_t enqueue_time;
+};
+
+/*
+ * return interval/sqrt(x) with good precision
+ */
+static u32 calc(u32 _interval, u32 _x)
+{
+	u64 interval = _interval;
+	unsigned long x = _x;
+
+	/* scale operands for max precision
+	 * On 64bit arches, we can prescale x by 32bits
+	 */
+	if (BITS_PER_LONG == 64) {
+		x <<= 32;
+		interval <<= 16;
+	}
+	while (x < (1UL << (BITS_PER_LONG - 2))) {
+		x <<= 2;
+		interval <<= 1;
+	}
+	do_div(interval, int_sqrt(x));
+	return (u32)interval;
+}
+
+static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static codel_time_t get_enqueue_time(const struct sk_buff *skb)
+{
+	return get_codel_cb(skb)->enqueue_time;
+}
+
+static void set_enqueue_time(struct sk_buff *skb)
+{
+	get_codel_cb(skb)->enqueue_time = codel_get_time();
+}
+
+static codel_time_t control_law(const struct codel_sched_data *q,
+				codel_time_t t)
+{
+	return t + calc(q->interval, q->count);
+}
+
+static bool should_drop(struct sk_buff *skb, struct Qdisc *sch,
+			codel_time_t now)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	codel_time_t sojourn_time;
+	bool drop;
+
+	if (!skb) {
+		q->first_above_time = 0;
+		return false;
+	}
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	sojourn_time = now - get_enqueue_time(skb);
+
+	if (codel_time_before(sojourn_time, q->target) ||
+	    sch->qstats.backlog < q->minbytes) {
+		/* went below so we'll stay below for at least q->interval */
+		q->first_above_time = 0;
+		return false;
+	}
+	drop = false;
+	if (q->first_above_time == 0) {
+		/* just went above from below. If we stay above
+		 * for at least q->interval we'll say it's ok to drop
+		 */
+		q->first_above_time = now + q->interval;
+	} else if (codel_time_after(now, q->first_above_time)) {
+		drop = true;
+		q->state1++;
+	}
+	return drop;
+}
+
+static void codel_drop(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	qdisc_drop(skb, sch);
+	q->drop_count++;
+}
+
+/*
+ * if min went above target close to when we last went below,
+ * assume that some drop rate near that controlled the queue on the
+ * last cycle is a good starting point to control it now.
+ *
+ * Since there is debate about it right now, we try a few
+ * different methods.
+ */
+static u32 count_rescale(struct codel_sched_data *q, codel_time_t now)
+{
+	u32 c = 1;
+
+	if (q->count < 2)
+		return 1;
+
+	if (codel_time_after(now - q->drop_next, 16 * q->interval)) {
+		switch (decrease_method) {
+		case 3: /* Taht 1 (not yet what I have in mind) */
+			c = q->count - 1;
+			break;
+		case 2: /* Dumazet 2 */
+			c = q->count >> 1;
+			break;
+		case 1: /* Dumazet 1 */
+			c = min(q->count - 1,
+				q->count - (q->count >> 4));
+			break;
+		case 0: /* Codel Paper Default */
+		default:
+			c = q->count - 1;
+		}
+		c = max(1U, c);
+	}
+	return (u32)c;
+}
+
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = __skb_dequeue(&sch->q);
+	codel_time_t now;
+	bool drop;
+
+	if (!skb) {
+		q->dropping = false;
+		return skb;
+	}
+	now = codel_get_time();
+	drop = should_drop(skb, sch, now);
+	if (q->dropping) {
+		if (!drop) {
+			/* sojourn time below target - leave dropping state */
+			q->dropping = false;
+		} else if (codel_time_after_eq(now, q->drop_next)) {
+			q->state2++;
+			/* It's time for the next drop. Drop the current
+			 * packet and dequeue the next. The dequeue might
+			 * take us out of dropping state.
+			 * If not, schedule the next drop.
+			 * A large backlog might result in drop rates so high
+			 * that the next drop should happen now,
+			 * hence the while loop.
+			 */
+			if (gentle)
+				q->count++;
+			while (q->dropping &&
+			       codel_time_after_eq(now, q->drop_next)) {
+				codel_drop(sch, skb);
+				if (!gentle)
+					q->count++;
+				skb = __skb_dequeue(&sch->q);
+				if (!should_drop(skb, sch, now)) {
+					/* leave dropping state */
+					q->dropping = false;
+				} else {
+					/* and schedule the next drop */
+					q->drop_next =
+						control_law(q, q->drop_next);
+				}
+			}
+		}
+	} else if (drop &&
+		   (codel_time_before(now - q->drop_next,
+				      16 * q->interval) ||
+		    codel_time_after_eq(now - q->first_above_time,
+					2 * q->interval))) {
+		codel_drop(sch, skb);
+		skb = __skb_dequeue(&sch->q);
+		drop = should_drop(skb, sch, now);
+		q->dropping = true;
+		q->state3++;
+		/*
+		 * if min went above target close to when we last went below,
+		 * assume that the drop rate that controlled the queue on the
+		 * last cycle is a good starting point to control it now.
+		 * Since there is debate about it right now, punt.
+		 */
+		q->count = count_rescale(q, now);
+		q->drop_next = control_law(q, now);
+	}
+	q->states++;
+	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
+	 * or HTB crashes. Defer it for next round.
+	 */
+	if (q->drop_count && sch->q.qlen) {
+		qdisc_tree_decrease_qlen(sch, q->drop_count);
+		q->drop_count = 0;
+	}
+	if (skb)
+		qdisc_bstats_update(sch, skb);
+	return skb;
+}
+
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct codel_sched_data *q;
+
+	if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+		set_enqueue_time(skb);
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	q = qdisc_priv(sch);
+	q->drop_overlimit++;
+	return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+	[TCA_CODEL_TARGET]	= { .type = NLA_U32 },
+	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 },
+	[TCA_CODEL_MINBYTES]	= { .type = NLA_U32 },
+	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CODEL_MAX + 1];
+	unsigned int qlen;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+	if (tb[TCA_CODEL_TARGET]) {
+		u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+		q->target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+	if (tb[TCA_CODEL_INTERVAL]) {
+		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+		q->interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+	if (tb[TCA_CODEL_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+	if (tb[TCA_CODEL_MINBYTES])
+		q->minbytes = nla_get_u32(tb[TCA_CODEL_MINBYTES]);
+
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_drop(skb, sch);
+	}
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	q->drop_next = q->first_above_time = 0;
+	q->dropping = false;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	q->target = MS2TIME(5);
+	/* It should be possible to run with no limit,
+	 * with infinite memory
+	 */
+	sch->limit = DEFAULT_CODEL_LIMIT;
+	q->minbytes = psched_mtu(qdisc_dev(sch));
+	q->interval = MS2TIME(100);
+	q->drop_next = q->first_above_time = 0;
+	q->dropping = false; /* exit dropping state */
+	q->count = 1;
+	if (opt) {
+		int err = codel_change(sch, opt);
+
+		if (err)
+			return err;
+	}
+
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+static u32 codel_time_to_us(codel_time_t val)
+{
+	u64 valns = ((u64)val << CODEL_SHIFT);
+
+	do_div(valns, NSEC_PER_USEC);
+	return (u32)valns;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_CODEL_TARGET, codel_time_to_us(q->target)) ||
+	    nla_put_u32(skb, TCA_CODEL_LIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_CODEL_INTERVAL, codel_time_to_us(q->interval)) ||
+	    nla_put_u32(skb, TCA_CODEL_MINBYTES, q->minbytes))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = skb_peek(&sch->q);
+	codel_time_t now = codel_get_time();
+	struct tc_codel_xstats st = {
+		.count		= q->count,
+		.state1		= q->state1,
+		.state2		= q->state2,
+		.state3		= q->state3,
+		.states		= q->states,
+		.drop_overlimit	= q->drop_overlimit,
+		.delay		= skb ? now - get_enqueue_time(skb) : 0,
+		.drop_next	= q->drop_next ? q->drop_next - now : 0,
+		.dropping	= q->dropping,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset_queue(sch);
+	sch->q.qlen = 0;
+	q->dropping = false;
+	q->count = 1;
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+	.id		= "codel",
+	.priv_size	= sizeof(struct codel_sched_data),
+
+	.enqueue	= codel_enqueue,
+	.dequeue	= codel_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.init		= codel_init,
+	.reset		= codel_reset,
+	.change		= codel_change,
+	.dump		= codel_dump,
+	.dump_stats	= codel_dump_stats,
+	.owner		= THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+	return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+	unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+
+MODULE_DESCRIPTION("Controlled Delay queue discipline");
+MODULE_AUTHOR("Dave Taht");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
-- 
1.7.9.5