From: Eric Dumazet
To: Dave Taht
Cc: codel@lists.bufferbloat.net, Dave Täht
Subject: [Codel] [PATCH v6] pkt_sched: codel: Controlled Delay AQM
Date: Sat, 05 May 2012 23:28:50 +0200
Message-ID: <1336253330.3752.564.camel@edumazet-glaptop>
In-Reply-To: <1336252832.3752.563.camel@edumazet-glaptop>
References: <1336217671-20384-1-git-send-email-dave.taht@bufferbloat.net>
 <1336218794.3752.508.camel@edumazet-glaptop>
 <1336229343.3752.516.camel@edumazet-glaptop>
 <1336249251.3752.558.camel@edumazet-glaptop>
 <1336250168.3752.560.camel@edumazet-glaptop>
 <1336252281.3752.561.camel@edumazet-glaptop>
 <4FA597C0.7090206@gmail.com>
 <1336252832.3752.563.camel@edumazet-glaptop>

 include/linux/pkt_sched.h |   19 +
 net/sched/Kconfig         |   11
 net/sched/Makefile        |    1
 net/sched/sch_codel.c     |  414 ++++++++++++++++++++++++++++++++++++
 4 files changed, 445 insertions(+)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index ffe975c..420ea95 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -655,4 +655,23 @@ struct tc_qfq_stats {
 	__u32 lmax;
 };
 
+/* CODEL */
+
+enum {
+	TCA_CODEL_UNSPEC,
+	TCA_CODEL_TARGET,
+	TCA_CODEL_LIMIT,
+	TCA_CODEL_MINBYTES,
+	TCA_CODEL_INTERVAL,
+	__TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
+
+struct tc_codel_xstats {
+	__u32	count;
+	__u32	delay;	/* sojourn time (us) of the packet at the head of the queue */
+	__u32	drop_next;
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 75b58f8..fadd252 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
 
 	  If unsure, say N.
 
+config NET_SCH_CODEL
+	tristate "Controlled Delay AQM (CODEL)"
+	help
+	  Say Y here if you want to use the Controlled Delay (CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_codel.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8cdf4e2..30fab03 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 0000000..2e938c1
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,414 @@
+/*
+ * net/sched/sch_codel.c	A Codel implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Codel, the COntrolled DELay Queueing discipline
+ * Based on ns2 simulation code presented by Kathie Nichols
+ *
+ * Authors:	Dave Täht
+ *		Eric Dumazet
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+#define MS2TIME(a)	ns_to_ktime((u64)(a) * NSEC_PER_MSEC)
+#define DEFAULT_CODEL_LIMIT 1000
+
+/*
+ * Via patch found at:
+ * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html
+ * I don't know why this isn't in ktime.h as it seemed sane...
+ */
+
+/*
+ * ktime_compare - Compares two ktime_t variables
+ *
+ * Return val:
+ *	lhs  < rhs: < 0
+ *	lhs == rhs: 0
+ *	lhs  > rhs: > 0
+ */
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+	if (lhs.tv64 < rhs.tv64)
+		return -1;
+	if (lhs.tv64 > rhs.tv64)
+		return 1;
+	return 0;
+}
+#else
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+	if (lhs.tv.sec < rhs.tv.sec)
+		return -1;
+	if (lhs.tv.sec > rhs.tv.sec)
+		return 1;
+	return lhs.tv.nsec - rhs.tv.nsec;
+}
+#endif
+
+/* Per-queue state (codel_queue_t instance variables) */
+
+struct codel_sched_data {
+	u32	minbytes;
+	u32	count;	/* packets dropped since we went into drop state */
+	u32	drop_count;
+	bool	dropping;
+	ktime_t	target;
+	/* time to declare above q->target (0 if below) */
+	ktime_t	first_above_time;
+	ktime_t	drop_next;	/* time to drop next packet */
+	ktime_t	interval16;
+	u32	interval;
+};
+
+struct codel_skb_cb {
+	ktime_t enqueue_time;
+};
+
+static unsigned int state1;
+static unsigned int state2;
+static unsigned int state3;
+static unsigned int states;
+
+/*
+ * return interval/sqrt(x) with good precision
+ */
+static u32 calc(u32 _interval, unsigned long x)
+{
+	u64 interval = _interval;
+
+	/* scale operands for max precision */
+	while (x < (1UL << (BITS_PER_LONG - 2))) {
+		x <<= 2;
+		interval <<= 1;
+	}
+	do_div(interval, int_sqrt(x));
+	return (u32)interval;
+}
+
+static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t get_enqueue_time(const struct sk_buff *skb)
+{
+	return get_codel_cb(skb)->enqueue_time;
+}
+
+static void set_enqueue_time(struct sk_buff *skb)
+{
+	get_codel_cb(skb)->enqueue_time = ktime_get();
+}
+
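+/*
+ * CoDel's control law: the next drop is scheduled interval/sqrt(count)
+ * after time t, so while we remain in the dropping state the drop rate
+ * ramps up gently until the sojourn time falls back below target.
+ */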
+static ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
+{
+	return ktime_add_ns(t, calc(q->interval, q->count));
+}
+
+static bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	ktime_t sojourn_time;
+	bool drop;
+
+	if (!skb) {
+		q->first_above_time.tv64 = 0;
+		return false;
+	}
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	sojourn_time = ktime_sub(now, get_enqueue_time(skb));
+
+	if (ktime_compare(sojourn_time, q->target) < 0 ||
+	    sch->qstats.backlog < q->minbytes) {
+		/* went below so we'll stay below for at least q->interval */
+		q->first_above_time.tv64 = 0;
+		return false;
+	}
+	drop = false;
+	if (q->first_above_time.tv64 == 0) {
+		/* just went above from below. If we stay above
+		 * for at least q->interval we'll say it's ok to drop
+		 */
+		q->first_above_time = ktime_add_ns(now, q->interval);
+	} else if (ktime_compare(now, q->first_above_time) >= 0) {
+		drop = true;
+		state1++;
+	}
+	return drop;
+}
+
+static void codel_drop(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	qdisc_drop(skb, sch);
+	q->drop_count++;
+}
+
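+/*
+ * Dequeue runs the CoDel state machine: a packet whose sojourn time has
+ * stayed above target for at least one interval moves us into the
+ * dropping state, where drops are spaced by the control law; as soon as
+ * the sojourn time falls back below target we leave the dropping state.
+ */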
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = __skb_dequeue(&sch->q);
+	ktime_t now;
+	bool drop;
+
+	if (!skb) {
+		q->dropping = false;
+		return skb;
+	}
+	now = ktime_get();
+	drop = should_drop(skb, sch, now);
+	if (q->dropping) {
+		if (!drop) {
+			/* sojourn time below target - leave dropping state */
+			q->dropping = false;
+		} else if (ktime_compare(now, q->drop_next) >= 0) {
+			state2++;
+			/* It's time for the next drop. Drop the current
+			 * packet and dequeue the next. The dequeue might
+			 * take us out of dropping state.
+			 * If not, schedule the next drop.
+			 * A large backlog might result in drop rates so high
+			 * that the next drop should happen now,
+			 * hence the while loop.
+			 */
+			while (q->dropping &&
+			       (ktime_compare(now, q->drop_next) >= 0)) {
+				codel_drop(sch, skb);
+				q->count++;
+				skb = __skb_dequeue(&sch->q);
+				if (!should_drop(skb, sch, now)) {
+					/* leave dropping state */
+					q->dropping = false;
+				} else {
+					/* and schedule the next drop */
+					q->drop_next =
+						control_law(q, q->drop_next);
+				}
+			}
+		}
+	} else if (drop &&
+		   ((ktime_compare(ktime_sub(now, q->drop_next),
+				   q->interval16) < 0) ||
+		    (ktime_compare(ktime_sub(now, q->first_above_time),
+				   ns_to_ktime(2 * (u64)q->interval)) >= 0))) {
+		codel_drop(sch, skb);
+		skb = __skb_dequeue(&sch->q);
+		drop = should_drop(skb, sch, now);
+		q->dropping = true;
+		state3++;
+		/*
+		 * if min went above target close to when we last went below it
+		 * assume that the drop rate that controlled the queue on the
+		 * last cycle is a good starting point to control it now.
+		 */
+		if (ktime_compare(ktime_sub(now, q->drop_next),
+				  q->interval16) < 0)
+			q->count = max(1U, q->count - 1);
+		else
+			q->count = 1;
+		q->drop_next = control_law(q, now);
+	}
+	if ((states++ % 64) == 0) {
+		pr_debug("s1: %u, s2: %u, s3: %u\n",
+			 state1, state2, state3);
+	}
+	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
+	 * or HTB crashes
+	 */
+	if (q->drop_count && sch->q.qlen) {
+		qdisc_tree_decrease_qlen(sch, q->drop_count);
+		q->drop_count = 0;
+	}
+	if (skb)
+		qdisc_bstats_update(sch, skb);
+	return skb;
+}
+
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+		set_enqueue_time(skb);
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	return qdisc_drop(skb, sch);
+}
+
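+/*
+ * Netlink knobs: TARGET and INTERVAL are supplied in microseconds and
+ * converted to nanoseconds internally; LIMIT is a queue length in
+ * packets; MINBYTES is a backlog floor in bytes below which we never drop.
+ */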
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+	[TCA_CODEL_TARGET]	= { .type = NLA_U32 },
+	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 },
+	[TCA_CODEL_MINBYTES]	= { .type = NLA_U32 },
+	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CODEL_MAX + 1];
+	unsigned int qlen;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+	if (tb[TCA_CODEL_TARGET]) {
+		u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+		q->target = ns_to_ktime((u64)target * NSEC_PER_USEC);
+	}
+	if (tb[TCA_CODEL_INTERVAL]) {
+		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+		interval = min_t(u32, ~0U / NSEC_PER_USEC, interval);
+
+		q->interval = interval * NSEC_PER_USEC;
+		q->interval16 = ns_to_ktime(16 * (u64)q->interval);
+	}
+	if (tb[TCA_CODEL_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+	if (tb[TCA_CODEL_MINBYTES])
+		q->minbytes = nla_get_u32(tb[TCA_CODEL_MINBYTES]);
+
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_drop(skb, sch);
+	}
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+	q->dropping = false;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	q->target = MS2TIME(5);
+	/* It should be possible to run with no limit,
+	 * with infinite memory :)
+	 */
+	sch->limit = DEFAULT_CODEL_LIMIT;
+	q->minbytes = psched_mtu(qdisc_dev(sch));
+	q->interval = 100 * NSEC_PER_MSEC;
+	q->interval16 = ns_to_ktime(16 * (u64)q->interval);
+	q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+	q->dropping = false;	/* exit dropping state */
+	q->count = 1;
+	if (opt) {
+		int err = codel_change(sch, opt);
+
+		if (err)
+			return err;
+	}
+
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+	u32 target = ktime_to_us(q->target);
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_CODEL_TARGET, target) ||
+	    nla_put_u32(skb, TCA_CODEL_LIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_CODEL_INTERVAL, q->interval / NSEC_PER_USEC) ||
+	    nla_put_u32(skb, TCA_CODEL_MINBYTES, q->minbytes))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = skb_peek(&sch->q);
+	ktime_t now = ktime_get();
+	s64 delay;
+	struct tc_codel_xstats st = {
+		.count = q->count,
+	};
+
+	if (skb) {
+		delay = ktime_us_delta(now, get_enqueue_time(skb));
+		st.delay = min_t(u64, ~0U, delay);
+	}
+	delay = ktime_us_delta(q->drop_next, now);
+	st.drop_next = delay > 0 ? min_t(u64, ~0U, delay) : 0;
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset_queue(sch);
+	sch->q.qlen = 0;
+	q->dropping = false;
+	q->count = 1;
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+	.id		= "codel",
+	.priv_size	= sizeof(struct codel_sched_data),
+
+	.enqueue	= codel_enqueue,
+	.dequeue	= codel_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.init		= codel_init,
+	.reset		= codel_reset,
+	.change		= codel_change,
+	.dump		= codel_dump,
+	.dump_stats	= codel_dump_stats,
+	.owner		= THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+	return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+	unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+MODULE_LICENSE("GPL");
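
---

For anyone who wants a feel for the control law without building a kernel,
here is a minimal userspace sketch (not part of the patch) that mirrors the
calc()/control_law() pair with floating point. It assumes the default 100 ms
interval; next_drop_spacing() is a hypothetical stand-in for the kernel
helpers, not anything exported by this patch.

	#include <stdio.h>
	#include <stdint.h>
	#include <math.h>

	/* interval / sqrt(count); interval and result in nanoseconds */
	static uint64_t next_drop_spacing(uint64_t interval_ns, uint32_t count)
	{
		return (uint64_t)(interval_ns / sqrt((double)count));
	}

	int main(void)
	{
		const uint64_t interval_ns = 100ULL * 1000 * 1000; /* 100 ms */
		uint32_t count;

		for (count = 1; count <= 8; count++)
			printf("count=%u: next drop %.1f ms later\n", count,
			       next_drop_spacing(interval_ns, count) / 1e6);
		return 0;
	}

Built with "cc sketch.c -lm", this prints spacings of 100.0, 70.7, 57.7,
50.0, ... ms: the drop rate ramps up gently for as long as the queue stays
above target. Once iproute2 grows support for the new attributes, attaching
the qdisc should look something like "tc qdisc add dev eth0 root codel"
(eth0 being a placeholder device).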