From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wi0-f181.google.com (mail-wi0-f181.google.com [209.85.212.181]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) (Client CN "smtp.gmail.com", Issuer "Google Internet Authority" (verified OK)) by huchra.bufferbloat.net (Postfix) with ESMTPS id DB728202213 for ; Sat, 5 May 2012 07:49:09 -0700 (PDT) Received: by wibhn14 with SMTP id hn14so1751630wib.10 for ; Sat, 05 May 2012 07:49:06 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=subject:from:to:cc:in-reply-to:references:content-type:date :message-id:mime-version:x-mailer:content-transfer-encoding; bh=3MMA/Qk2FuPO43wChHgDKMEMilM4eqNSWEDjJXPgr+s=; b=R43gUv7u83cRqKHGxt8htvKmpCEGAmBxPuarofv03ViWH6sI/fsmg4m0FVmaskDdnq 9TTbjVq1HqF2eCnTxBwhE2eSING1I5he2//FidrOsHZVhxUN6ZVN/VdSEvhZxy6X5xf9 WvQNSJi9RvpFrp7S9W/at7OVUmjhsHVY9OzFz0PzasGfEjAE8Q4qg3SRRSkZHEmQq5+u tf+aJiczRynQjmYwrSRgDeaTK7ORlaMmDEQfEpQRWXH6cvSdEVxMl9Xs1lT4iciMRt6O DOlSo76n64P3KCsHwOHH4NiMs3D0gxHHIhSkdiLdZQ5Xr/Mf7R5kfYWUpI7bL1zH14s7 1/0g== Received: by 10.180.98.8 with SMTP id ee8mr21696350wib.14.1336229346447; Sat, 05 May 2012 07:49:06 -0700 (PDT) Received: from [172.28.130.107] ([74.125.122.49]) by mx.google.com with ESMTPS id e6sm6239391wix.8.2012.05.05.07.49.04 (version=SSLv3 cipher=OTHER); Sat, 05 May 2012 07:49:05 -0700 (PDT) From: Eric Dumazet To: Dave Taht In-Reply-To: <1336218794.3752.508.camel@edumazet-glaptop> References: <1336217671-20384-1-git-send-email-dave.taht@bufferbloat.net> <1336218794.3752.508.camel@edumazet-glaptop> Content-Type: text/plain; charset="UTF-8" Date: Sat, 05 May 2012 16:49:03 +0200 Message-ID: <1336229343.3752.516.camel@edumazet-glaptop> Mime-Version: 1.0 X-Mailer: Evolution 2.28.3 Content-Transfer-Encoding: 8bit Cc: codel@lists.bufferbloat.net, Dave =?ISO-8859-1?Q?T=E4ht?= Subject: [Codel] [PATCH v5] pkt_sched: codel: Controlled Delay AQM X-BeenThere: codel@lists.bufferbloat.net X-Mailman-Version: 2.1.13 Precedence: list List-Id: CoDel AQM discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 05 May 2012 14:49:12 -0000 From: Dave Taht A nice changelog here, to tell how nice is CoDel, giving pointers to documentation and all credits. Signed-off-by: Dave Taht Signed-off-by: Eric Dumazet --- include/linux/pkt_sched.h | 13 + net/sched/Kconfig | 11 net/sched/Makefile | 1 net/sched/sch_codel.c | 425 ++++++++++++++++++++++++++++++++++++ 4 files changed, 450 insertions(+) diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index ffe975c..62a73bf 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -655,4 +655,17 @@ struct tc_qfq_stats { __u32 lmax; }; +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_MINBYTES, + TCA_CODEL_INTERVAL, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 75b58f8..fadd252 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -250,6 +250,17 @@ config NET_SCH_QFQ If unsure, say N. +config NET_SCH_CODEL + tristate "Controlled Delay AQM (CODEL)" + help + Say Y here if you want to use the Controlled Delay (CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_codel. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 8cdf4e2..30fab03 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 0000000..a19177f --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,425 @@ +/* + * net/sched/sch_codel.c A Codel implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Codel, the COntrolled DELay Queueing discipline + * Based on ns2 simulation code presented by Kathie Nichols + * + * Authors: Dave Täht + * Eric Dumazet + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MS2TIME(a) (ns_to_ktime( (u64) a * NSEC_PER_MSEC)) +#define DEFAULT_CODEL_LIMIT 1000 +#define PRECALC_MAX 64 + +/* + * Via patch found at: + * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html + * I don't know why this isn't in ktime.h as it seemed sane... + */ + +/* + * ktime_compare - Compares two ktime_t variables + * + * Return val: + * lhs < rhs: < 0 + * lhs == rhs: 0 + * lhs > rhs: > 0 + */ + +#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR) +static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs) +{ + if (lhs.tv64 < rhs.tv64) + return -1; + if (lhs.tv64 > rhs.tv64) + return 1; + return 0; +} +#else +static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs) +{ + if (lhs.tv.sec < rhs.tv.sec) + return -1; + if (lhs.tv.sec > rhs.tv.sec) + return 1; + return lhs.tv.nsec - rhs.tv.nsec; +} +#endif + +/* Per-queue state (codel_queue_t instance variables) */ + +struct codel_sched_data { + u32 flags; + u32 minbytes; + u32 count; /* packets dropped since we went into drop state */ + u32 drop_count; + bool dropping; + ktime_t target; + /* time to declare above q->target (0 if below)*/ + ktime_t first_above_time; + ktime_t drop_next; /* time to drop next packet */ + ktime_t interval16; + u32 interval; + u32 q_intervals[PRECALC_MAX]; +}; + +struct codel_skb_cb { + ktime_t enqueue_time; +}; + +static unsigned int state1; +static unsigned int state2; +static unsigned int state3; +static unsigned int states; + +/* + * return interval/sqrt(x) with good precision + */ +static u32 calc(u32 _interval, unsigned long x) +{ + u64 interval = _interval; + + /* scale operands for max precision */ + while (x < (1UL << (BITS_PER_LONG - 2))) { + x <<= 2; + interval <<= 1; + } + do_div(interval, int_sqrt(x)); + return (u32)interval; +} + +static void codel_fill_cache(struct codel_sched_data *q) +{ + int i; + + q->q_intervals[0] = q->interval; + for (i = 2; i <= PRECALC_MAX; i++) + q->q_intervals[i - 1] = calc(q->interval, i); +} + +static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb)); + return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static ktime_t get_enqueue_time(const struct sk_buff *skb) +{ + return get_codel_cb(skb)->enqueue_time; +} + +static void set_enqueue_time(struct sk_buff *skb) +{ + get_codel_cb(skb)->enqueue_time = ktime_get(); +} + +/* + * The original control_law required floating point. + * + * return ktime_add_ns(t, q->interval / sqrt(q->count)); + * + */ +static ktime_t control_law(const struct codel_sched_data *q, ktime_t t) +{ + u32 inter; + + if (q->count > PRECALC_MAX) + inter = calc(q->interval, q->count); + else + inter = q->q_intervals[q->count - 1]; + return ktime_add_ns(t, inter); +} + +static bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now) +{ + struct codel_sched_data *q = qdisc_priv(sch); + ktime_t sojourn_time; + bool drop; + + if (!skb) { + q->first_above_time.tv64 = 0; + return false; + } + sojourn_time = ktime_sub(now, get_enqueue_time(skb)); + + if (ktime_compare(sojourn_time, q->target) < 0 || + sch->qstats.backlog < q->minbytes) { + /* went below so we'll stay below for at least q->interval */ + q->first_above_time.tv64 = 0; + return false; + } + drop = false; + if (q->first_above_time.tv64 == 0) { + /* just went above from below. If we stay above + * for at least q->interval we'll say it's ok to drop + */ + q->first_above_time = ktime_add_ns(now, q->interval); + } else if (ktime_compare(now, q->first_above_time) >= 0) { + drop = true; + state1++; + } + return drop; +} + +static void codel_drop(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + q->drop_count++; +} + +static struct sk_buff *codel_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = __skb_dequeue(&sch->q); + ktime_t now; + bool drop; + + if (!skb) { + q->dropping = false; + return skb; + } + now = ktime_get(); + drop = should_drop(skb, sch, now); + if (q->dropping) { + if (!drop) { + /* sojourn time below target - leave dropping state */ + q->dropping = false; + } else if (ktime_compare(now, q->drop_next) >=0) { + state2++; + /* It's time for the next drop. Drop the current + * packet and dequeue the next. The dequeue might + * take us out of dropping state. + * If not, schedule the next drop. + * A large backlog might result in drop rates so high + * that the next drop should happen now, + * hence the while loop. + */ + while (q->dropping && + (ktime_compare(now, q->drop_next) >= 0)) { + codel_drop(sch, skb); + q->count++; + skb = __skb_dequeue(&sch->q); + if (!should_drop(skb, sch, now)) { + /* leave dropping state */ + q->dropping = false; + } else { + /* and schedule the next drop */ + q->drop_next = + control_law(q, q->drop_next); + } + } + } + } else if (drop && + ((ktime_compare(ktime_sub(now, q->drop_next), + q->interval16) < 0) || + (ktime_compare(ktime_sub(now, q->first_above_time), + ns_to_ktime(2 * q->interval)) >= 0 ))) { + codel_drop(sch, skb); + skb = __skb_dequeue(&sch->q); + drop = should_drop(skb, sch, now); + q->dropping = true; + state3++; + /* + * if min went above target close to when we last went below it + * assume that the drop rate that controlled the queue on the + * last cycle is a good starting point to control it now. + */ + if (ktime_compare(ktime_sub(now, q->drop_next), + q->interval16) < 0) { + q->count = q->count > 1 ? q->count - 1 : 1; + } else { + q->count = 1; + } + q->drop_next = control_law(q, now); + } + if ((states++ % 64) == 0) { + pr_debug("s1: %u, s2: %u, s3: %u\n", + state1, state2, state3); + } + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes + */ + if (q->drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->drop_count); + q->drop_count = 0; + } + if (skb) { + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + } + return skb; +} + +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(skb_queue_len(&sch->q) < sch->limit)) { + set_enqueue_time(skb); + return qdisc_enqueue_tail(skb, sch); + } + return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_CODEL_MINBYTES] = { .type = NLA_U32 }, + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CODEL_MAX + 1]; + unsigned int qlen; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + if (tb[TCA_CODEL_TARGET]) { + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + + q->target = ns_to_ktime((u64) target * NSEC_PER_USEC); + } + if (tb[TCA_CODEL_INTERVAL]) { + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + + interval = min_t(u32, ~0U / NSEC_PER_USEC, interval); + + q->interval = interval * NSEC_PER_USEC; + q->interval16 = ns_to_ktime(16 * (u64)q->interval); + codel_fill_cache(q); + } + if (tb[TCA_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + + if (tb[TCA_CODEL_MINBYTES]) + q->minbytes = nla_get_u32(tb[TCA_CODEL_MINBYTES]); + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + q->drop_next.tv64 = q->first_above_time.tv64 = 0; + q->dropping = false; + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + q->target = MS2TIME(5); + /* It should be possible to run with no limit, + * with infinite memory :) + */ + sch->limit = DEFAULT_CODEL_LIMIT; + q->minbytes = psched_mtu(qdisc_dev(sch)); + q->interval = 100 * NSEC_PER_MSEC; + q->interval16 = ns_to_ktime(16 * (u64)q->interval); + q->drop_next.tv64 = q->first_above_time.tv64 = 0; + q->dropping = false; /* exit dropping state */ + q->count = 1; + codel_fill_cache(q); + if (opt) { + int err = codel_change(sch, opt); + + if (err) + return err; + } + + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + u32 target = ktime_to_us(q->target); + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_CODEL_TARGET, target) || + nla_put_u32(skb, TCA_CODEL_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_CODEL_INTERVAL, q->interval / NSEC_PER_USEC) || + nla_put_u32(skb, TCA_CODEL_MINBYTES, q->minbytes)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static void codel_reset(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + sch->q.qlen = 0; + q->dropping = false; + q->count = 1; +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + + .enqueue = codel_enqueue, + .dequeue = codel_dequeue, + .peek = qdisc_peek_dequeued, + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .owner = THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} +module_init(codel_module_init) +module_exit(codel_module_exit) +MODULE_LICENSE("GPL"); +