From mboxrd@z Thu Jan  1 00:00:00 1970
From: Eric Dumazet
To: Dave Täht
Cc: codel@lists.bufferbloat.net
Date: Fri, 04 May 2012 12:27:54 +0200
Message-ID: <1336127274.3752.126.camel@edumazet-glaptop>
In-Reply-To: <1336121761.3752.115.camel@edumazet-glaptop>
References: <1336116589-18550-1-git-send-email-dave.taht@bufferbloat.net>
        <1336121761.3752.115.camel@edumazet-glaptop>
Subject: Re: [Codel] [PATCH] codel take 2

On Fri, 2012-05-04 at 10:56 +0200, Eric Dumazet wrote:
> Last: about the non-conserving stuff, this is because you must call
> qdisc_tree_decrease_qlen(sch, number_of_packets_dropped) in
> codel_dequeue(), or else your parents have no idea what happened.
>
> Note: do that once (batched), not for each packet, because
> qdisc_tree_decrease_qlen() is really expensive, since it does a
> linear search to find the parents (we don't have pointers to
> parents). If your machine has 1000 qdiscs, it's slow as hell.

Here is my cleaned-up version of the patch, compiled on x86_32 against
the net-next tree. Feel free to use/amend it. Some work is needed to
get the TCA_CODEL_xxx things right.
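A side note on the batching requirement above, since it is easy to get
wrong: on the dequeue side, count the drops locally and walk the qdisc
tree once at the end. A minimal sketch of the pattern (illustration
only, not part of the patch; time_to_drop() is a hypothetical stand-in
for the CoDel drop decision):

        unsigned int drop_count = 0;
        struct sk_buff *skb = qdisc_dequeue_head(sch);

        while (skb && time_to_drop(skb)) {
                qdisc_drop(skb, sch);   /* frees the skb, bumps drop stats */
                drop_count++;
                skb = qdisc_dequeue_head(sch);
        }
        if (drop_count)         /* one linear parent walk, not one per drop */
                qdisc_tree_decrease_qlen(sch, drop_count);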
 include/linux/pkt_sched.h |   29 ++
 net/sched/Kconfig         |   11 +
 net/sched/Makefile        |    1 +
 net/sched/sch_codel.c     |  360 ++++++++++++++++++++++++++++++++++++
 4 files changed, 401 insertions(+)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index ffe975c..a4a7d04 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -655,4 +655,33 @@ struct tc_qfq_stats {
         __u32 lmax;
 };
 
+/* CODEL */
+
+enum {
+        TCA_CODEL_UNSPEC,
+        TCA_CODEL_PARMS,
+        TCA_CODEL_TARGET,
+        TCA_CODEL_LIMIT,
+        TCA_CODEL_MINBYTES,
+        TCA_CODEL_INTERVAL,
+        __TCA_CODEL_MAX
+};
+
+#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1)
+#define TC_CODEL_ECN  1
+
+struct tc_codel_qopt {
+        __u32 flags;    /* flags (e.g. ecn) */
+        __u32 target;   /* max delay, in us */
+        __u32 limit;    /* queue limit in packets */
+        __u32 minbytes; /* MTU (usually) */
+        __u32 interval; /* sliding min time window width (us) */
+};
+
+struct tc_codel_stats {
+        __u64 drops;
+        __u64 marks;
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 75b58f8..fadd252 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
 
           If unsure, say N.
 
+config NET_SCH_CODEL
+        tristate "Controlled Delay AQM (CODEL)"
+        help
+          Say Y here if you want to use the Controlled Delay (CODEL)
+          packet scheduling algorithm.
+
+          To compile this driver as a module, choose M here: the module
+          will be called sch_codel.
+
+          If unsure, say N.
+
 config NET_SCH_INGRESS
         tristate "Ingress Qdisc"
         depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8cdf4e2..30fab03 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG)      += sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)    += sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)     += sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)       += sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)     += sch_codel.o
 
 obj-$(CONFIG_NET_CLS_U32)       += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)    += cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index e69de29..91e7798 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,360 @@
+/*
+ * net/sched/sch_codel.c        A CoDel implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ns2 simulation code presented by Kathie Nichols
+ * Authors:     Dave Täht
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/ktime.h>
+#include <net/pkt_sched.h>
+
+#define MS2TIME(a) (ns_to_ktime((u64)(a) * NSEC_PER_MSEC))
+#define DEFAULT_CODEL_LIMIT 1000
+
+/*
+ * Via patch found at:
+ * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html
+ * I don't know why this isn't in ktime.h, as it seemed sane...
+ */
+
+/**
+ * ktime_compare - Compares two ktime_t variables
+ *
+ * Return val:
+ *      lhs  < rhs: < 0
+ *      lhs == rhs:   0
+ *      lhs  > rhs: > 0
+ */
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+        if (lhs.tv64 < rhs.tv64)
+                return -1;
+        if (lhs.tv64 > rhs.tv64)
+                return 1;
+        return 0;
+}
+#else
+static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
+{
+        if (lhs.tv.sec < rhs.tv.sec)
+                return -1;
+        if (lhs.tv.sec > rhs.tv.sec)
+                return 1;
+        return lhs.tv.nsec - rhs.tv.nsec;
+}
+#endif
+
+/* Per-queue state (codel_queue_t instance variables) */
+
+struct codel_sched_data {
+        u32 flags;
+        u32 minbytes;
+        u32 count;      /* packets dropped since we went into drop state */
+        bool dropping;
+        ktime_t target;
+        /* time to declare above q->target (0 if below) */
+        ktime_t first_above_time;
+        ktime_t drop_next;      /* time to drop next packet */
+        unsigned long interval;
+};
+
+struct codel_skb_cb {
+        ktime_t enqueue_time;
+};
+
+static unsigned int state1;
+static unsigned int state2;
+static unsigned int state3;
+static unsigned int state4;
+static unsigned int states;
+
+static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+        qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+        return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t get_enqueue_time(const struct sk_buff *skb)
+{
+        return get_codel_cb(skb)->enqueue_time;
+}
+
+static void set_enqueue_time(struct sk_buff *skb)
+{
+        get_codel_cb(skb)->enqueue_time = ktime_get();
+}
+
+static ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
+{
+        return ktime_add_ns(t, q->interval / int_sqrt(q->count));
+}
+
+/*
+static int codel_prob_mark(const struct codel_sched_data *q)
+{
+        return q->flags & TC_CODEL_ECN;
+}
+*/
+
+/* wrappers for ultimate statistics collection */
+
+static int codel_drop(struct sk_buff *skb, struct Qdisc *sch)
+{
+        pr_err("dropped packet\n");
+        return qdisc_drop(skb, sch);
+}
+
+/*
+static int codel_queue_drop(struct Qdisc *sch)
+{
+        return qdisc_drop(skb, sch);
+}
+*/
+
+struct sk_buff *codel_dequeue_head(struct Qdisc *sch)
+{
+        return qdisc_dequeue_head(sch);
+}
+
+bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
+{
+        struct codel_sched_data *q = qdisc_priv(sch);
+        ktime_t sojourn_time;
+        bool drop = false;
+
+        if (!skb) {
+                q->first_above_time.tv64 = 0;
+                return false;
+        }
+        sojourn_time = ktime_sub(now, get_enqueue_time(skb));
+
+        if (ktime_compare(sojourn_time, q->target) < 0 ||
+            sch->qstats.backlog < q->minbytes) {
+                /* went below, so we'll stay below for at least q->interval */
+                q->first_above_time.tv64 = 0;
+        } else {
+                if (q->first_above_time.tv64 == 0) {
+                        /* just went above from below. If we stay above
+                         * for at least q->interval, we'll say it's ok
+                         * to drop.
+                         */
+                        q->first_above_time = ktime_add_ns(now, q->interval);
+                } else if (ktime_compare(now, q->first_above_time) >= 0) {
+                        drop = true;
+                        state1++;
+                }
+        }
+        return drop;
+}
+
+static struct sk_buff *codel_dequeue(struct Qdisc *sch)
+{
+        struct codel_sched_data *q = qdisc_priv(sch);
+        struct sk_buff *skb = codel_dequeue_head(sch);
+        ktime_t now;
+        unsigned int drop_count = 0;
+        bool drop;
+
+        if (!skb) {
+                q->dropping = false;
+                return skb;
+        }
+        now = ktime_get();
+        drop = should_drop(skb, sch, now);
+        if (q->dropping) {
+                if (!drop) {
+                        /* sojourn time below target - leave dropping state */
+                        q->dropping = false;
+                } else if (ktime_compare(now, q->drop_next) >= 0) {
+                        state2++;
+                        /* It's time for the next drop. Drop the current
+                         * packet and dequeue the next. The dequeue might
+                         * take us out of dropping state. If not, schedule
+                         * the next drop. A large backlog might result in
+                         * drop rates so high that the next drop should
+                         * happen now, hence the while loop.
+                         */
+                        while (q->dropping &&
+                               (ktime_compare(now, q->drop_next) >= 0)) {
+                                codel_drop(skb, sch);
+                                drop_count++;
+                                q->count++;
+                                skb = codel_dequeue_head(sch);
+                                if (!should_drop(skb, sch, now)) {
+                                        /* leave dropping state */
+                                        q->dropping = false;
+                                } else {
+                                        /* and schedule the next drop */
+                                        q->drop_next =
+                                                control_law(q, q->drop_next);
+                                }
+                        }
+                }
+        } else if (drop &&
+                   ((ktime_compare(ktime_sub(now, q->drop_next),
+                                   ns_to_ktime(16 * q->interval)) < 0) ||
+                    (ktime_compare(ktime_sub(now, q->first_above_time),
+                                   ns_to_ktime(2 * q->interval)) >= 0))) {
+                codel_drop(skb, sch);
+                drop_count++;
+                skb = codel_dequeue_head(sch);
+                drop = should_drop(skb, sch, now);
+                q->dropping = true;
+                state3++;
+                /*
+                 * If min went above target close to when we last went
+                 * below it, assume that the drop rate that controlled
+                 * the queue on the last cycle is a good starting point
+                 * to control it now.
+                 */
+                if (ktime_compare(ktime_sub(now, q->drop_next),
+                                  ns_to_ktime(16 * q->interval)) < 0) {
+                        q->count = q->count > 1 ? q->count - 1 : 1;
+                } else {
+                        q->count = 1;
+                }
+                q->drop_next = control_law(q, now);
+        }
+        if ((states++ % 64) == 0) {
+                pr_debug("s1: %u, s2: %u, s3: %u, s4: %u\n",
+                         state1, state2, state3, state4);
+        }
+        if (drop_count)
+                qdisc_tree_decrease_qlen(sch, drop_count);
+        return skb;
+}
+
+static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+        if (likely(skb_queue_len(&sch->q) < sch->limit)) {
+                set_enqueue_time(skb);
+                return qdisc_enqueue_tail(skb, sch);
+        }
+        return qdisc_reshape_fail(skb, sch);
+}
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+        struct codel_sched_data *q = qdisc_priv(sch);
+        struct tc_codel_qopt *ctl = nla_data(opt);
+        unsigned int qlen;
+
+        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+                return -EINVAL;
+        if (ctl->limit && (ctl->limit < 2 || ctl->limit > 65536))
+                return -EINVAL;
+        if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
+                return -EINVAL;
+        sch_tree_lock(sch);
+        if (ctl->minbytes)
+                q->minbytes = ctl->minbytes;
+        if (ctl->flags)
+                q->flags = ctl->flags;
+        if (ctl->target)
+                q->target = ns_to_ktime((u64)ctl->target * 1000);
+        if (ctl->interval) {
+                u32 interval = min_t(u32, ~0U / NSEC_PER_USEC, ctl->interval);
+
+                q->interval = interval * NSEC_PER_USEC;
+        }
+        /* something saner than this for limit is probably needed */
+        if (ctl->limit)
+                sch->limit = ctl->limit;
+        qlen = sch->q.qlen;
+//      while (sch->q.qlen > ctl->limit)
+//              codel_drop(skb, sch);
+//      qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //?
+        q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+        q->dropping = false;
+        sch_tree_unlock(sch);
+        return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+        struct codel_sched_data *q = qdisc_priv(sch);
+
+        q->target = MS2TIME(5);
+        sch->limit = DEFAULT_CODEL_LIMIT;
+        q->minbytes = psched_mtu(qdisc_dev(sch));
+        q->interval = 100 * NSEC_PER_MSEC;
+        q->drop_next.tv64 = q->first_above_time.tv64 = 0;
+        q->dropping = false; /* exit dropping state */
+        /* if (opt) { */
+        /*      int err = codel_change(sch, opt); */
+        /*      if (err) */
+        /*              return err; */
+        /* } */
+
+        if (sch->limit >= 1)
+                sch->flags |= TCQ_F_CAN_BYPASS;
+        else
+                sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+        return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+        struct codel_sched_data *q = qdisc_priv(sch);
+        struct tc_codel_qopt opt;
+
+        opt.target = (u32)ktime_to_us(q->target);
+        opt.interval = q->interval / NSEC_PER_USEC;
+        opt.limit = sch->limit;
+        opt.minbytes = q->minbytes;
+        opt.flags = q->flags;
+        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+                goto nla_put_failure;
+        return skb->len;
+
+nla_put_failure:
+        return -1;
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+        struct sk_buff *skb;
+
+        while ((skb = codel_dequeue(sch)) != NULL)
+                kfree_skb(skb);
+}
+
+struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+        .id             = "codel",
+        .priv_size      = sizeof(struct codel_sched_data),
+        .enqueue        = codel_enqueue,
+        .dequeue        = codel_dequeue,
+        .peek           = qdisc_peek_head,
+/*      .drop           = codel_queue_drop, */
+        .init           = codel_init,
+        .reset          = codel_reset,
+        .change         = codel_change,
+        .dump           = codel_dump,
+        .owner          = THIS_MODULE,
+};
+EXPORT_SYMBOL(codel_qdisc_ops);
+
+static int __init codel_module_init(void)
+{
+        return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+        unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init);
+module_exit(codel_module_exit);
+
+MODULE_LICENSE("GPL");
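P.S. For anyone eyeballing control_law(): successive drops are spaced
by the inverse-sqrt law from the ns2 code, interval / sqrt(count). A
standalone userspace sketch (illustration only, plain C against libm,
using the 100 ms default interval) of how quickly that spacing
tightens:

        #include <math.h>
        #include <stdio.h>

        int main(void)
        {
                const double interval_ms = 100.0; /* default q->interval */
                double t = 0.0;
                int count;

                for (count = 1; count <= 8; count++) {
                        /* next drop comes interval/sqrt(count) after the last */
                        t += interval_ms / sqrt(count);
                        printf("drop %d at t = %6.1f ms\n", count, t);
                }
                return 0;
        }

Compile with "gcc -o law law.c -lm"; by the eighth drop the spacing is
already down to about 35 ms.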