[Codel] [PATCH v5] pkt_sched: codel: Controlled Delay AQM

Sat May 5 12:11:15 EDT 2012

Nice!

Nits:

0) I figure you already have an iproute2 patch that you can send?
    I thought 5 hours ago I had almost, but not entirely grokked netlink.
    The way you just did it was not at all how I thought it worked. :/
    but I will read.
1) I take it if a limit is not specified or set here, sch->limit comes
from txqueuelen?
    I do kind of like infinite queues (and angels dancing on the heads of pins)
2) I woke up with a mod that could do ecn. I'll do an rfc patch.
3) Tom's already on the list
4) I'd like to play with this a lot (and have others do so too) before
it goes upstream,
    gain kathie and vans blessing, etc. Couple weeks? (see 2). In
particular I was
    hoping to see actual pings under load match the target setting.
I'll get this
    going on two boxes and see what happens... play with bql, htb, etc...
5) thought the * 16 could be efficiently implemented by the compiler,
and saves a mem
    access.
6) unless 2) happens we can kill q->flags
7) thx

On Sat, May 5, 2012 at 7:49 AM, Eric Dumazet <eric.dumazet at gmail.com> wrote:
> From: Dave Taht <dave.taht at gmail.com>
>
> A nice changelog here, to tell how nice is CoDel, giving pointers to
> documentation and all credits.

I don't have a link to the web site... nor have I read the paper, yet. Monday.

>
> Signed-off-by: Dave Taht <dave.taht at gmail.com>
> Signed-off-by: Eric Dumazet <edumazet at google.com>
> ---
>  include/linux/pkt_sched.h |   13 +
>  net/sched/Kconfig         |   11
>  net/sched/Makefile        |    1
>  net/sched/sch_codel.c     |  425 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 450 insertions(+)
>
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index ffe975c..62a73bf 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -655,4 +655,17 @@ struct tc_qfq_stats {
>        __u32 lmax;
>  };
>
> +/* CODEL */
> +
> +enum {
> +       TCA_CODEL_UNSPEC,
> +       TCA_CODEL_TARGET,
> +       TCA_CODEL_LIMIT,
> +       TCA_CODEL_MINBYTES,
> +       TCA_CODEL_INTERVAL,
> +       __TCA_CODEL_MAX
> +};
> +
> +#define TCA_CODEL_MAX  (__TCA_CODEL_MAX - 1)
> +
>  #endif
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 75b58f8..fadd252 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -250,6 +250,17 @@ config NET_SCH_QFQ
>
>          If unsure, say N.
>
> +config NET_SCH_CODEL
> +       tristate "Controlled Delay AQM (CODEL)"
> +       help
> +         Say Y here if you want to use the Controlled Delay (CODEL)
> +         packet scheduling algorithm.
> +
> +         To compile this driver as a module, choose M here: the module
> +         will be called sch_codel.
> +
> +         If unsure, say N.
> +
>  config NET_SCH_INGRESS
>        tristate "Ingress Qdisc"
>        depends on NET_CLS_ACT
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 8cdf4e2..30fab03 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG)    += sch_plug.o
>  obj-$(CONFIG_NET_SCH_MQPRIO)   += sch_mqprio.o
>  obj-$(CONFIG_NET_SCH_CHOKE)    += sch_choke.o
>  obj-$(CONFIG_NET_SCH_QFQ)      += sch_qfq.o
> +obj-$(CONFIG_NET_SCH_CODEL)    += sch_codel.o
>
>  obj-$(CONFIG_NET_CLS_U32)      += cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
> diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
> new file mode 100644
> index 0000000..a19177f
> --- /dev/null
> +++ b/net/sched/sch_codel.c
> @@ -0,0 +1,425 @@
> +/*
> + * net/sched/sch_codel.c       A Codel implementation
> + *
> + *     This program is free software; you can redistribute it and/or
> + *     modify it under the terms of the GNU General Public License
> + *     as published by the Free Software Foundation; either version
> + *     2 of the License, or (at your option) any later version.
> + *
> + * Codel, the COntrolled DELay Queueing discipline
> + * Based on ns2 simulation code presented by Kathie Nichols
> + *
> + * Authors:    Dave Täht <d at taht.net>
> + *             Eric Dumazet <edumazet at google.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/errno.h>
> +#include <linux/ktime.h>
> +#include <linux/skbuff.h>
> +#include <net/pkt_sched.h>
> +
> +#define MS2TIME(a) (ns_to_ktime( (u64) a * NSEC_PER_MSEC))
> +#define DEFAULT_CODEL_LIMIT 1000
> +#define PRECALC_MAX 64
> +
> +/*
> + * Via patch found at:
> + * http://lkml.indiana.edu/hypermail/linux/kernel/0802.0/0659.html
> + * I don't know why this isn't in ktime.h as it seemed sane...
> + */
> +
> +/*
> + * ktime_compare - Compares two ktime_t variables
> + *
> + * Return val:
> + * lhs < rhs: < 0
> + * lhs == rhs: 0
> + * lhs > rhs: > 0
> + */
> +
> +#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
> +static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
> +{
> +       if (lhs.tv64 < rhs.tv64)
> +               return -1;
> +       if (lhs.tv64 > rhs.tv64)
> +               return 1;
> +       return 0;
> +}
> +#else
> +static inline int ktime_compare(const ktime_t lhs, const ktime_t rhs)
> +{
> +       if (lhs.tv.sec < rhs.tv.sec)
> +               return -1;
> +       if (lhs.tv.sec > rhs.tv.sec)
> +               return 1;
> +       return lhs.tv.nsec - rhs.tv.nsec;
> +}
> +#endif
> +
> +/* Per-queue state (codel_queue_t instance variables) */
> +
> +struct codel_sched_data {
> +       u32     flags;
> +       u32     minbytes;
> +       u32     count; /* packets dropped since we went into drop state */
> +       u32     drop_count;
> +       bool    dropping;
> +       ktime_t target;
> +       /* time to declare above q->target (0 if below)*/
> +       ktime_t first_above_time;
> +       ktime_t drop_next; /* time to drop next packet */
> +       ktime_t interval16;
> +       u32     interval;
> +       u32     q_intervals[PRECALC_MAX];
> +};
> +
> +struct codel_skb_cb {
> +       ktime_t enqueue_time;
> +};
> +
> +static unsigned int state1;
> +static unsigned int state2;
> +static unsigned int state3;
> +static unsigned int states;
> +
> +/*
> + * return interval/sqrt(x) with good precision
> + */
> +static u32 calc(u32 _interval, unsigned long x)
> +{
> +       u64 interval = _interval;
> +
> +       /* scale operands for max precision */
> +       while (x < (1UL << (BITS_PER_LONG - 2))) {
> +               x <<= 2;
> +               interval <<= 1;
> +       }
> +       do_div(interval, int_sqrt(x));
> +       return (u32)interval;
> +}
> +
> +static void codel_fill_cache(struct codel_sched_data *q)
> +{
> +       int i;
> +
> +       q->q_intervals[0] = q->interval;
> +       for (i = 2; i <= PRECALC_MAX; i++)
> +               q->q_intervals[i - 1] = calc(q->interval, i);
> +}
> +
> +static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
> +{
> +       qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
> +       return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
> +}
> +
> +static ktime_t get_enqueue_time(const struct sk_buff *skb)
> +{
> +       return get_codel_cb(skb)->enqueue_time;
> +}
> +
> +static void set_enqueue_time(struct sk_buff *skb)
> +{
> +       get_codel_cb(skb)->enqueue_time = ktime_get();
> +}
> +
> +/*
> + *     The original control_law required floating point.
> + *
> + *     return ktime_add_ns(t, q->interval / sqrt(q->count));
> + *
> + */
> +static ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
> +{
> +       u32 inter;
> +
> +       if (q->count > PRECALC_MAX)
> +               inter = calc(q->interval, q->count);
> +       else
> +               inter = q->q_intervals[q->count - 1];
> +       return ktime_add_ns(t, inter);
> +}
> +
> +static bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +       ktime_t sojourn_time;
> +       bool drop;
> +
> +       if (!skb) {
> +               q->first_above_time.tv64 = 0;
> +               return false;
> +       }
> +       sojourn_time = ktime_sub(now, get_enqueue_time(skb));
> +
> +       if (ktime_compare(sojourn_time, q->target) < 0 ||
> +           sch->qstats.backlog < q->minbytes) {
> +               /* went below so we'll stay below for at least q->interval */
> +               q->first_above_time.tv64 = 0;
> +               return false;
> +       }
> +       drop = false;
> +       if (q->first_above_time.tv64 == 0) {
> +               /* just went above from below. If we stay above
> +                * for at least q->interval we'll say it's ok to drop
> +                */
> +               q->first_above_time = ktime_add_ns(now, q->interval);
> +       } else if (ktime_compare(now, q->first_above_time) >= 0) {
> +               drop = true;
> +               state1++;
> +       }
> +       return drop;
> +}
> +
> +static void codel_drop(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +
> +       sch->qstats.backlog -= qdisc_pkt_len(skb);
> +       qdisc_drop(skb, sch);
> +       q->drop_count++;
> +}
> +
> +static struct sk_buff *codel_dequeue(struct Qdisc *sch)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +       struct sk_buff *skb = __skb_dequeue(&sch->q);
> +       ktime_t now;
> +       bool drop;
> +
> +       if (!skb) {
> +               q->dropping = false;
> +               return skb;
> +       }
> +       now = ktime_get();
> +       drop = should_drop(skb, sch, now);
> +       if (q->dropping) {
> +               if (!drop) {
> +                       /* sojourn time below target - leave dropping state */
> +                       q->dropping = false;
> +               } else if (ktime_compare(now, q->drop_next) >=0) {
> +                       state2++;
> +                       /* It's time for the next drop. Drop the current
> +                        * packet and dequeue the next. The dequeue might
> +                        * take us out of dropping state.
> +                        * If not, schedule the next drop.
> +                        * A large backlog might result in drop rates so high
> +                        * that the next drop should happen now,
> +                        * hence the while loop.
> +                        */
> +                       while (q->dropping &&
> +                              (ktime_compare(now, q->drop_next) >= 0)) {
> +                               codel_drop(sch, skb);
> +                               q->count++;
> +                               skb = __skb_dequeue(&sch->q);
> +                               if (!should_drop(skb, sch, now)) {
> +                                       /* leave dropping state */
> +                                       q->dropping = false;
> +                               } else {
> +                                       /* and schedule the next drop */
> +                                       q->drop_next =
> +                                               control_law(q, q->drop_next);
> +                               }
> +                       }
> +               }
> +       } else if (drop &&
> +                  ((ktime_compare(ktime_sub(now, q->drop_next),
> +                                  q->interval16) < 0) ||
> +                  (ktime_compare(ktime_sub(now, q->first_above_time),
> +                                 ns_to_ktime(2 * q->interval)) >= 0 ))) {
> +               codel_drop(sch, skb);
> +               skb = __skb_dequeue(&sch->q);
> +               drop = should_drop(skb, sch, now);
> +               q->dropping = true;
> +               state3++;
> +               /*
> +                * if min went above target close to when we last went below it
> +                * assume that the drop rate that controlled the queue on the
> +                * last cycle is a good starting point to control it now.
> +                */
> +               if (ktime_compare(ktime_sub(now, q->drop_next),
> +                                 q->interval16) < 0) {
> +                       q->count = q->count > 1 ? q->count - 1 : 1;
> +               } else {
> +                       q->count = 1;
> +               }
> +               q->drop_next = control_law(q, now);
> +       }
> +       if ((states++ % 64) == 0) {
> +               pr_debug("s1: %u, s2: %u, s3: %u\n",
> +                         state1, state2, state3);
> +       }
> +       /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
> +        * or HTB crashes
> +        */
> +       if (q->drop_count && sch->q.qlen) {
> +               qdisc_tree_decrease_qlen(sch, q->drop_count);
> +               q->drop_count = 0;
> +       }
> +       if (skb) {
> +               sch->qstats.backlog -= qdisc_pkt_len(skb);
> +               qdisc_bstats_update(sch, skb);
> +       }
> +       return skb;
> +}
> +
> +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +       if (likely(skb_queue_len(&sch->q) < sch->limit)) {
> +               set_enqueue_time(skb);
> +               return qdisc_enqueue_tail(skb, sch);
> +       }
> +       return qdisc_drop(skb, sch);
> +}
> +
> +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
> +       [TCA_CODEL_TARGET]      = { .type = NLA_U32 },
> +       [TCA_CODEL_LIMIT]       = { .type = NLA_U32 },
> +       [TCA_CODEL_MINBYTES]    = { .type = NLA_U32 },
> +       [TCA_CODEL_INTERVAL]    = { .type = NLA_U32 },
> +};
> +
> +static int codel_change(struct Qdisc *sch, struct nlattr *opt)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +       struct nlattr *tb[TCA_CODEL_MAX + 1];
> +       unsigned int qlen;
> +       int err;
> +
> +       if (opt == NULL)
> +               return -EINVAL;
> +
> +       err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
> +       if (err < 0)
> +               return err;
> +
> +       sch_tree_lock(sch);
> +       if (tb[TCA_CODEL_TARGET]) {
> +               u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
> +
> +               q->target = ns_to_ktime((u64) target * NSEC_PER_USEC);
> +       }
> +       if (tb[TCA_CODEL_INTERVAL]) {
> +               u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
> +
> +               interval = min_t(u32, ~0U / NSEC_PER_USEC, interval);
> +
> +               q->interval = interval * NSEC_PER_USEC;
> +               q->interval16 = ns_to_ktime(16 * (u64)q->interval);
> +               codel_fill_cache(q);
> +       }
> +       if (tb[TCA_CODEL_LIMIT])
> +               sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
> +
> +       if (tb[TCA_CODEL_MINBYTES])
> +               q->minbytes = nla_get_u32(tb[TCA_CODEL_MINBYTES]);
> +
> +       qlen = sch->q.qlen;
> +       while (sch->q.qlen > sch->limit) {
> +               struct sk_buff *skb = __skb_dequeue(&sch->q);
> +
> +               sch->qstats.backlog -= qdisc_pkt_len(skb);
> +               qdisc_drop(skb, sch);
> +       }
> +       qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
> +
> +       q->drop_next.tv64 = q->first_above_time.tv64 = 0;
> +       q->dropping = false;
> +       sch_tree_unlock(sch);
> +       return 0;
> +}
> +
> +static int codel_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +
> +       q->target = MS2TIME(5);
> +       /* It should be possible to run with no limit,
> +        * with infinite memory :)
> +        */
> +       sch->limit = DEFAULT_CODEL_LIMIT;
> +       q->minbytes = psched_mtu(qdisc_dev(sch));
> +       q->interval = 100 * NSEC_PER_MSEC;
> +       q->interval16 = ns_to_ktime(16 * (u64)q->interval);
> +       q->drop_next.tv64 = q->first_above_time.tv64 = 0;
> +       q->dropping = false; /* exit dropping state */
> +       q->count = 1;
> +       codel_fill_cache(q);
> +       if (opt) {
> +               int err = codel_change(sch, opt);
> +
> +               if (err)
> +                       return err;
> +       }
> +
> +       if (sch->limit >= 1)
> +               sch->flags |= TCQ_F_CAN_BYPASS;
> +       else
> +               sch->flags &= ~TCQ_F_CAN_BYPASS;
> +
> +       return 0;
> +}
> +
> +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +       struct nlattr *opts;
> +       u32 target = ktime_to_us(q->target);
> +
> +       opts = nla_nest_start(skb, TCA_OPTIONS);
> +       if (opts == NULL)
> +               goto nla_put_failure;
> +       if (nla_put_u32(skb, TCA_CODEL_TARGET, target) ||
> +           nla_put_u32(skb, TCA_CODEL_LIMIT, sch->limit) ||
> +           nla_put_u32(skb, TCA_CODEL_INTERVAL, q->interval / NSEC_PER_USEC) ||
> +           nla_put_u32(skb, TCA_CODEL_MINBYTES, q->minbytes))
> +               goto nla_put_failure;
> +
> +       return nla_nest_end(skb, opts);
> +
> +nla_put_failure:
> +       nla_nest_cancel(skb, opts);
> +       return -1;
> +}
> +
> +static void codel_reset(struct Qdisc *sch)
> +{
> +       struct codel_sched_data *q = qdisc_priv(sch);
> +
> +       qdisc_reset_queue(sch);
> +       sch->q.qlen = 0;
> +       q->dropping = false;
> +       q->count = 1;
> +}
> +
> +static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
> +       .id             =       "codel",
> +       .priv_size      =       sizeof(struct codel_sched_data),
> +
> +       .enqueue        =       codel_enqueue,
> +       .dequeue        =       codel_dequeue,
> +       .peek           =       qdisc_peek_dequeued,
> +       .init           =       codel_init,
> +       .reset          =       codel_reset,
> +       .change         =       codel_change,
> +       .dump           =       codel_dump,
> +       .owner          =       THIS_MODULE,
> +};
> +
> +static int __init codel_module_init(void)
> +{
> +        return register_qdisc(&codel_qdisc_ops);
> +}
> +static void __exit codel_module_exit(void)
> +{
> +        unregister_qdisc(&codel_qdisc_ops);
> +}
> +module_init(codel_module_init)
> +module_exit(codel_module_exit)
> +MODULE_LICENSE("GPL");
> +
>
>

-- 
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
http://www.bufferbloat.net