[Codel] [PATCH] Preliminary codel implementation

Dave Taht dave.taht at gmail.com
Thu May 3 18:38:22 EDT 2012


On Thu, May 3, 2012 at 11:17 AM, Eric Dumazet <eric.dumazet at gmail.com> wrote:
> On Thu, 2012-05-03 at 10:52 -0700, Dave Täht wrote:
>> ---
>>  include/linux/pkt_sched.h |   29 +++++
>>  net/sched/Kconfig         |   11 ++
>>  net/sched/Makefile        |    1 +
>>  net/sched/sch_codel.c     |  296 +++++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 337 insertions(+)
>>  create mode 100644 net/sched/sch_codel.c
>>
>
>
> Hi Dave.
>
> Was this tested or is it just a draft?

It compiled, seemed logically correct (since disproven), passed
packets, and didn't crash. :) I was happy with the night's work for
all of 10 minutes.

I do massively appreciate your feedback and advice, as these are areas
of the kernel that are opaque to most.

>
>> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
>> index 0d5b793..c21b720 100644
>> --- a/include/linux/pkt_sched.h
>> +++ b/include/linux/pkt_sched.h
>> @@ -633,4 +633,33 @@ struct tc_qfq_stats {
>>       __u32 lmax;
>>  };
>>
>> +/* CODEL */
>> +
>> +enum {
>> +     TCA_CODEL_UNSPEC,
>> +     TCA_CODEL_PARMS,
>> +     TCA_CODEL_TARGET,
>> +     TCA_CODEL_DEPTH,
>> +     TCA_CODEL_MINBYTES,
>> +     TCA_CODEL_INTERVAL,
>> +     __TCA_CODEL_MAX
>> +};
>> +
>> +#define TCA_CODEL_MAX        (__TCA_CODEL_MAX - 1)
>> +#define TC_CODEL_ECN 1
>> +
>> +struct tc_codel_qopt {
>> +        __u32 flags; /* flags (e.g. ecn) */
>> +     __u32 target;   /* max delay, in us */
>> +        __u32 depth; /* queue depth in packets */
>> +        __u32 minbytes;      /* MTU (usually) */
>> +        __u32 interval;      /* Sliding min time window width (us) */
>> +};
>> +
>> +struct tc_codel_stats {
>> +     __u64 drops;
>> +     __u64 marks;
>> +};
>> +
>> +
>>  #endif
>> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
>> index 2590e91..8106c42 100644
>> --- a/net/sched/Kconfig
>> +++ b/net/sched/Kconfig
>> @@ -250,6 +250,17 @@ config NET_SCH_QFQ
>>
>>         If unsure, say N.
>>
>> +config NET_SCH_CODEL
>> +     tristate "Controlled Delay AQM (CODEL)"
>> +     help
>> +       Say Y here if you want to use the Controlled Delay (CODEL)
>> +       packet scheduling algorithm.
>> +
>> +       To compile this driver as a module, choose M here: the module
>> +       will be called sch_codel.
>> +
>> +       If unsure, say N.
>> +
>>  config NET_SCH_INGRESS
>>       tristate "Ingress Qdisc"
>>       depends on NET_CLS_ACT
>> diff --git a/net/sched/Makefile b/net/sched/Makefile
>> index dc5889c..41130b5 100644
>> --- a/net/sched/Makefile
>> +++ b/net/sched/Makefile
>> @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR)   += sch_drr.o
>>  obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
>>  obj-$(CONFIG_NET_SCH_CHOKE)  += sch_choke.o
>>  obj-$(CONFIG_NET_SCH_QFQ)    += sch_qfq.o
>> +obj-$(CONFIG_NET_SCH_CODEL)  += sch_codel.o
>>
>>  obj-$(CONFIG_NET_CLS_U32)    += cls_u32.o
>>  obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
>> diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
>> new file mode 100644
>> index 0000000..159df47
>> --- /dev/null
>> +++ b/net/sched/sch_codel.c
>> @@ -0,0 +1,296 @@
>> +/*
>> + * net/sched/sch_codel.c     A Codel implementation
>> + *
>> + *           This program is free software; you can redistribute it and/or
>> + *           modify it under the terms of the GNU General Public License
>> + *           as published by the Free Software Foundation; either version
>> + *           2 of the License, or (at your option) any later version.
>> + *
>> + * Based on ns2 simulation code presented by Kathie Nichols
>> + * Authors:  Dave Täht <d at taht.net>
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/types.h>
>> +#include <linux/kernel.h>
>> +#include <linux/errno.h>
>> +#include <linux/ktime.h>
>> +#include <linux/skbuff.h>
>> +#include <net/pkt_sched.h>
>> +
>> +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000))
>> +#define DEFAULT_CODEL_DEPTH 1000
>> +
>> +/* Per-queue state (codel_queue_t instance variables) */
>> +
>> +struct codel_sched_data {
>> +     u32 flags;
>> +     u32 minbytes;
>> +     u32 count; /* packets dropped since we went into drop state */
>> +     bool dropping;  /* 1 if in drop state, might just add to flags */
>> +     ktime_t target;
>> +     ktime_t interval;
>> +        /* time to declare above q->target (0 if below)*/
>> +     ktime_t first_above_time;
>> +     ktime_t drop_next; /* time to drop next packet */
>> +};
>> +
>> +struct codel_skb_cb {
>> +     ktime_t enqueue_time;
>> +     char data[16];
>> +};
>> +
>> +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
>> +{
>> +     return (struct codel_skb_cb *)skb->cb;
>> +}
>
> You cant do that. You MUST keep the generic qdisc cb unchanged
>

I would prefer to have the timestamp taken on entry to the overall
system, if there were some way to leverage the
skb_shared_timestamps->syststamp facility.

But, otherwise noted and corrected.

> static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
> {
>        qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
>        return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
> }
>
>
>
>
>> +
>> +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) {
>> +     struct codel_skb_cb *cb = get_codel_cb(skb);
>
>        const struct codel_skb_cb *cb = get_codel_cb(skb);
>
>> +     return cb->enqueue_time;
>
>        or : return get_codel_cb(skb)->enqueue_time
>
>> +}
>> +

OK

>> +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) {
>> +     struct codel_skb_cb *cb = get_codel_cb(skb);
>> +     cb->enqueue_time = t;
>
>        get_codel_cb(skb)->enqueue_time = t;
>
>> +     return t;
>> +}
>> +


I read that and my eyes cross.

>> +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t)
>> +{
>> +     t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count);
>
> that's a killer for a lot of machines... (divide plus the int_sqrt())


If you think this is high overhead compared to everything else going
on in a box, I'm genuinely puzzled... 3 pages of function calls have
to be traversed to get here...

I am tending to agree now that the interval needn't be 64-bit.
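
If it moves to 32-bit usec values, I'd expect the control law to
reduce to something like this (sketch, untested; 'interval_us' is a
hypothetical field, and count is kept >= 1 by the callers as in the
ktime version):

	static u32 control_law_us(const struct codel_sched_data *q, u32 t_us)
	{
		/* next drop time closes in on now in proportion to 1/sqrt(count) */
		return t_us + (u32)(q->interval_us / int_sqrt(q->count));
	}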

>> +     return t;
>> +}
>> +
>> +/*
>> +static int codel_prob_mark(const struct codel_sched_data *q)
>> +{
>> +     return q->flags & TC_CODEL_ECN;
>
> Not clear if ECN plays well with CoDel at all.

Not clear to me either, wanted to leave room in the API to play with it later.
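
If/when it does get wired up, I'd expect the drop site to grow
something like this (sketch only; 'q->marks' is a hypothetical
counter, and it needs <net/inet_ecn.h>):

	if (codel_prob_mark(q) && INET_ECN_set_ce(skb)) {
		q->marks++;	/* set CE and count a mark instead of dropping */
	} else {
		codel_drop(skb, sch);
		q->count++;
	}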

>> +}
>> +*/
>> +
>> + /* wrappers for ultimate statistics collection */
>> +
>> +static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) {
>> +     return qdisc_drop(skb,sch);
>> +}
>> +
>> +/*
>> +static int codel_queue_drop(struct Qdisc *sch) {
>> +     return qdisc_drop(skb,sch);
>> +}
>> +*/
>

Aside from drops/marks as statistics, I wanted something like
'avg closure rate' and 'dropping intervals'.

To try to explain the first I'd have to refer to a 1/10th brilliant
paper about the microscopic behavior of tcp that Andrew and I were
discussing, which I can't find right now.  Andrew?

>> +struct sk_buff *codel_dequeue_head(struct Qdisc *sch) {
>> +     return(qdisc_dequeue_head(sch));
>> +}
>> +
>> +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now)
>> +{
>> +     struct codel_sched_data *q = qdisc_priv(sch);
>> +     bool drop = 0;
>
>        bool drop = false;
>
>> +     if (skb == NULL) {
>> +             q->first_above_time.tv64 = 0;
>> +     } else {
>> +             ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb));
>> +             if (sojourn_time.tv64 <
>> +                 q->target.tv64 || sch->qstats.backlog < q->minbytes) {
>> +/* went below so we’ll stay below for at least q->interval */
>> +                     q->first_above_time.tv64 = 0;
>> +             } else {
>> +                     if (q->first_above_time.tv64 == 0) {
>> +
>> +/* just went above from below. If we stay above
>> + * for at least q->interval we’ll say it’s ok to drop
>> + */
>> +                             q->first_above_time =
>> +                                     ktime_add(now,q->interval);
>> +                     } else if (now.tv64 >= q->first_above_time.tv64) {
>> +                             drop = 1;
>> +                     }
>> +             }
>> +     }
>> +     return drop;
>> +}
>> +
>> +static struct sk_buff *codel_dequeue(struct Qdisc *sch)
>> +{
>> +     struct codel_sched_data *q = qdisc_priv(sch);
>> +     struct sk_buff *skb = codel_dequeue_head(sch);
>> +     ktime_t now;
>> +     bool drop;
>
> missing newline

Kernel style is a newline after variable declarations? OK... thx.
>
>> +     if (skb == NULL) {
>> +             q->dropping = 0;
>> +             q->first_above_time.tv64 = 0;
>> +             return skb;
>> +     }
>> +     now = ktime_get();
>> +     drop = should_drop(skb, sch, now);
>> +     if (q->dropping) {
>> +             if (drop) {
>> +/* sojourn time below target - leave dropping state */
>> +                     q->dropping = 0;
>> +             } else if (now.tv64 >= q->drop_next.tv64) {
>> +/*
>> + * It’s time for the next drop. Drop the current packet and dequeue the next.
>> + * The dequeue might take us out of dropping state. If not, schedule the
>> + * next drop. A large backlog might result in drop rates so high that the next
>> + * drop should happen now, hence the ‘while’ loop.
>> + */
>> +                     while (now.tv64 >= q->drop_next.tv64 && q->dropping) {
>> +                             codel_drop(skb, sch);
>> +                             q->count++;
>> +                             skb = codel_dequeue_head(sch);
>> +                             if (should_drop(skb,sch,now)) {
>> +/* leave dropping state */
>> +                                     q->dropping = 0;
>> +                             } else {
>> +/* and schedule the next drop */
>> +                                     q->drop_next =
>> +                                             control_law(q,q->drop_next);
>> +                             }
>> +                     }
>> +             } else if (drop &&
>> +                        ((now.tv64 - q->drop_next.tv64 <
>> +                             16 * q->interval.tv64) ||
>> +                         (now.tv64 - q->first_above_time.tv64 >=
>> +                          2 * q->interval.tv64))) {
>> +                     codel_drop(skb, sch);
>> +                     skb = codel_dequeue_head(sch);
>> +                     /* engage state machine */
>> +                     drop = should_drop(skb,sch,now);
>> +                     q->dropping = 1;
>> +
>> +/* if min went above target close to when we last went below it
>> + * assume that the drop rate that controlled the queue on the
>> + * last cycle is a good starting point to control it now.
>> + */
>> +                     if (now.tv64 - q->drop_next.tv64 <
>> +                             16 * q->interval.tv64) {
>
> doing ktime_t compares is tricky. Here it's not done properly.

Yeah, um, I screwed this up globally. Rethinking now...
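
For the rethink I'm leaning toward the jiffies time_after() trick,
with all times held in a u32 of usecs (sketch):

	typedef u32 codel_time_t;

	/* wrap-safe comparisons, same idea as time_after() on jiffies */
	#define codel_time_after(a, b)		((s32)((a) - (b)) > 0)
	#define codel_time_after_eq(a, b)	((s32)((a) - (b)) >= 0)
	#define codel_time_before(a, b)		codel_time_after(b, a)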

> Are you sure we need 64bit and ns resolution ?

No. However, there are several things in my head that
overcomplicated this.

A typical 5 ms drop target is a LOT of packets on 10GigE. I am trying
to think ahead a decade or two or three, to a tiny equivalent of DCB
in the 1000s of processors connected directly to your brain implant.
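
(Back of the envelope: 10GigE at 1500-byte frames is roughly
10^10 / (1500 * 8) ~= 830k packets/sec, so a 5 ms target is ~4000
packets of standing queue; at minimum-size frames it's ~14.8M pps
and ~74000 packets.)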

> IMHO 'unsigned long' values are the best, for 32bit arches.
>
> CoDel paper mentions ms resolution...
> Even if you use us resolution, we can have about 2000 secs of window in
> an unsigned long.

I wasn't so much concerned with getting out to the Oort cloud as with
handling really fast gear.

>
>> +                             int c = q->count - 1;
>> +                             q->count = c < 1 ? 1 : c;
>> +                     } else {
>> +                             q->count = 1;
>> +                     }
>> +                     q->drop_next = control_law(q,now);
>> +             }
>> +     }
>> +     return skb;
>> +}
>> +
>> +
>> +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>> +{
>
> I am worried about the latency spike in dequeue(), if we need to drop
> hundreds of packets (while device is idle... If BQL is in use... it's
> really unfortunate)

I'm concerned about it too; I don't think anything else currently
loops in that call. As best I recall, that's under RCU protection?

>
> Could we try to check if we can drop right now one or two packets that
> were queued in previous ->queue() calls, to smooth the drop processing ?

I have been speculating wildly about ways to shorten the tail of
codel's compensation, and (especially) to only shoot the elephants,
not the mice, using fq techniques, but I have to wait to see this
stuff run as-is first.

Do you have a suggestion? I'm never big on dropping more than one
packet in a flow at a time.

>> +     if (likely(skb_queue_len(&sch->q) < sch->limit)) {
>> +             set_enqueue_time(skb,ktime_get());
>> +             return qdisc_enqueue_tail(skb, sch);
>> +     }
>> +     return qdisc_reshape_fail(skb, sch);
>> +}
>> +
>> +static int codel_change(struct Qdisc *sch, struct nlattr *opt)
>> +{
>> +     struct codel_sched_data *q = qdisc_priv(sch);
>> +     struct tc_codel_qopt *ctl = nla_data(opt);
>> +     unsigned int qlen;
>> +
>> +     if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
>> +             return -EINVAL;
>> +     if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536))
>> +             return -EINVAL;
>> +     if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536))
>> +             return -EINVAL;
>> +     sch_tree_lock(sch);
>> +     if (ctl->minbytes) q->minbytes = ctl->minbytes;
>
>        if (cond)    // NEWLINE
>                statement;
>
>> +     if (ctl->flags) q->flags = ctl->flags;
>> +     if (ctl->target) q->target = ns_to_ktime((u64) ctl->target * 1000);
>> +     if (ctl->interval) q->interval =
>> +                     ns_to_ktime((u64) ctl->interval * 1000);
>> +
>> +     /* something saner than this for depth is probably needed */
>> +
>> +     if (ctl->depth) sch->limit = ctl->depth;
>> +     qlen = sch->q.qlen;
>> +//   while (sch->q.qlen > ctl->depth)
>> +//           codel_drop(skb,sch);
>> +//   qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //?

I don't get the how/when/why of this commented-out bit.
Can you clue me in?

>> +     q->drop_next.tv64 = q->first_above_time.tv64 = 0;
>> +     q->dropping = 0; /* exit dropping state */
>> +     sch_tree_unlock(sch);
>> +     return 0;
>> +}
>> +
>> +static int codel_init(struct Qdisc *sch, struct nlattr *opt)
>> +{
>> +     struct codel_sched_data *q = qdisc_priv(sch);
>> +     q->target = MS2TIME(5);
>> +     sch->limit =  DEFAULT_CODEL_DEPTH;
>> +     q->minbytes = 1500;
>
>        Please dont hard code this value, use ETH_DATA_LEN, or better :
>
> psched_mtu(qdisc_dev(sch));

Done.

>
>> +     q->interval = MS2TIME(100);
>> +     if (opt) {
>> +             int err = codel_change(sch, opt);
>> +             if (err)
>> +                     return err;
>> +     }
>> +
>> +/*   if (sch->limit >= 1)
>> +             sch->flags |= TCQ_F_CAN_BYPASS;
>> +     else
>> +             sch->flags &= ~TCQ_F_CAN_BYPASS;
>> +*/
>> +     return 0;
>> +}

The use of the above functionality, which I found in other qdiscs,
was unclear to me. In particular, it seems like codel could run with
an unlimited queue just fine, but I was uncomfortable with doing that
as a first pass.
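
(As I understand it, TCQ_F_CAN_BYPASS lets the stack hand a packet
straight to the device when the queue is empty, skipping
enqueue/dequeue entirely, which would also skip our timestamping, so
leaving it off seems safer until the state machine is trusted.)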

>> +
>> +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
>> +{
>> +     struct codel_sched_data *q = qdisc_priv(sch);
>> +     struct tc_codel_qopt opt;
>> +     opt.target = (u32) ktime_to_us(q->target);
>> +     opt.interval  = (u32) ktime_to_us(q->interval);
>> +     opt.depth  = sch->limit;
>> +     opt.flags  = q->flags;
>> +     NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
>
> NLA_PUT() doesn't exist anymore in net-next tree, please rebase on
> net-next

I saw it was transitioning, but I would prefer to make this stable on
something I have deployed and get more people running it day to day...
then submit to net-next.
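
When I do rebase, I gather the conversion is mechanical, something like:

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;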

>> +     return skb->len;
>> +
>> +nla_put_failure:
>> +//   nlmsg_trim(skb, b);
>> +     return -1;
>> +}
>> +
>> +static void
>> +codel_reset(struct Qdisc *sch)
>> +{
>> +     struct sk_buff *skb;
>> +
>> +     while ((skb = codel_dequeue(sch)) != NULL)
>> +             kfree_skb(skb);
>
>
>> +}
>> +
>> +struct Qdisc_ops codel_qdisc_ops __read_mostly = {
>> +     .id             =       "codel",
>> +     .priv_size      =       sizeof(struct codel_sched_data),
>> +     .enqueue        =       codel_enqueue,
>> +     .dequeue        =       codel_dequeue,
>> +     .peek           =       qdisc_peek_head,
>> +/*   .drop           =       codel_queue_drop, */

I'm puzzled as to what this is supposed to really do and
who its callers are.

>> +     .init           =       codel_init,
>> +     .reset          =       codel_reset,
>> +     .change         =       codel_change,
>> +     .dump           =       codel_dump,
>> +     .owner          =       THIS_MODULE,
>> +};
>> +EXPORT_SYMBOL(codel_qdisc_ops);
>> +
>> +static int __init codel_module_init(void)
>> +{
>> +        return register_qdisc(&codel_qdisc_ops);
>> +}
>> +static void __exit codel_module_exit(void)
>> +{
>> +        unregister_qdisc(&codel_qdisc_ops);
>> +}
>> +module_init(codel_module_init)
>> +module_exit(codel_module_exit)
>> +MODULE_LICENSE("GPL");
>> +



-- 
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
http://www.bufferbloat.net


