* [Codel] [PATCH] iproute2: added preliminary codel support @ 2012-05-03 17:52 Dave Täht 2012-05-03 17:52 ` [Codel] [PATCH] Preliminary codel implementation Dave Täht 2012-05-03 17:59 ` [Codel] [PATCH] iproute2: added preliminary codel support Dave Taht 0 siblings, 2 replies; 11+ messages in thread From: Dave Täht @ 2012-05-03 17:52 UTC (permalink / raw) To: codel; +Cc: Dave Täht this makes certain constants in the code variables, but has seemingly good defaults. target and interval can be specified with a ms, s, or us parameter mtu is in bytes ecn, while part of the api, is not enabled in the code --- include/linux/pkt_sched.h | 28 ++++++++++ tc/Makefile | 1 + tc/q_codel.c | 135 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 tc/q_codel.c diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 410b33d..150aee9 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -654,4 +654,32 @@ struct tc_qfq_stats { __u32 lmax; }; +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_PARMS, + TCA_CODEL_TARGET, + TCA_CODEL_DEPTH, + TCA_CODEL_MINBYTES, + TCA_CODEL_INTERVAL, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) +#define TC_CODEL_ECN 1 + +struct tc_codel_qopt { + __u32 flags; /* flags (e.g. ecn) */ + __u32 target; /* max delay, in us */ + __u32 depth; /* queue depth in packets */ + __u32 minbytes; /* MTU (usually) */ + __u32 interval; /* Sliding min time window width (us) */ +}; + +struct tc_codel_stats { + __u64 drops; + __u64 marks; +}; + #endif diff --git a/tc/Makefile b/tc/Makefile index be8cd5a..8a7cc8d 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -47,6 +47,7 @@ TCMODULES += em_cmp.o TCMODULES += em_u32.o TCMODULES += em_meta.o TCMODULES += q_mqprio.o +TCMODULES += q_codel.o TCSO := ifeq ($(TC_CONFIG_ATM),y) diff --git a/tc/q_codel.c b/tc/q_codel.c new file mode 100644 index 0000000..2ff51a1 --- /dev/null +++ b/tc/q_codel.c @@ -0,0 +1,135 @@ +/* + * q_codel.c Controlled Delay + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Dave Taht <d@taht.net> + * + */ + + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, + "Usage: ... 
codel [target MAX_DELAY] [depth PACKETS]" + " [mtu MINBYTES]\n " + "[interval INTEGRATION_TIME] [ecn]\n" ); +} + +static int codel_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n) +{ + struct tc_codel_qopt opt; + struct rtattr *tail; + + memset(&opt, 0, sizeof(opt)); + opt.target = 5000; + opt.depth = 1000; + opt.interval = 1000*100; + opt.minbytes = 1500; + opt.flags = 0; + + while (argc > 0) { + if (strcmp(*argv, "target") == 0) { + NEXT_ARG(); + if (get_time(&opt.target, *argv)) { + fprintf(stderr, "Illegal \"target\"\n"); + return -1; + } + } else if (strcmp(*argv, "depth") == 0) { + NEXT_ARG(); + if (get_u32(&opt.depth, *argv, 0)) { + fprintf(stderr, "Illegal \"depth\"\n"); + return -1; + } + } else if (strcmp(*argv, "interval") == 0) { + NEXT_ARG(); + if (get_time(&opt.interval, *argv)) { + fprintf(stderr, "Illegal \"interval\"\n"); + return -1; + } + } else if (strcmp(*argv, "mtu") == 0) { + NEXT_ARG(); + if (get_u32(&opt.minbytes, *argv, 0)) { + fprintf(stderr, "Illegal \"mtu\"\n"); + return -1; + } + } else if (strcmp(*argv, "ecn") == 0) { + opt.flags |= TC_CODEL_ECN; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + tail = NLMSG_TAIL(n); + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 1024, TCA_CODEL_PARMS, &opt, sizeof(opt)); + tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail; + return 0; +} + +static int codel_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[__TCA_CODEL_MAX]; + struct tc_codel_qopt *qopt; + + if (opt == NULL) + return 0; + + parse_rtattr_nested(tb, TCA_CODEL_MAX, opt); + if (tb[TCA_CODEL_PARMS] == NULL) + return -1; + qopt = RTA_DATA(tb[TCA_CODEL_PARMS]); + if (RTA_PAYLOAD(tb[TCA_CODEL_PARMS]) < sizeof(*qopt)) + return -1; + fprintf(f," target %dus depth %d mtu %d interval %dus %s\n", + qopt->target, qopt->depth, qopt->minbytes, + qopt->interval, (qopt->flags & TC_CODEL_ECN) ? "ecn" : ""); + + return 0; +} + +static int codel_print_xstats(struct qdisc_util *qu, FILE *f, + struct rtattr *xstats) +{ + struct tc_codel_stats *st; + + if (xstats == NULL) + return 0; + + if (RTA_PAYLOAD(xstats) < sizeof(*st)) + return -1; + + st = RTA_DATA(xstats); + fprintf(f, " dropped %lu marked %lu ", + st->drops, st->marks); + + return 0; +} + +struct qdisc_util codel_qdisc_util = { + .id = "codel", + .parse_qopt = codel_parse_opt, + .print_qopt = codel_print_opt, + .print_xstats = codel_print_xstats, +}; -- 1.7.9.5 ^ permalink raw reply [flat|nested] 11+ messages in thread
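For illustration only: given the option parser in q_codel.c above, where target and interval go through get_time() (so ms, s, or us suffixes work) and depth/mtu are plain integers, a command line exercising the new qdisc could look like the following. The device name eth0 is just a placeholder, and the values simply mirror the defaults set in codel_parse_opt:

    # attach codel as the root qdisc, then inspect its settings and stats
    tc qdisc add dev eth0 root codel target 5ms interval 100ms depth 1000 mtu 1500
    tc -s qdisc show dev eth0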
* [Codel] [PATCH] Preliminary codel implementation 2012-05-03 17:52 [Codel] [PATCH] iproute2: added preliminary codel support Dave Täht @ 2012-05-03 17:52 ` Dave Täht 2012-05-03 18:01 ` Dave Taht ` (2 more replies) 2012-05-03 17:59 ` [Codel] [PATCH] iproute2: added preliminary codel support Dave Taht 1 sibling, 3 replies; 11+ messages in thread From: Dave Täht @ 2012-05-03 17:52 UTC (permalink / raw) To: codel; +Cc: Dave Täht [-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #1: Type: text/plain, Size: 10261 bytes --] --- include/linux/pkt_sched.h | 29 +++++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/sch_codel.c | 296 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 337 insertions(+) create mode 100644 net/sched/sch_codel.c diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 0d5b793..c21b720 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -633,4 +633,33 @@ struct tc_qfq_stats { __u32 lmax; }; +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_PARMS, + TCA_CODEL_TARGET, + TCA_CODEL_DEPTH, + TCA_CODEL_MINBYTES, + TCA_CODEL_INTERVAL, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) +#define TC_CODEL_ECN 1 + +struct tc_codel_qopt { + __u32 flags; /* flags (e.g. ecn) */ + __u32 target; /* max delay, in us */ + __u32 depth; /* queue depth in packets */ + __u32 minbytes; /* MTU (usually) */ + __u32 interval; /* Sliding min time window width (us) */ +}; + +struct tc_codel_stats { + __u64 drops; + __u64 marks; +}; + + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91..8106c42 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -250,6 +250,17 @@ config NET_SCH_QFQ If unsure, say N. +config NET_SCH_CODEL + tristate "Controlled Delay AQM (CODEL)" + help + Say Y here if you want to use the Controlled Delay (CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_codel. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c..41130b5 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 0000000..159df47 --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,296 @@ +/* + * net/sched/sch_codel.c A Codel implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Based on ns2 simulation code presented by Kathie Nichols + * Authors: Dave Täht <d@taht.net> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/ktime.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000)) +#define DEFAULT_CODEL_DEPTH 1000 + +/* Per-queue state (codel_queue_t instance variables) */ + +struct codel_sched_data { + u32 flags; + u32 minbytes; + u32 count; /* packets dropped since we went into drop state */ + bool dropping; /* 1 if in drop state, might just add to flags */ + ktime_t target; + ktime_t interval; + /* time to declare above q->target (0 if below)*/ + ktime_t first_above_time; + ktime_t drop_next; /* time to drop next packet */ +}; + +struct codel_skb_cb { + ktime_t enqueue_time; + char data[16]; +}; + +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) +{ + return (struct codel_skb_cb *)skb->cb; +} + +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) { + struct codel_skb_cb *cb = get_codel_cb(skb); + return cb->enqueue_time; +} + +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) { + struct codel_skb_cb *cb = get_codel_cb(skb); + cb->enqueue_time = t; + return t; +} + +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t) +{ + t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count); + return t; +} + +/* +static int codel_prob_mark(const struct codel_sched_data *q) +{ + return q->flags & TC_CODEL_ECN; +} +*/ + + /* wrappers for ultimate statistics collection */ + +static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) { + return qdisc_drop(skb,sch); +} + +/* +static int codel_queue_drop(struct Qdisc *sch) { + return qdisc_drop(skb,sch); +} +*/ + +struct sk_buff *codel_dequeue_head(struct Qdisc *sch) { + return(qdisc_dequeue_head(sch)); +} + +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now) +{ + struct codel_sched_data *q = qdisc_priv(sch); + bool drop = 0; + if (skb == NULL) { + q->first_above_time.tv64 = 0; + } else { + ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb)); + if (sojourn_time.tv64 < + q->target.tv64 || sch->qstats.backlog < q->minbytes) { +/* went below so we’ll stay below for at least q->interval */ + q->first_above_time.tv64 = 0; + } else { + if (q->first_above_time.tv64 == 0) { + +/* just went above from below. If we stay above + * for at least q->interval we’ll say it’s ok to drop + */ + q->first_above_time = + ktime_add(now,q->interval); + } else if (now.tv64 >= q->first_above_time.tv64) { + drop = 1; + } + } + } + return drop; +} + +static struct sk_buff *codel_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = codel_dequeue_head(sch); + ktime_t now; + bool drop; + if (skb == NULL) { + q->dropping = 0; + q->first_above_time.tv64 = 0; + return skb; + } + now = ktime_get(); + drop = should_drop(skb, sch, now); + if (q->dropping) { + if (drop) { +/* sojourn time below target - leave dropping state */ + q->dropping = 0; + } else if (now.tv64 >= q->drop_next.tv64) { +/* + * It’s time for the next drop. Drop the current packet and dequeue the next. + * The dequeue might take us out of dropping state. If not, schedule the + * next drop. A large backlog might result in drop rates so high that the next + * drop should happen now, hence the ‘while’ loop. 
+ */ + while (now.tv64 >= q->drop_next.tv64 && q->dropping) { + codel_drop(skb, sch); + q->count++; + skb = codel_dequeue_head(sch); + if (should_drop(skb,sch,now)) { +/* leave dropping state */ + q->dropping = 0; + } else { +/* and schedule the next drop */ + q->drop_next = + control_law(q,q->drop_next); + } + } + } else if (drop && + ((now.tv64 - q->drop_next.tv64 < + 16 * q->interval.tv64) || + (now.tv64 - q->first_above_time.tv64 >= + 2 * q->interval.tv64))) { + codel_drop(skb, sch); + skb = codel_dequeue_head(sch); + /* engage state machine */ + drop = should_drop(skb,sch,now); + q->dropping = 1; + +/* if min went above target close to when we last went below it + * assume that the drop rate that controlled the queue on the + * last cycle is a good starting point to control it now. + */ + if (now.tv64 - q->drop_next.tv64 < + 16 * q->interval.tv64) { + int c = q->count - 1; + q->count = c < 1 ? 1 : c; + } else { + q->count = 1; + } + q->drop_next = control_law(q,now); + } + } + return skb; +} + + +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(skb_queue_len(&sch->q) < sch->limit)) { + set_enqueue_time(skb,ktime_get()); + return qdisc_enqueue_tail(skb, sch); + } + return qdisc_reshape_fail(skb, sch); +} + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_qopt *ctl = nla_data(opt); + unsigned int qlen; + + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) + return -EINVAL; + if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536)) + return -EINVAL; + if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536)) + return -EINVAL; + sch_tree_lock(sch); + if (ctl->minbytes) q->minbytes = ctl->minbytes; + if (ctl->flags) q->flags = ctl->flags; + if (ctl->target) q->target = ns_to_ktime((u64) ctl->target * 1000); + if (ctl->interval) q->interval = + ns_to_ktime((u64) ctl->interval * 1000); + + /* something saner than this for depth is probably needed */ + + if (ctl->depth) sch->limit = ctl->depth; + qlen = sch->q.qlen; +// while (sch->q.qlen > ctl->depth) +// codel_drop(skb,sch); +// qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //? 
+ q->drop_next.tv64 = q->first_above_time.tv64 = 0; + q->dropping = 0; /* exit dropping state */ + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + q->target = MS2TIME(5); + sch->limit = DEFAULT_CODEL_DEPTH; + q->minbytes = 1500; + q->interval = MS2TIME(100); + if (opt) { + int err = codel_change(sch, opt); + if (err) + return err; + } + +/* if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; +*/ + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_qopt opt; + opt.target = (u32) ktime_to_us(q->target); + opt.interval = (u32) ktime_to_us(q->interval); + opt.depth = sch->limit; + opt.flags = q->flags; + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: +// nlmsg_trim(skb, b); + return -1; +} + +static void +codel_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = codel_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + .enqueue = codel_enqueue, + .dequeue = codel_dequeue, + .peek = qdisc_peek_head, +/* .drop = codel_queue_drop, */ + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .owner = THIS_MODULE, +}; +EXPORT_SYMBOL(codel_qdisc_ops); + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} +module_init(codel_module_init) +module_exit(codel_module_exit) +MODULE_LICENSE("GPL"); + -- 1.7.9.5 ^ permalink raw reply [flat|nested] 11+ messages in thread
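For reference, the scheduling rule implemented by control_law() in the patch above spaces successive drops interval/sqrt(count) apart, so the drop rate rises with the square root of the number of drops in the current dropping state. A minimal sketch of the same arithmetic on raw 64-bit nanoseconds (rather than the ktime_t fields the patch uses; count is assumed to be at least 1):

    /* illustrative only -- same math as control_law() in sch_codel.c */
    static u64 codel_control_law_ns(u64 t_ns, u64 interval_ns, u32 count)
    {
            /* each drop is scheduled interval/sqrt(count) after the last */
            return t_ns + interval_ns / int_sqrt(count);
    }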
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 17:52 ` [Codel] [PATCH] Preliminary codel implementation Dave Täht @ 2012-05-03 18:01 ` Dave Taht 2012-05-03 18:17 ` Rick Jones 2012-05-03 18:19 ` Eric Dumazet 2012-05-03 18:17 ` Eric Dumazet 2012-05-03 20:47 ` Jim Gettys 2 siblings, 2 replies; 11+ messages in thread From: Dave Taht @ 2012-05-03 18:01 UTC (permalink / raw) To: Dave Täht; +Cc: codel This does compile, and does not crash the kernel, and it even passes traffic! but... as noted I goofed on the netlink interface somehow, and as this is a transliteration from C++ and ns2 slides into C and linux network scheduling code, that I started writing at 2AM... I could also have goofed in multiple other places. I built these against linux-stable 3.3.4 and iproute current git head. So a bit of review for logical and technical correctness is highly desired. On Thu, May 3, 2012 at 10:52 AM, Dave Täht <dave.taht@bufferbloat.net> wrote: > --- > include/linux/pkt_sched.h | 29 +++++ > net/sched/Kconfig | 11 ++ > net/sched/Makefile | 1 + > net/sched/sch_codel.c | 296 +++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 337 insertions(+) > create mode 100644 net/sched/sch_codel.c > > diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h > index 0d5b793..c21b720 100644 > --- a/include/linux/pkt_sched.h > +++ b/include/linux/pkt_sched.h > @@ -633,4 +633,33 @@ struct tc_qfq_stats { > __u32 lmax; > }; > > +/* CODEL */ > + > +enum { > + TCA_CODEL_UNSPEC, > + TCA_CODEL_PARMS, > + TCA_CODEL_TARGET, > + TCA_CODEL_DEPTH, > + TCA_CODEL_MINBYTES, > + TCA_CODEL_INTERVAL, > + __TCA_CODEL_MAX > +}; > + > +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) > +#define TC_CODEL_ECN 1 > + > +struct tc_codel_qopt { > + __u32 flags; /* flags (e.g. ecn) */ > + __u32 target; /* max delay, in us */ > + __u32 depth; /* queue depth in packets */ > + __u32 minbytes; /* MTU (usually) */ > + __u32 interval; /* Sliding min time window width (us) */ > +}; > + > +struct tc_codel_stats { > + __u64 drops; > + __u64 marks; > +}; > + > + > #endif > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index 2590e91..8106c42 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -250,6 +250,17 @@ config NET_SCH_QFQ > > If unsure, say N. > > +config NET_SCH_CODEL > + tristate "Controlled Delay AQM (CODEL)" > + help > + Say Y here if you want to use the Controlled Delay (CODEL) > + packet scheduling algorithm. > + > + To compile this driver as a module, choose M here: the module > + will be called sch_codel. > + > + If unsure, say N. 
> + > config NET_SCH_INGRESS > tristate "Ingress Qdisc" > depends on NET_CLS_ACT > diff --git a/net/sched/Makefile b/net/sched/Makefile > index dc5889c..41130b5 100644 > --- a/net/sched/Makefile > +++ b/net/sched/Makefile > @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o > obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o > obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o > obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o > +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o > > obj-$(CONFIG_NET_CLS_U32) += cls_u32.o > obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o > diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c > new file mode 100644 > index 0000000..159df47 > --- /dev/null > +++ b/net/sched/sch_codel.c > @@ -0,0 +1,296 @@ > +/* > + * net/sched/sch_codel.c A Codel implementation > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + * > + * Based on ns2 simulation code presented by Kathie Nichols > + * Authors: Dave Täht <d@taht.net> > + */ > + > +#include <linux/module.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/kernel.h> > +#include <linux/errno.h> > +#include <linux/ktime.h> > +#include <linux/skbuff.h> > +#include <net/pkt_sched.h> > + > +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000)) > +#define DEFAULT_CODEL_DEPTH 1000 > + > +/* Per-queue state (codel_queue_t instance variables) */ > + > +struct codel_sched_data { > + u32 flags; > + u32 minbytes; > + u32 count; /* packets dropped since we went into drop state */ > + bool dropping; /* 1 if in drop state, might just add to flags */ > + ktime_t target; > + ktime_t interval; > + /* time to declare above q->target (0 if below)*/ > + ktime_t first_above_time; > + ktime_t drop_next; /* time to drop next packet */ > +}; > + > +struct codel_skb_cb { > + ktime_t enqueue_time; > + char data[16]; > +}; > + > +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) > +{ > + return (struct codel_skb_cb *)skb->cb; > +} > + > +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) { > + struct codel_skb_cb *cb = get_codel_cb(skb); > + return cb->enqueue_time; > +} > + > +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) { > + struct codel_skb_cb *cb = get_codel_cb(skb); > + cb->enqueue_time = t; > + return t; > +} > + > +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t) > +{ > + t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count); > + return t; > +} > + > +/* > +static int codel_prob_mark(const struct codel_sched_data *q) > +{ > + return q->flags & TC_CODEL_ECN; > +} > +*/ > + > + /* wrappers for ultimate statistics collection */ > + > +static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) { > + return qdisc_drop(skb,sch); > +} > + > +/* > +static int codel_queue_drop(struct Qdisc *sch) { > + return qdisc_drop(skb,sch); > +} > +*/ > + > +struct sk_buff *codel_dequeue_head(struct Qdisc *sch) { > + return(qdisc_dequeue_head(sch)); > +} > + > +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + bool drop = 0; > + if (skb == NULL) { > + q->first_above_time.tv64 = 0; > + } else { > + ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb)); > + if (sojourn_time.tv64 < > + q->target.tv64 || sch->qstats.backlog < q->minbytes) { > +/* went below 
so we’ll stay below for at least q->interval */ > + q->first_above_time.tv64 = 0; > + } else { > + if (q->first_above_time.tv64 == 0) { > + > +/* just went above from below. If we stay above > + * for at least q->interval we’ll say it’s ok to drop > + */ > + q->first_above_time = > + ktime_add(now,q->interval); > + } else if (now.tv64 >= q->first_above_time.tv64) { > + drop = 1; > + } > + } > + } > + return drop; > +} > + > +static struct sk_buff *codel_dequeue(struct Qdisc *sch) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + struct sk_buff *skb = codel_dequeue_head(sch); > + ktime_t now; > + bool drop; > + if (skb == NULL) { > + q->dropping = 0; > + q->first_above_time.tv64 = 0; > + return skb; > + } > + now = ktime_get(); > + drop = should_drop(skb, sch, now); > + if (q->dropping) { > + if (drop) { > +/* sojourn time below target - leave dropping state */ > + q->dropping = 0; > + } else if (now.tv64 >= q->drop_next.tv64) { > +/* > + * It’s time for the next drop. Drop the current packet and dequeue the next. > + * The dequeue might take us out of dropping state. If not, schedule the > + * next drop. A large backlog might result in drop rates so high that the next > + * drop should happen now, hence the ‘while’ loop. > + */ > + while (now.tv64 >= q->drop_next.tv64 && q->dropping) { > + codel_drop(skb, sch); > + q->count++; > + skb = codel_dequeue_head(sch); > + if (should_drop(skb,sch,now)) { > +/* leave dropping state */ > + q->dropping = 0; > + } else { > +/* and schedule the next drop */ > + q->drop_next = > + control_law(q,q->drop_next); > + } > + } > + } else if (drop && > + ((now.tv64 - q->drop_next.tv64 < > + 16 * q->interval.tv64) || > + (now.tv64 - q->first_above_time.tv64 >= > + 2 * q->interval.tv64))) { > + codel_drop(skb, sch); > + skb = codel_dequeue_head(sch); > + /* engage state machine */ > + drop = should_drop(skb,sch,now); > + q->dropping = 1; > + > +/* if min went above target close to when we last went below it > + * assume that the drop rate that controlled the queue on the > + * last cycle is a good starting point to control it now. > + */ > + if (now.tv64 - q->drop_next.tv64 < > + 16 * q->interval.tv64) { > + int c = q->count - 1; > + q->count = c < 1 ? 
1 : c; > + } else { > + q->count = 1; > + } > + q->drop_next = control_law(q,now); > + } > + } > + return skb; > +} > + > + > +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) > +{ > + if (likely(skb_queue_len(&sch->q) < sch->limit)) { > + set_enqueue_time(skb,ktime_get()); > + return qdisc_enqueue_tail(skb, sch); > + } > + return qdisc_reshape_fail(skb, sch); > +} > + > +static int codel_change(struct Qdisc *sch, struct nlattr *opt) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + struct tc_codel_qopt *ctl = nla_data(opt); > + unsigned int qlen; > + > + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) > + return -EINVAL; > + if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536)) > + return -EINVAL; > + if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536)) > + return -EINVAL; > + sch_tree_lock(sch); > + if (ctl->minbytes) q->minbytes = ctl->minbytes; > + if (ctl->flags) q->flags = ctl->flags; > + if (ctl->target) q->target = ns_to_ktime((u64) ctl->target * 1000); > + if (ctl->interval) q->interval = > + ns_to_ktime((u64) ctl->interval * 1000); > + > + /* something saner than this for depth is probably needed */ > + > + if (ctl->depth) sch->limit = ctl->depth; > + qlen = sch->q.qlen; > +// while (sch->q.qlen > ctl->depth) > +// codel_drop(skb,sch); > +// qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //? > + q->drop_next.tv64 = q->first_above_time.tv64 = 0; > + q->dropping = 0; /* exit dropping state */ > + sch_tree_unlock(sch); > + return 0; > +} > + > +static int codel_init(struct Qdisc *sch, struct nlattr *opt) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + q->target = MS2TIME(5); > + sch->limit = DEFAULT_CODEL_DEPTH; > + q->minbytes = 1500; > + q->interval = MS2TIME(100); > + if (opt) { > + int err = codel_change(sch, opt); > + if (err) > + return err; > + } > + > +/* if (sch->limit >= 1) > + sch->flags |= TCQ_F_CAN_BYPASS; > + else > + sch->flags &= ~TCQ_F_CAN_BYPASS; > +*/ > + return 0; > +} > + > +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + struct tc_codel_qopt opt; > + opt.target = (u32) ktime_to_us(q->target); > + opt.interval = (u32) ktime_to_us(q->interval); > + opt.depth = sch->limit; > + opt.flags = q->flags; > + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); > + return skb->len; > + > +nla_put_failure: > +// nlmsg_trim(skb, b); > + return -1; > +} > + > +static void > +codel_reset(struct Qdisc *sch) > +{ > + struct sk_buff *skb; > + > + while ((skb = codel_dequeue(sch)) != NULL) > + kfree_skb(skb); > +} > + > +struct Qdisc_ops codel_qdisc_ops __read_mostly = { > + .id = "codel", > + .priv_size = sizeof(struct codel_sched_data), > + .enqueue = codel_enqueue, > + .dequeue = codel_dequeue, > + .peek = qdisc_peek_head, > +/* .drop = codel_queue_drop, */ > + .init = codel_init, > + .reset = codel_reset, > + .change = codel_change, > + .dump = codel_dump, > + .owner = THIS_MODULE, > +}; > +EXPORT_SYMBOL(codel_qdisc_ops); > + > +static int __init codel_module_init(void) > +{ > + return register_qdisc(&codel_qdisc_ops); > +} > +static void __exit codel_module_exit(void) > +{ > + unregister_qdisc(&codel_qdisc_ops); > +} > +module_init(codel_module_init) > +module_exit(codel_module_exit) > +MODULE_LICENSE("GPL"); > + > -- > 1.7.9.5 > > > _______________________________________________ > Codel mailing list > Codel@lists.bufferbloat.net > https://lists.bufferbloat.net/listinfo/codel > -- Dave Täht SKYPE: davetaht US Tel: 1-239-829-5608 
http://www.bufferbloat.net ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 18:01 ` Dave Taht @ 2012-05-03 18:17 ` Rick Jones 2012-05-03 18:19 ` Eric Dumazet 1 sibling, 0 replies; 11+ messages in thread From: Rick Jones @ 2012-05-03 18:17 UTC (permalink / raw) To: Dave Taht; +Cc: codel, Dave Täht Some nits, may not be substantive. >> + >> +struct tc_codel_qopt { >> + __u32 flags; /* flags (e.g. ecn) */ >> + __u32 target; /* max delay, in us */ >> + __u32 depth; /* queue depth in packets */ >> + __u32 minbytes; /* MTU (usually) */ >> + __u32 interval; /* Sliding min time window width (us) */ >> +}; Perhaps include the units in target and interval - eg target_usec? Or go full-CS101 with target_delay_usec and action_delay_usec (interval)? >> + >> +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000)) >> +#define DEFAULT_CODEL_DEPTH 1000 >> + >> +/* Per-queue state (codel_queue_t instance variables) */ >> + >> +struct codel_sched_data { >> + u32 flags; >> + u32 minbytes; >> + u32 count; /* packets dropped since we went into drop state */ >> + bool dropping; /* 1 if in drop state, might just add to flags */ >> + ktime_t target; >> + ktime_t interval; >> + /* time to declare above q->target (0 if below)*/ >> + ktime_t first_above_time; >> + ktime_t drop_next; /* time to drop next packet */ >> +}; Similar sort of thing here, though I suspect ktime_t implies units already. Also is drop_next the time to arbitrarily drop the next packet or just consider it again - eg consider_drop_next? >> +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + bool drop = 0; >> + if (skb == NULL) { >> + q->first_above_time.tv64 = 0; >> + } else { >> + ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb)); >> + if (sojourn_time.tv64< >> + q->target.tv64 || sch->qstats.backlog< q->minbytes) { >> +/* went below so we’ll stay below for at least q->interval */ >> + q->first_above_time.tv64 = 0; >> + } else { >> + if (q->first_above_time.tv64 == 0) { >> + >> +/* just went above from below. If we stay above >> + * for at least q->interval we’ll say it’s ok to drop >> + */ Indentation on the comment? >> + q->first_above_time = >> + ktime_add(now,q->interval); >> + } else if (now.tv64>= q->first_above_time.tv64) { >> + drop = 1; >> + } >> + } >> + } >> + return drop; >> +} >> + >> +static struct sk_buff *codel_dequeue(struct Qdisc *sch) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + struct sk_buff *skb = codel_dequeue_head(sch); >> + ktime_t now; >> + bool drop; >> + if (skb == NULL) { >> + q->dropping = 0; >> + q->first_above_time.tv64 = 0; >> + return skb; >> + } >> + now = ktime_get(); >> + drop = should_drop(skb, sch, now); >> + if (q->dropping) { >> + if (drop) { >> +/* sojourn time below target - leave dropping state */ >> + q->dropping = 0; >> + } else if (now.tv64>= q->drop_next.tv64) { >> +/* >> + * It’s time for the next drop. Drop the current packet and dequeue the next. >> + * The dequeue might take us out of dropping state. If not, schedule the >> + * next drop. A large backlog might result in drop rates so high that the next >> + * drop should happen now, hence the ‘while’ loop. >> + */ Comment indentation? rick ^ permalink raw reply [flat|nested] 11+ messages in thread
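To make Rick's naming point concrete, the uapi struct with units folded into the field names might read as below; these names are purely illustrative and not part of the posted patch:

    struct tc_codel_qopt {
            __u32 flags;          /* e.g. TC_CODEL_ECN */
            __u32 target_usec;    /* acceptable standing delay, in microseconds */
            __u32 depth;          /* queue depth limit, in packets */
            __u32 minbytes;       /* MTU (usually) */
            __u32 interval_usec;  /* sliding-minimum window width, in microseconds */
    };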
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 18:01 ` Dave Taht 2012-05-03 18:17 ` Rick Jones @ 2012-05-03 18:19 ` Eric Dumazet 1 sibling, 0 replies; 11+ messages in thread From: Eric Dumazet @ 2012-05-03 18:19 UTC (permalink / raw) To: Dave Taht; +Cc: codel, Dave Täht On Thu, 2012-05-03 at 11:01 -0700, Dave Taht wrote: > This does compile, and does not crash the kernel, and it even > passes traffic! but... > > as noted I goofed on the netlink interface somehow, > and as this is a transliteration from C++ and ns2 slides into > C and linux network scheduling code, that I started writing at 2AM... > > I could also have goofed in multiple other places. > > I built these against linux-stable 3.3.4 and iproute current git head. > > So a bit of review for logical and technical correctness is highly desired. I believe you can take time to test and polish your code before sending it. No hurry, this wont save the planet. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 17:52 ` [Codel] [PATCH] Preliminary codel implementation Dave Täht 2012-05-03 18:01 ` Dave Taht @ 2012-05-03 18:17 ` Eric Dumazet 2012-05-03 22:38 ` Dave Taht 2012-05-03 20:47 ` Jim Gettys 2 siblings, 1 reply; 11+ messages in thread From: Eric Dumazet @ 2012-05-03 18:17 UTC (permalink / raw) To: Dave Täht; +Cc: codel On Thu, 2012-05-03 at 10:52 -0700, Dave Täht wrote: > --- > include/linux/pkt_sched.h | 29 +++++ > net/sched/Kconfig | 11 ++ > net/sched/Makefile | 1 + > net/sched/sch_codel.c | 296 +++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 337 insertions(+) > create mode 100644 net/sched/sch_codel.c > Hi Dave. Was this tested or is just a draft ? > diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h > index 0d5b793..c21b720 100644 > --- a/include/linux/pkt_sched.h > +++ b/include/linux/pkt_sched.h > @@ -633,4 +633,33 @@ struct tc_qfq_stats { > __u32 lmax; > }; > > +/* CODEL */ > + > +enum { > + TCA_CODEL_UNSPEC, > + TCA_CODEL_PARMS, > + TCA_CODEL_TARGET, > + TCA_CODEL_DEPTH, > + TCA_CODEL_MINBYTES, > + TCA_CODEL_INTERVAL, > + __TCA_CODEL_MAX > +}; > + > +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) > +#define TC_CODEL_ECN 1 > + > +struct tc_codel_qopt { > + __u32 flags; /* flags (e.g. ecn) */ > + __u32 target; /* max delay, in us */ > + __u32 depth; /* queue depth in packets */ > + __u32 minbytes; /* MTU (usually) */ > + __u32 interval; /* Sliding min time window width (us) */ > +}; > + > +struct tc_codel_stats { > + __u64 drops; > + __u64 marks; > +}; > + > + > #endif > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index 2590e91..8106c42 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -250,6 +250,17 @@ config NET_SCH_QFQ > > If unsure, say N. > > +config NET_SCH_CODEL > + tristate "Controlled Delay AQM (CODEL)" > + help > + Say Y here if you want to use the Controlled Delay (CODEL) > + packet scheduling algorithm. > + > + To compile this driver as a module, choose M here: the module > + will be called sch_codel. > + > + If unsure, say N. > + > config NET_SCH_INGRESS > tristate "Ingress Qdisc" > depends on NET_CLS_ACT > diff --git a/net/sched/Makefile b/net/sched/Makefile > index dc5889c..41130b5 100644 > --- a/net/sched/Makefile > +++ b/net/sched/Makefile > @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o > obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o > obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o > obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o > +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o > > obj-$(CONFIG_NET_CLS_U32) += cls_u32.o > obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o > diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c > new file mode 100644 > index 0000000..159df47 > --- /dev/null > +++ b/net/sched/sch_codel.c > @@ -0,0 +1,296 @@ > +/* > + * net/sched/sch_codel.c A Codel implementation > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. 
> + * > + * Based on ns2 simulation code presented by Kathie Nichols > + * Authors: Dave Täht <d@taht.net> > + */ > + > +#include <linux/module.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/kernel.h> > +#include <linux/errno.h> > +#include <linux/ktime.h> > +#include <linux/skbuff.h> > +#include <net/pkt_sched.h> > + > +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000)) > +#define DEFAULT_CODEL_DEPTH 1000 > + > +/* Per-queue state (codel_queue_t instance variables) */ > + > +struct codel_sched_data { > + u32 flags; > + u32 minbytes; > + u32 count; /* packets dropped since we went into drop state */ > + bool dropping; /* 1 if in drop state, might just add to flags */ > + ktime_t target; > + ktime_t interval; > + /* time to declare above q->target (0 if below)*/ > + ktime_t first_above_time; > + ktime_t drop_next; /* time to drop next packet */ > +}; > + > +struct codel_skb_cb { > + ktime_t enqueue_time; > + char data[16]; > +}; > + > +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) > +{ > + return (struct codel_skb_cb *)skb->cb; > +} You cant do that. You MUST keep the generic qdisc cb unchanged { qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb)); return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data; } > + > +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) { > + struct codel_skb_cb *cb = get_codel_cb(skb); const struct codel_skb_cb *cb = get_codel_cb(skb); > + return cb->enqueue_time; or : return get_codel_cb(skb)->enqueue_time > +} > + > +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) { > + struct codel_skb_cb *cb = get_codel_cb(skb); > + cb->enqueue_time = t; get_codel_cb(skb)->enqueue_time = t; > + return t; > +} > + > +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t) > +{ > + t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count); thats a killer for lot of machines... (divide plus the int_sqrt()) > + return t; > +} > + > +/* > +static int codel_prob_mark(const struct codel_sched_data *q) > +{ > + return q->flags & TC_CODEL_ECN; Not clear if ECN plays well with CoDel at all. > +} > +*/ > + > + /* wrappers for ultimate statistics collection */ > + > +static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) { > + return qdisc_drop(skb,sch); > +} > + > +/* > +static int codel_queue_drop(struct Qdisc *sch) { > + return qdisc_drop(skb,sch); > +} > +*/ > + > +struct sk_buff *codel_dequeue_head(struct Qdisc *sch) { > + return(qdisc_dequeue_head(sch)); > +} > + > +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + bool drop = 0; bool drop = false; > + if (skb == NULL) { > + q->first_above_time.tv64 = 0; > + } else { > + ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb)); > + if (sojourn_time.tv64 < > + q->target.tv64 || sch->qstats.backlog < q->minbytes) { > +/* went below so we’ll stay below for at least q->interval */ > + q->first_above_time.tv64 = 0; > + } else { > + if (q->first_above_time.tv64 == 0) { > + > +/* just went above from below. 
If we stay above > + * for at least q->interval we’ll say it’s ok to drop > + */ > + q->first_above_time = > + ktime_add(now,q->interval); > + } else if (now.tv64 >= q->first_above_time.tv64) { > + drop = 1; > + } > + } > + } > + return drop; > +} > + > +static struct sk_buff *codel_dequeue(struct Qdisc *sch) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + struct sk_buff *skb = codel_dequeue_head(sch); > + ktime_t now; > + bool drop; missing newline > + if (skb == NULL) { > + q->dropping = 0; > + q->first_above_time.tv64 = 0; > + return skb; > + } > + now = ktime_get(); > + drop = should_drop(skb, sch, now); > + if (q->dropping) { > + if (drop) { > +/* sojourn time below target - leave dropping state */ > + q->dropping = 0; > + } else if (now.tv64 >= q->drop_next.tv64) { > +/* > + * It’s time for the next drop. Drop the current packet and dequeue the next. > + * The dequeue might take us out of dropping state. If not, schedule the > + * next drop. A large backlog might result in drop rates so high that the next > + * drop should happen now, hence the ‘while’ loop. > + */ > + while (now.tv64 >= q->drop_next.tv64 && q->dropping) { > + codel_drop(skb, sch); > + q->count++; > + skb = codel_dequeue_head(sch); > + if (should_drop(skb,sch,now)) { > +/* leave dropping state */ > + q->dropping = 0; > + } else { > +/* and schedule the next drop */ > + q->drop_next = > + control_law(q,q->drop_next); > + } > + } > + } else if (drop && > + ((now.tv64 - q->drop_next.tv64 < > + 16 * q->interval.tv64) || > + (now.tv64 - q->first_above_time.tv64 >= > + 2 * q->interval.tv64))) { > + codel_drop(skb, sch); > + skb = codel_dequeue_head(sch); > + /* engage state machine */ > + drop = should_drop(skb,sch,now); > + q->dropping = 1; > + > +/* if min went above target close to when we last went below it > + * assume that the drop rate that controlled the queue on the > + * last cycle is a good starting point to control it now. > + */ > + if (now.tv64 - q->drop_next.tv64 < > + 16 * q->interval.tv64) { doing ktime_t compares is tricky. Here its not done properly. Are you sure we need 64bit and ns resolution ? IMHO 'unsigned long' values are the best, for 32bit arches. CoDel paper mentions ms resolution... Even if you use us resolution, we can have about 2000 secs of window in an unsigned long. > + int c = q->count - 1; > + q->count = c < 1 ? 1 : c; > + } else { > + q->count = 1; > + } > + q->drop_next = control_law(q,now); > + } > + } > + return skb; > +} > + > + > +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) > +{ I am worried about the latency spike in dequeue(), if we need to drop hundred of packets (while device is idle... If BQL is in use... its really unfortunate) Could we try to check if we can drop right now one or two packets that were queued in previous ->queue() calls, to smooth the drop processing ? 
> + if (likely(skb_queue_len(&sch->q) < sch->limit)) { > + set_enqueue_time(skb,ktime_get()); > + return qdisc_enqueue_tail(skb, sch); > + } > + return qdisc_reshape_fail(skb, sch); > +} > + > +static int codel_change(struct Qdisc *sch, struct nlattr *opt) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + struct tc_codel_qopt *ctl = nla_data(opt); > + unsigned int qlen; > + > + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) > + return -EINVAL; > + if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536)) > + return -EINVAL; > + if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536)) > + return -EINVAL; > + sch_tree_lock(sch); > + if (ctl->minbytes) q->minbytes = ctl->minbytes; if (cond) // NEWLINE statement; > + if (ctl->flags) q->flags = ctl->flags; > + if (ctl->target) q->target = ns_to_ktime((u64) ctl->target * 1000); > + if (ctl->interval) q->interval = > + ns_to_ktime((u64) ctl->interval * 1000); > + > + /* something saner than this for depth is probably needed */ > + > + if (ctl->depth) sch->limit = ctl->depth; > + qlen = sch->q.qlen; > +// while (sch->q.qlen > ctl->depth) > +// codel_drop(skb,sch); > +// qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //? > + q->drop_next.tv64 = q->first_above_time.tv64 = 0; > + q->dropping = 0; /* exit dropping state */ > + sch_tree_unlock(sch); > + return 0; > +} > + > +static int codel_init(struct Qdisc *sch, struct nlattr *opt) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + q->target = MS2TIME(5); > + sch->limit = DEFAULT_CODEL_DEPTH; > + q->minbytes = 1500; Please dont hard code this value, use ETH_DATA_LEN, or better : psched_mtu(qdisc_dev(sch)); > + q->interval = MS2TIME(100); > + if (opt) { > + int err = codel_change(sch, opt); > + if (err) > + return err; > + } > + > +/* if (sch->limit >= 1) > + sch->flags |= TCQ_F_CAN_BYPASS; > + else > + sch->flags &= ~TCQ_F_CAN_BYPASS; > +*/ > + return 0; > +} > + > +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) > +{ > + struct codel_sched_data *q = qdisc_priv(sch); > + struct tc_codel_qopt opt; > + opt.target = (u32) ktime_to_us(q->target); > + opt.interval = (u32) ktime_to_us(q->interval); > + opt.depth = sch->limit; > + opt.flags = q->flags; > + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); NLA_PUT() doesnt exist anymore in net-next tree, please rebase on net-next > + return skb->len; > + > +nla_put_failure: > +// nlmsg_trim(skb, b); > + return -1; > +} > + > +static void > +codel_reset(struct Qdisc *sch) > +{ > + struct sk_buff *skb; > + > + while ((skb = codel_dequeue(sch)) != NULL) > + kfree_skb(skb); > +} > + > +struct Qdisc_ops codel_qdisc_ops __read_mostly = { > + .id = "codel", > + .priv_size = sizeof(struct codel_sched_data), > + .enqueue = codel_enqueue, > + .dequeue = codel_dequeue, > + .peek = qdisc_peek_head, > +/* .drop = codel_queue_drop, */ > + .init = codel_init, > + .reset = codel_reset, > + .change = codel_change, > + .dump = codel_dump, > + .owner = THIS_MODULE, > +}; > +EXPORT_SYMBOL(codel_qdisc_ops); > + > +static int __init codel_module_init(void) > +{ > + return register_qdisc(&codel_qdisc_ops); > +} > +static void __exit codel_module_exit(void) > +{ > + unregister_qdisc(&codel_qdisc_ops); > +} > +module_init(codel_module_init) > +module_exit(codel_module_exit) > +MODULE_LICENSE("GPL"); > + > _______________________________________________ > Codel mailing list > Codel@lists.bufferbloat.net > https://lists.bufferbloat.net/listinfo/codel ^ permalink raw reply [flat|nested] 11+ messages in thread
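Two of Eric's points above are easy to misread without code, so here is a small sketch of what they could look like together: the per-packet timestamp stored in the qdisc-private part of skb->cb (via qdisc_cb_private_validate(), as he shows), and time kept as plain 32-bit microseconds with a wrap-safe comparison instead of ktime_t arithmetic. The helper names and the choice of u32 microseconds are assumptions for illustration, not code from the patch:

    /* sketch: timestamp lives in the qdisc-private area of skb->cb */
    struct codel_skb_cb {
            u32 enqueue_time_us;
    };

    static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
    {
            qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
            return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
    }

    /* current time, truncated to u32 microseconds (wraps after ~71 minutes) */
    static u32 codel_time_now_us(void)
    {
            return (u32)ktime_to_us(ktime_get());
    }

    /* wrap-safe "a is later than b" test for u32 microsecond stamps */
    static bool codel_time_after(u32 a, u32 b)
    {
            return (s32)(a - b) > 0;
    }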
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 18:17 ` Eric Dumazet @ 2012-05-03 22:38 ` Dave Taht 0 siblings, 0 replies; 11+ messages in thread From: Dave Taht @ 2012-05-03 22:38 UTC (permalink / raw) To: Eric Dumazet; +Cc: codel, Dave Täht On Thu, May 3, 2012 at 11:17 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Thu, 2012-05-03 at 10:52 -0700, Dave Täht wrote: >> --- >> include/linux/pkt_sched.h | 29 +++++ >> net/sched/Kconfig | 11 ++ >> net/sched/Makefile | 1 + >> net/sched/sch_codel.c | 296 +++++++++++++++++++++++++++++++++++++++++++++ >> 4 files changed, 337 insertions(+) >> create mode 100644 net/sched/sch_codel.c >> > > > Hi Dave. > > Was this tested or is just a draft ? It compiled, seemed logically correct (since disproven), passedpackets and didn't crash. :) I was happy with the night's work for all of 10 minutes. I do massively appreciate your feedback and advice, as these are areas of the kernel that are opaque to most. > >> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h >> index 0d5b793..c21b720 100644 >> --- a/include/linux/pkt_sched.h >> +++ b/include/linux/pkt_sched.h >> @@ -633,4 +633,33 @@ struct tc_qfq_stats { >> __u32 lmax; >> }; >> >> +/* CODEL */ >> + >> +enum { >> + TCA_CODEL_UNSPEC, >> + TCA_CODEL_PARMS, >> + TCA_CODEL_TARGET, >> + TCA_CODEL_DEPTH, >> + TCA_CODEL_MINBYTES, >> + TCA_CODEL_INTERVAL, >> + __TCA_CODEL_MAX >> +}; >> + >> +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) >> +#define TC_CODEL_ECN 1 >> + >> +struct tc_codel_qopt { >> + __u32 flags; /* flags (e.g. ecn) */ >> + __u32 target; /* max delay, in us */ >> + __u32 depth; /* queue depth in packets */ >> + __u32 minbytes; /* MTU (usually) */ >> + __u32 interval; /* Sliding min time window width (us) */ >> +}; >> + >> +struct tc_codel_stats { >> + __u64 drops; >> + __u64 marks; >> +}; >> + >> + >> #endif >> diff --git a/net/sched/Kconfig b/net/sched/Kconfig >> index 2590e91..8106c42 100644 >> --- a/net/sched/Kconfig >> +++ b/net/sched/Kconfig >> @@ -250,6 +250,17 @@ config NET_SCH_QFQ >> >> If unsure, say N. >> >> +config NET_SCH_CODEL >> + tristate "Controlled Delay AQM (CODEL)" >> + help >> + Say Y here if you want to use the Controlled Delay (CODEL) >> + packet scheduling algorithm. >> + >> + To compile this driver as a module, choose M here: the module >> + will be called sch_codel. >> + >> + If unsure, say N. >> + >> config NET_SCH_INGRESS >> tristate "Ingress Qdisc" >> depends on NET_CLS_ACT >> diff --git a/net/sched/Makefile b/net/sched/Makefile >> index dc5889c..41130b5 100644 >> --- a/net/sched/Makefile >> +++ b/net/sched/Makefile >> @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o >> obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o >> obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o >> obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o >> +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o >> >> obj-$(CONFIG_NET_CLS_U32) += cls_u32.o >> obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o >> diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c >> new file mode 100644 >> index 0000000..159df47 >> --- /dev/null >> +++ b/net/sched/sch_codel.c >> @@ -0,0 +1,296 @@ >> +/* >> + * net/sched/sch_codel.c A Codel implementation >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU General Public License >> + * as published by the Free Software Foundation; either version >> + * 2 of the License, or (at your option) any later version. 
>> + * >> + * Based on ns2 simulation code presented by Kathie Nichols >> + * Authors: Dave Täht <d@taht.net> >> + */ >> + >> +#include <linux/module.h> >> +#include <linux/slab.h> >> +#include <linux/types.h> >> +#include <linux/kernel.h> >> +#include <linux/errno.h> >> +#include <linux/ktime.h> >> +#include <linux/skbuff.h> >> +#include <net/pkt_sched.h> >> + >> +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000)) >> +#define DEFAULT_CODEL_DEPTH 1000 >> + >> +/* Per-queue state (codel_queue_t instance variables) */ >> + >> +struct codel_sched_data { >> + u32 flags; >> + u32 minbytes; >> + u32 count; /* packets dropped since we went into drop state */ >> + bool dropping; /* 1 if in drop state, might just add to flags */ >> + ktime_t target; >> + ktime_t interval; >> + /* time to declare above q->target (0 if below)*/ >> + ktime_t first_above_time; >> + ktime_t drop_next; /* time to drop next packet */ >> +}; >> + >> +struct codel_skb_cb { >> + ktime_t enqueue_time; >> + char data[16]; >> +}; >> + >> +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) >> +{ >> + return (struct codel_skb_cb *)skb->cb; >> +} > > You cant do that. You MUST keep the generic qdisc cb unchanged > I would prefer to have the timestamp on entry to the overall system, if there was some way to leverage the skb_shared_timestamps->syststamp facility. But, otherwise noted and corrected. > { > qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb)); > return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data; > } > > > > >> + >> +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) { >> + struct codel_skb_cb *cb = get_codel_cb(skb); > > const struct codel_skb_cb *cb = get_codel_cb(skb); > >> + return cb->enqueue_time; > > or : return get_codel_cb(skb)->enqueue_time > >> +} >> + OK >> +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) { >> + struct codel_skb_cb *cb = get_codel_cb(skb); >> + cb->enqueue_time = t; > > get_codel_cb(skb)->enqueue_time = t; > >> + return t; >> +} >> + I read that and my eyes cross. >> +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t) >> +{ >> + t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count); > > thats a killer for lot of machines... (divide plus the int_sqrt()) If you think this is high overhead compared to everything else going on in a box, I'm geniunely puzzled... 3 pages of function calls have to be traversed to get to here... I am tending to agree now that the interval needent be 64bit. >> + return t; >> +} >> + >> +/* >> +static int codel_prob_mark(const struct codel_sched_data *q) >> +{ >> + return q->flags & TC_CODEL_ECN; > > Not clear if ECN plays well with CoDel at all. Not clear to me either, wanted to leave room in the API to play with it later. >> +} >> +*/ >> + >> + /* wrappers for ultimate statistics collection */ >> + >> +static int codel_drop(struct sk_buff *skb, struct Qdisc *sch) { >> + return qdisc_drop(skb,sch); >> +} >> + >> +/* >> +static int codel_queue_drop(struct Qdisc *sch) { >> + return qdisc_drop(skb,sch); >> +} >> +*/ > Aside from drops/marks as statistics, I was wanting something like 'avg closure rate', and 'dropping intervals' To try and explain the first I'd have to refer to a 1/10th brilliant paper about the microscopic behavior of tcp that Andrew and I were discussing that I can't find right now. Andrew? 
>> +struct sk_buff *codel_dequeue_head(struct Qdisc *sch) { >> + return(qdisc_dequeue_head(sch)); >> +} >> + >> +bool should_drop(struct sk_buff *skb, struct Qdisc *sch, ktime_t now) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + bool drop = 0; > > bool drop = false; > >> + if (skb == NULL) { >> + q->first_above_time.tv64 = 0; >> + } else { >> + ktime_t sojourn_time = ktime_sub(now, get_enqueue_time(skb)); >> + if (sojourn_time.tv64 < >> + q->target.tv64 || sch->qstats.backlog < q->minbytes) { >> +/* went below so we’ll stay below for at least q->interval */ >> + q->first_above_time.tv64 = 0; >> + } else { >> + if (q->first_above_time.tv64 == 0) { >> + >> +/* just went above from below. If we stay above >> + * for at least q->interval we’ll say it’s ok to drop >> + */ >> + q->first_above_time = >> + ktime_add(now,q->interval); >> + } else if (now.tv64 >= q->first_above_time.tv64) { >> + drop = 1; >> + } >> + } >> + } >> + return drop; >> +} >> + >> +static struct sk_buff *codel_dequeue(struct Qdisc *sch) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + struct sk_buff *skb = codel_dequeue_head(sch); >> + ktime_t now; >> + bool drop; > > missing newline KernelStyle is a newline after variable declarations? ok... thx > >> + if (skb == NULL) { >> + q->dropping = 0; >> + q->first_above_time.tv64 = 0; >> + return skb; >> + } >> + now = ktime_get(); >> + drop = should_drop(skb, sch, now); >> + if (q->dropping) { >> + if (drop) { >> +/* sojourn time below target - leave dropping state */ >> + q->dropping = 0; >> + } else if (now.tv64 >= q->drop_next.tv64) { >> +/* >> + * It’s time for the next drop. Drop the current packet and dequeue the next. >> + * The dequeue might take us out of dropping state. If not, schedule the >> + * next drop. A large backlog might result in drop rates so high that the next >> + * drop should happen now, hence the ‘while’ loop. >> + */ >> + while (now.tv64 >= q->drop_next.tv64 && q->dropping) { >> + codel_drop(skb, sch); >> + q->count++; >> + skb = codel_dequeue_head(sch); >> + if (should_drop(skb,sch,now)) { >> +/* leave dropping state */ >> + q->dropping = 0; >> + } else { >> +/* and schedule the next drop */ >> + q->drop_next = >> + control_law(q,q->drop_next); >> + } >> + } >> + } else if (drop && >> + ((now.tv64 - q->drop_next.tv64 < >> + 16 * q->interval.tv64) || >> + (now.tv64 - q->first_above_time.tv64 >= >> + 2 * q->interval.tv64))) { >> + codel_drop(skb, sch); >> + skb = codel_dequeue_head(sch); >> + /* engage state machine */ >> + drop = should_drop(skb,sch,now); >> + q->dropping = 1; >> + >> +/* if min went above target close to when we last went below it >> + * assume that the drop rate that controlled the queue on the >> + * last cycle is a good starting point to control it now. >> + */ >> + if (now.tv64 - q->drop_next.tv64 < >> + 16 * q->interval.tv64) { > > doing ktime_t compares is tricky. Here its not done properly. yea, um, I screwed this up globally. Rethinking now... > Are you sure we need 64bit and ns resolution ? No. However there are several things that I have in my head that overcomplicated things. a typical 5ms target for drop is a LOT of packets on 10GigE. Am trying to think ahead a decade or two or three and to a tiny equivalent of DCB in the 1000s of processors connected directly to your brain implant. > IMHO 'unsigned long' values are the best, for 32bit arches. > > CoDel paper mentions ms resolution... > Even if you use us resolution, we can have about 2000 secs of window in > an unsigned long. 
I wasn't as concerned as getting into the oort cloud as to handling really fast gear. > >> + int c = q->count - 1; >> + q->count = c < 1 ? 1 : c; >> + } else { >> + q->count = 1; >> + } >> + q->drop_next = control_law(q,now); >> + } >> + } >> + return skb; >> +} >> + >> + >> +static int codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) >> +{ > > I am worried about the latency spike in dequeue(), if we need to drop > hundred of packets (while device is idle... If BQL is in use... its > really unfortunate) I'm concerned about it too, I don't think anything else currently loops in that call. as best as I recall that's under rcu protection? > > Could we try to check if we can drop right now one or two packets that > were queued in previous ->queue() calls, to smooth the drop processing ? I have been speculating wildly about ways to shorten the tail of codel's compensation, and (especially) only shoot the elephants, not mice, using fq techniques, but have to wait to see this stuff run as is first. Do you have a suggestion? I'm never big on dropping more than one packet in a flow at a time. >> + if (likely(skb_queue_len(&sch->q) < sch->limit)) { >> + set_enqueue_time(skb,ktime_get()); >> + return qdisc_enqueue_tail(skb, sch); >> + } >> + return qdisc_reshape_fail(skb, sch); >> +} >> + >> +static int codel_change(struct Qdisc *sch, struct nlattr *opt) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + struct tc_codel_qopt *ctl = nla_data(opt); >> + unsigned int qlen; >> + >> + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) >> + return -EINVAL; >> + if (ctl->depth && (ctl->depth < 2 || ctl->depth > 65536)) >> + return -EINVAL; >> + if (ctl->minbytes && (ctl->minbytes < 64 || ctl->minbytes > 65536)) >> + return -EINVAL; >> + sch_tree_lock(sch); >> + if (ctl->minbytes) q->minbytes = ctl->minbytes; > > if (cond) // NEWLINE > statement; > >> + if (ctl->flags) q->flags = ctl->flags; >> + if (ctl->target) q->target = ns_to_ktime((u64) ctl->target * 1000); >> + if (ctl->interval) q->interval = >> + ns_to_ktime((u64) ctl->interval * 1000); >> + >> + /* something saner than this for depth is probably needed */ >> + >> + if (ctl->depth) sch->limit = ctl->depth; >> + qlen = sch->q.qlen; >> +// while (sch->q.qlen > ctl->depth) >> +// codel_drop(skb,sch); >> +// qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); //? I don't get the how/when/why of this commented out bit. Can you clue me up? >> + q->drop_next.tv64 = q->first_above_time.tv64 = 0; >> + q->dropping = 0; /* exit dropping state */ >> + sch_tree_unlock(sch); >> + return 0; >> +} >> + >> +static int codel_init(struct Qdisc *sch, struct nlattr *opt) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + q->target = MS2TIME(5); >> + sch->limit = DEFAULT_CODEL_DEPTH; >> + q->minbytes = 1500; > > Please dont hard code this value, use ETH_DATA_LEN, or better : > > psched_mtu(qdisc_dev(sch)); Done. > >> + q->interval = MS2TIME(100); >> + if (opt) { >> + int err = codel_change(sch, opt); >> + if (err) >> + return err; >> + } >> + >> +/* if (sch->limit >= 1) >> + sch->flags |= TCQ_F_CAN_BYPASS; >> + else >> + sch->flags &= ~TCQ_F_CAN_BYPASS; >> +*/ >> + return 0; >> +} It was unclear to me the use of the above functionality I found in other qdiscs. In particular, it seems like codel could run with an unlimited queue just fine, I was uncomfortable with doing that as a first pass. 
>> + >> +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) >> +{ >> + struct codel_sched_data *q = qdisc_priv(sch); >> + struct tc_codel_qopt opt; >> + opt.target = (u32) ktime_to_us(q->target); >> + opt.interval = (u32) ktime_to_us(q->interval); >> + opt.depth = sch->limit; >> + opt.flags = q->flags; >> + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); > > NLA_PUT() doesnt exist anymore in net-next tree, please rebase on > net-next I saw it was transitioning, but I would prefer to make this stable on something I have deployed, get more people running it day to day... then submit to net-next. >> + return skb->len; >> + >> +nla_put_failure: >> +// nlmsg_trim(skb, b); >> + return -1; >> +} >> + >> +static void >> +codel_reset(struct Qdisc *sch) >> +{ >> + struct sk_buff *skb; >> + >> + while ((skb = codel_dequeue(sch)) != NULL) >> + kfree_skb(skb); > > >> +} >> + >> +struct Qdisc_ops codel_qdisc_ops __read_mostly = { >> + .id = "codel", >> + .priv_size = sizeof(struct codel_sched_data), >> + .enqueue = codel_enqueue, >> + .dequeue = codel_dequeue, >> + .peek = qdisc_peek_head, >> +/* .drop = codel_queue_drop, */ I'm puzzled as to what this is supposed to really do and who it's callers are. >> + .init = codel_init, >> + .reset = codel_reset, >> + .change = codel_change, >> + .dump = codel_dump, >> + .owner = THIS_MODULE, >> +}; >> +EXPORT_SYMBOL(codel_qdisc_ops); >> + >> +static int __init codel_module_init(void) >> +{ >> + return register_qdisc(&codel_qdisc_ops); >> +} >> +static void __exit codel_module_exit(void) >> +{ >> + unregister_qdisc(&codel_qdisc_ops); >> +} >> +module_init(codel_module_init) >> +module_exit(codel_module_exit) >> +MODULE_LICENSE("GPL"); >> + >> _______________________________________________ >> Codel mailing list >> Codel@lists.bufferbloat.net >> https://lists.bufferbloat.net/listinfo/codel > > > _______________________________________________ > Codel mailing list > Codel@lists.bufferbloat.net > https://lists.bufferbloat.net/listinfo/codel -- Dave Täht SKYPE: davetaht US Tel: 1-239-829-5608 http://www.bufferbloat.net ^ permalink raw reply [flat|nested] 11+ messages in thread
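(On the NLA_PUT() point above: net-next replaced the macro with the nla_put() function, which returns an error instead of jumping to nla_put_failure by itself. A sketch of what the dump could look like after such a rebase -- same struct layout, just the newer call; minbytes is filled in here as well, which the version quoted above leaves uninitialized on the stack.)

	static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
	{
		struct codel_sched_data *q = qdisc_priv(sch);
		struct tc_codel_qopt opt = {
			.target   = (u32) ktime_to_us(q->target),
			.interval = (u32) ktime_to_us(q->interval),
			.depth    = sch->limit,
			.minbytes = q->minbytes,
			.flags    = q->flags,
		};

		if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
			goto nla_put_failure;
		return skb->len;

	nla_put_failure:
		return -1;
	}
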
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 17:52 ` [Codel] [PATCH] Preliminary codel implementation Dave Täht 2012-05-03 18:01 ` Dave Taht 2012-05-03 18:17 ` Eric Dumazet @ 2012-05-03 20:47 ` Jim Gettys 2012-05-03 21:35 ` Dave Taht 2 siblings, 1 reply; 11+ messages in thread From: Jim Gettys @ 2012-05-03 20:47 UTC (permalink / raw) To: Dave Täht; +Cc: codel On 05/03/2012 01:52 PM, Dave Täht wrote: > --- > include/linux/pkt_sched.h | 29 +++++ > net/sched/Kconfig | 11 ++ > net/sched/Makefile | 1 + > net/sched/sch_codel.c | 296 +++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 337 insertions(+) > create mode 100644 net/sched/sch_codel.c > > diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h > index 0d5b793..c21b720 100644 > --- a/include/linux/pkt_sched.h > +++ b/include/linux/pkt_sched.h > @@ -633,4 +633,33 @@ struct tc_qfq_stats { > __u32 lmax; > }; > > +/* CODEL */ > + > +enum { > + TCA_CODEL_UNSPEC, > + TCA_CODEL_PARMS, > + TCA_CODEL_TARGET, > + TCA_CODEL_DEPTH, > + TCA_CODEL_MINBYTES, > + TCA_CODEL_INTERVAL, > + __TCA_CODEL_MAX > +}; > + > +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) > +#define TC_CODEL_ECN 1 > + > +struct tc_codel_qopt { > + __u32 flags; /* flags (e.g. ecn) */ > + __u32 target; /* max delay, in us */ > + __u32 depth; /* queue depth in packets */ > + __u32 minbytes; /* MTU (usually) */ > + __u32 interval; /* Sliding min time window width (us) */ > +}; > + > +struct tc_codel_stats { > + __u64 drops; > + __u64 marks; > +}; > + > + > #endif > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index 2590e91..8106c42 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -250,6 +250,17 @@ config NET_SCH_QFQ > > If unsure, say N. > > +config NET_SCH_CODEL > + tristate "Controlled Delay AQM (CODEL)" > + help > + Say Y here if you want to use the Controlled Delay (CODEL) > + packet scheduling algorithm. > + > + To compile this driver as a module, choose M here: the module > + will be called sch_codel. > + > + If unsure, say N. > + > config NET_SCH_INGRESS > tristate "Ingress Qdisc" > depends on NET_CLS_ACT > diff --git a/net/sched/Makefile b/net/sched/Makefile > index dc5889c..41130b5 100644 > --- a/net/sched/Makefile > +++ b/net/sched/Makefile > @@ -36,6 +36,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o > obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o > obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o > obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o > +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o > > obj-$(CONFIG_NET_CLS_U32) += cls_u32.o > obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o > diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c > new file mode 100644 > index 0000000..159df47 > --- /dev/null > +++ b/net/sched/sch_codel.c > @@ -0,0 +1,296 @@ > +/* > + * net/sched/sch_codel.c A Codel implementation > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. 
> + * > + * Based on ns2 simulation code presented by Kathie Nichols > + * Authors: Dave Täht <d@taht.net> > + */ > + > +#include <linux/module.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/kernel.h> > +#include <linux/errno.h> > +#include <linux/ktime.h> > +#include <linux/skbuff.h> > +#include <net/pkt_sched.h> > + > +#define MS2TIME(a) (ns_to_ktime( (u64) a*1000000)) > +#define DEFAULT_CODEL_DEPTH 1000 > + > +/* Per-queue state (codel_queue_t instance variables) */ > + > +struct codel_sched_data { > + u32 flags; > + u32 minbytes; > + u32 count; /* packets dropped since we went into drop state */ > + bool dropping; /* 1 if in drop state, might just add to flags */ > + ktime_t target; > + ktime_t interval; > + /* time to declare above q->target (0 if below)*/ > + ktime_t first_above_time; > + ktime_t drop_next; /* time to drop next packet */ > +}; > + > +struct codel_skb_cb { > + ktime_t enqueue_time; > + char data[16]; > +}; > + > +static inline struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) > +{ > + return (struct codel_skb_cb *)skb->cb; > +} > + > +static inline ktime_t get_enqueue_time(const struct sk_buff *skb) { > + struct codel_skb_cb *cb = get_codel_cb(skb); > + return cb->enqueue_time; > +} > + > +static inline ktime_t set_enqueue_time(struct sk_buff *skb, ktime_t t ) { > + struct codel_skb_cb *cb = get_codel_cb(skb); > + cb->enqueue_time = t; > + return t; > +} > + > +static inline ktime_t control_law(const struct codel_sched_data *q, ktime_t t) > +{ > + t.tv64 = t.tv64 + q->interval.tv64 / int_sqrt(q->count); > + return t; > +} There is a note in the article at this point in its exposition of the pseudo code: "Note that for embedded systems or kernel implementation that the inverse *sqrt* can be computed efficiently using only integer multiplication." That will likely be faster than the division by sqrt; if/when this optimisation is done, we should leave comments about why it is division by a square root, to correspond to TCP's control law. Before we upstream this, I'd recommend taking the comments documenting the algorithm from the article and including them. I think Kathie had to trim on the slides. - Jim ^ permalink raw reply [flat|nested] 11+ messages in thread
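(A sketch of the integer-only trick Jim is pointing at: one Newton-Raphson step toward 1/sqrt(count), x' = x * (3 - count * x^2) / 2, here in made-up 16.16 fixed point. Since count changes by at most one between uses, a single step per change keeps the estimate tracking, and the control law becomes a multiply and a shift instead of a divide by int_sqrt(). Names and scaling are illustrative, not from any tree.)

	#define FIX_SHIFT	16			/* 16.16 fixed point */
	#define FIX_ONE		(1u << FIX_SHIFT)

	/* one Newton step: x' = x * (3 - count * x^2) / 2, integer only */
	static u32 inv_sqrt_step(u32 rec_inv_sqrt, u32 count)
	{
		u64 x  = rec_inv_sqrt;			/* ~ 1/sqrt(count), scaled by 2^16 */
		u64 x2 = (x * x) >> FIX_SHIFT;		/* x^2, scaled */
		s64 t  = (s64)(3ULL << FIX_SHIFT) - (s64)(count * x2);

		if (t <= 0)				/* estimate way off: restart from 1.0 */
			return FIX_ONE;
		return (u32)((x * (u64)t) >> (FIX_SHIFT + 1));
	}

	/* control_law() would then read roughly
	 *	t + ((interval * rec_inv_sqrt) >> FIX_SHIFT)
	 * instead of t + interval / int_sqrt(count).
	 */
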
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 20:47 ` Jim Gettys @ 2012-05-03 21:35 ` Dave Taht 2012-05-03 23:12 ` Jim Gettys 0 siblings, 1 reply; 11+ messages in thread From: Dave Taht @ 2012-05-03 21:35 UTC (permalink / raw) To: Jim Gettys; +Cc: codel, Dave Täht On Thu, May 3, 2012 at 1:47 PM, Jim Gettys > There is a note in the article at this point in its exposition of the > pseudo code: > > "Note that for embedded systems or kernel implementation that the > inverse *sqrt* can be computed efficiently using only integer > multiplication." > > That will likely be faster than the division by sqrt; if/when this > optimisation is done, we should leave comments about why it is division > by a square root, to correspond to TCP's control law. > > Before we upstream this, I'd recommend taking the comments documenting > the algorithm from the article and including them. I think Kathie had > to trim on the slides. > - Jim 1) My own hurry is because I want to see this work outside of a model, before monday. 2) I strongly believe in correctness first, optimizations last. I guess I'm the only guy here that kind of wishes we were using floating point for this function! 3) I also would massively prefer that we measure queue length from actual ingress (eg the application or incoming network card), to egress. I regard scribbling on the cb as a hack, I'd rather use a normalized hardware timestamp gained on ingres. There are 3+ pages of code that need to be traversed to get from there to there, and those delays are significant (as we proved last year) and not accounted for in a fluid model. 4) Despite eric's request that I rebase on net-next, there's plenty of time for that, after the code is proven correct, and stable. I'm very glad we're at a point where it will be very easy to port forward to net-next, but I'm totally unwilling to cope with random breakage that goes with stepping over that edge. I have a dozen machines in the lab, setup on 3.3.4, ready to let me do head to head comparisons of various technologies in play, on three different architectures, using 3 forms of wireless and 2 forms of ethernet. (If I can beg pity from anyone, one of those arches has 300+ out of tree patches and it's a cast iron bitch to work on net-next in it) Yes, we absolutely want something to go upstream. It doesn't need to happen next week or even next month. I'd like a sane API and naming scheme, and a man page to go with it. I happen to be fond of ECN for long RTTs, and think it has great potential for wireless, I know others are not, and the code structure turned out to be hard to put that in... I didn't get sufficient time to evaluate sfq+red before it went upstream, and I really regret that. There is plenty of time in net-next's upcoming window, regardless, to get something into linux. I'd like to see someone working on BSD, and on a ns3 model that can integrate with the wifi model. Let's try to do some quality work here! I'd like very much to explore, for example, what happens in a massive dropping scenario, such as what eric alludes to might happen on bql + gigE. And to have effective measurements of what the actual overhead of a sqrt and divide are - so far as I know on modern arches it's like 11-25 clocks, much of which is lost in the noise on superscalar architecture. This particular portion of the algorithm is not triggered particularly often. I'd like to run a real 1000 streams of real ledbat through it, voip, web access, and games... play with prioritization and FQ techniques, etc. 
but first up the simplest possible implementation that's correct? OK? Measure first. Get it correct. Then analyze the real behavior. Then optimize.... Wash, rinse, repeat. This is such an old lesson I can't believe you guys are so over focused on it! A sqrt seems correct to me for tcp traffic but what about everything else? I very much appreciate the code review that happened earlier today, and hopefully my next attempt will be closer to correct. (In particular I hadn't realized how gnarly dealing with ktime was) But: I assure you all I NEVER get right on the first couple of tries. And I wasn't planning on being the guy to write the code, I just couldn't sleep, not seeing, feeling, experiencing, it work. So I'd like to setup a git repo for this and iproute2 ASAP. I'd be willing to wait til after monday and just rely on code review. -- Dave Täht SKYPE: davetaht US Tel: 1-239-829-5608 http://www.bufferbloat.net ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Codel] [PATCH] Preliminary codel implementation 2012-05-03 21:35 ` Dave Taht @ 2012-05-03 23:12 ` Jim Gettys 0 siblings, 0 replies; 11+ messages in thread From: Jim Gettys @ 2012-05-03 23:12 UTC (permalink / raw) To: Dave Taht; +Cc: codel, Dave Täht On 05/03/2012 05:35 PM, Dave Taht wrote: > On Thu, May 3, 2012 at 1:47 PM, Jim Gettys >> There is a note in the article at this point in its exposition of the >> pseudo code: >> >> "Note that for embedded systems or kernel implementation that the >> inverse *sqrt* can be computed efficiently using only integer >> multiplication." >> >> That will likely be faster than the division by sqrt; if/when this >> optimisation is done, we should leave comments about why it is division >> by a square root, to correspond to TCP's control law. >> >> Before we upstream this, I'd recommend taking the comments documenting >> the algorithm from the article and including them. I think Kathie had >> to trim on the slides. >> - Jim > > 1) My own hurry is because I want to see this work outside of a model, > before monday. > > 2) I strongly believe in correctness first, optimizations last. I > guess I'm the only guy here that kind of wishes we were using floating > point for this function! Go get some sleep. I sent the mail I did as I did not want someone reading these archives next week to presume that a square root function, in this context, is an expensive operation. Not everyone will wade through the article to come across that implementation note. - Jim > 3) I also would massively prefer that we measure queue length from > actual ingress (eg the application or incoming network card), > to egress. I regard scribbling on the cb as a hack, I'd rather use a > normalized hardware timestamp gained on ingres. > > There are 3+ pages of code that need to be traversed to get from there > to there, and those delays are significant (as we proved last year) > and not accounted for in a fluid model. > > 4) Despite eric's request that I rebase on net-next, there's plenty of > time for that, after the code is proven correct, and stable. > > I'm very glad we're at a point where it will be very easy to port > forward to net-next, but I'm totally unwilling to cope with random > breakage that goes with stepping over that edge. > > I have a dozen machines in the lab, setup on 3.3.4, ready to let me do > head to head comparisons of various technologies in play, on three > different architectures, using 3 forms of wireless and 2 forms of > ethernet. > > (If I can beg pity from anyone, one of those arches has 300+ out of > tree patches and it's a cast iron bitch to work on net-next in it) > > Yes, we absolutely want something to go upstream. It doesn't need to > happen next week or even next month. I'd like a sane API and naming > scheme, and a man page to go with it. > > I happen to be fond of ECN for long RTTs, and think it has > great potential for wireless, I know others are not, > and the code structure turned out to be hard to put that in... > > I didn't get sufficient time to evaluate sfq+red before it went > upstream, and I really regret that. > > There is plenty of time in net-next's upcoming window, regardless, > to get something into linux. I'd like to see someone working on BSD, > and on a ns3 model that can integrate with the wifi model. > > Let's try to do some quality work here! > > I'd like very much to explore, for example, what happens in a massive > dropping scenario, such as what eric alludes to might happen on bql + > gigE. 
And to have effective measurements of what the actual overhead > of a sqrt and divide are - so far as I know on modern arches it's like > 11-25 clocks, much of which is lost in the noise on superscalar > architecture. This particular portion of the algorithm is not > triggered particularly often. > > I'd like to run a real 1000 streams of real ledbat through it, voip, > web access, and games... play with prioritization and FQ techniques, > etc. > > but first up the simplest possible implementation that's correct? > OK? > > Measure first. Get it correct. Then analyze the real behavior. Then > optimize.... Wash, rinse, repeat. This is such an old lesson I can't > believe you guys are so over focused on it! A sqrt seems correct to me > for tcp traffic but what about everything else? > > I very much appreciate the code review that happened earlier today, > and hopefully my next attempt will be closer to correct. > > (In particular I hadn't realized how gnarly dealing with ktime was) > > But: I assure you all I NEVER get right on the first couple of tries. > > And I wasn't planning on being the guy to write the code, I just > couldn't sleep, not seeing, feeling, experiencing, it work. > > So I'd like to setup a git repo for this and iproute2 ASAP. > > I'd be willing to wait til after monday and just rely on code review. > > ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Codel] [PATCH] iproute2: added preliminary codel support 2012-05-03 17:52 [Codel] [PATCH] iproute2: added preliminary codel support Dave Täht 2012-05-03 17:52 ` [Codel] [PATCH] Preliminary codel implementation Dave Täht @ 2012-05-03 17:59 ` Dave Taht 1 sibling, 0 replies; 11+ messages in thread From: Dave Taht @ 2012-05-03 17:59 UTC (permalink / raw) To: Dave Täht; +Cc: codel I should have prepended 'RFC' to these two posts. While this code compiles, and indeed lets you insert and use the codel kernel module... I appear to have goofed on the netlink implementation. I get back: !!!Deficit 20, rta_len=24 so I'm confused about the correct usage of the PARM parameter. certainly a little code review from the more knowledgable is highly desired. Or, I'll get back on it after a major nap. On Thu, May 3, 2012 at 10:52 AM, Dave Täht <dave.taht@bufferbloat.net> wrote: > this makes certain constants in the code variables, but > has seemingly good defaults. > > target and interval can be specified with a ms, s, or us parameter > mtu is in bytes > > ecn, while part of the api, is not enabled in the code > --- > include/linux/pkt_sched.h | 28 ++++++++++ > tc/Makefile | 1 + > tc/q_codel.c | 135 +++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 164 insertions(+) > create mode 100644 tc/q_codel.c > > diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h > index 410b33d..150aee9 100644 > --- a/include/linux/pkt_sched.h > +++ b/include/linux/pkt_sched.h > @@ -654,4 +654,32 @@ struct tc_qfq_stats { > __u32 lmax; > }; > > +/* CODEL */ > + > +enum { > + TCA_CODEL_UNSPEC, > + TCA_CODEL_PARMS, > + TCA_CODEL_TARGET, > + TCA_CODEL_DEPTH, > + TCA_CODEL_MINBYTES, > + TCA_CODEL_INTERVAL, > + __TCA_CODEL_MAX > +}; > + > +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) > +#define TC_CODEL_ECN 1 > + > +struct tc_codel_qopt { > + __u32 flags; /* flags (e.g. ecn) */ > + __u32 target; /* max delay, in us */ > + __u32 depth; /* queue depth in packets */ > + __u32 minbytes; /* MTU (usually) */ > + __u32 interval; /* Sliding min time window width (us) */ > +}; > + > +struct tc_codel_stats { > + __u64 drops; > + __u64 marks; > +}; > + > #endif > diff --git a/tc/Makefile b/tc/Makefile > index be8cd5a..8a7cc8d 100644 > --- a/tc/Makefile > +++ b/tc/Makefile > @@ -47,6 +47,7 @@ TCMODULES += em_cmp.o > TCMODULES += em_u32.o > TCMODULES += em_meta.o > TCMODULES += q_mqprio.o > +TCMODULES += q_codel.o > > TCSO := > ifeq ($(TC_CONFIG_ATM),y) > diff --git a/tc/q_codel.c b/tc/q_codel.c > new file mode 100644 > index 0000000..2ff51a1 > --- /dev/null > +++ b/tc/q_codel.c > @@ -0,0 +1,135 @@ > +/* > + * q_codel.c Controlled Delay > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + * > + * Authors: Dave Taht <d@taht.net> > + * > + */ > + > + > + > +#include <stdio.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <syslog.h> > +#include <fcntl.h> > +#include <sys/socket.h> > +#include <netinet/in.h> > +#include <arpa/inet.h> > +#include <string.h> > + > +#include "utils.h" > +#include "tc_util.h" > + > +static void explain(void) > +{ > + fprintf(stderr, > + "Usage: ... 
codel [target MAX_DELAY] [depth PACKETS]" > + " [mtu MINBYTES]\n " > + "[interval INTEGRATION_TIME] [ecn]\n" ); > +} > + > +static int codel_parse_opt(struct qdisc_util *qu, int argc, char **argv, > + struct nlmsghdr *n) > +{ > + struct tc_codel_qopt opt; > + struct rtattr *tail; > + > + memset(&opt, 0, sizeof(opt)); > + opt.target = 5000; > + opt.depth = 1000; > + opt.interval = 1000*100; > + opt.minbytes = 1500; > + opt.flags = 0; > + > + while (argc > 0) { > + if (strcmp(*argv, "target") == 0) { > + NEXT_ARG(); > + if (get_time(&opt.target, *argv)) { > + fprintf(stderr, "Illegal \"target\"\n"); > + return -1; > + } > + } else if (strcmp(*argv, "depth") == 0) { > + NEXT_ARG(); > + if (get_u32(&opt.depth, *argv, 0)) { > + fprintf(stderr, "Illegal \"depth\"\n"); > + return -1; > + } > + } else if (strcmp(*argv, "interval") == 0) { > + NEXT_ARG(); > + if (get_time(&opt.interval, *argv)) { > + fprintf(stderr, "Illegal \"interval\"\n"); > + return -1; > + } > + } else if (strcmp(*argv, "mtu") == 0) { > + NEXT_ARG(); > + if (get_u32(&opt.minbytes, *argv, 0)) { > + fprintf(stderr, "Illegal \"mtu\"\n"); > + return -1; > + } > + } else if (strcmp(*argv, "ecn") == 0) { > + opt.flags |= TC_CODEL_ECN; > + } else { > + fprintf(stderr, "What is \"%s\"?\n", *argv); > + explain(); > + return -1; > + } > + argc--; argv++; > + } > + > + tail = NLMSG_TAIL(n); > + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); > + addattr_l(n, 1024, TCA_CODEL_PARMS, &opt, sizeof(opt)); > + tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail; > + return 0; > +} > + > +static int codel_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) > +{ > + struct rtattr *tb[__TCA_CODEL_MAX]; > + struct tc_codel_qopt *qopt; > + > + if (opt == NULL) > + return 0; > + > + parse_rtattr_nested(tb, TCA_CODEL_MAX, opt); > + if (tb[TCA_CODEL_PARMS] == NULL) > + return -1; > + qopt = RTA_DATA(tb[TCA_CODEL_PARMS]); > + if (RTA_PAYLOAD(tb[TCA_CODEL_PARMS]) < sizeof(*qopt)) > + return -1; > + fprintf(f," target %dus depth %d mtu %d interval %dus %s\n", > + qopt->target, qopt->depth, qopt->minbytes, > + qopt->interval, (qopt->flags & TC_CODEL_ECN) ? "ecn" : ""); > + > + return 0; > +} > + > +static int codel_print_xstats(struct qdisc_util *qu, FILE *f, > + struct rtattr *xstats) > +{ > + struct tc_codel_stats *st; > + > + if (xstats == NULL) > + return 0; > + > + if (RTA_PAYLOAD(xstats) < sizeof(*st)) > + return -1; > + > + st = RTA_DATA(xstats); > + fprintf(f, " dropped %lu marked %lu ", > + st->drops, st->marks); > + > + return 0; > +} > + > +struct qdisc_util codel_qdisc_util = { > + .id = "codel", > + .parse_qopt = codel_parse_opt, > + .print_qopt = codel_print_opt, > + .print_xstats = codel_print_xstats, > +}; > -- > 1.7.9.5 > > _______________________________________________ > Codel mailing list > Codel@lists.bufferbloat.net > https://lists.bufferbloat.net/listinfo/codel -- Dave Täht SKYPE: davetaht US Tel: 1-239-829-5608 http://www.bufferbloat.net ^ permalink raw reply [flat|nested] 11+ messages in thread
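(One reading of the "!!!Deficit 20, rta_len=24" above: struct tc_codel_qopt is 5 * 4 = 20 bytes, which is exactly the reported deficit. The kernel-side codel_change()/codel_dump() earlier in the thread treat TCA_OPTIONS as a flat struct, while the tc code here nests a TCA_CODEL_PARMS attribute inside TCA_OPTIONS and later runs parse_rtattr_nested() on what the kernel dumped flat. Either side could be changed to match the other; an untested sketch of the flat variant on the tc side, following the kernel as posted:)

	/* codel_parse_opt(): send the struct directly as TCA_OPTIONS,
	 * no nested TCA_CODEL_PARMS and no tail fixup needed.
	 */
	addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));

	/* codel_print_opt(): read it back the same way, no nesting */
	if (RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -1;
	qopt = RTA_DATA(opt);
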
end of thread, other threads: [~2012-05-03 23:12 UTC | newest]

Thread overview: 11+ messages
2012-05-03 17:52 [Codel] [PATCH] iproute2: added preliminary codel support Dave Täht
2012-05-03 17:52 ` [Codel] [PATCH] Preliminary codel implementation Dave Täht
2012-05-03 18:01 ` Dave Taht
2012-05-03 18:17 ` Rick Jones
2012-05-03 18:19 ` Eric Dumazet
2012-05-03 18:17 ` Eric Dumazet
2012-05-03 22:38 ` Dave Taht
2012-05-03 20:47 ` Jim Gettys
2012-05-03 21:35 ` Dave Taht
2012-05-03 23:12 ` Jim Gettys
2012-05-03 17:59 ` [Codel] [PATCH] iproute2: added preliminary codel support Dave Taht