[Cerowrt-devel] Fwd: [PATCH] net_sched: restore "linklayer atm" handling

Sebastian Moeller moeller0 at gmx.de
Wed Aug 14 20:54:56 EDT 2013


Hi Dave,


On Aug 15, 2013, at 02:46 , Dave Taht <dave.taht at gmail.com> wrote:

> jesper just dropped this set of patches on the netdev list, which I hope address some of the remaining issues on htb. There was some debate the last time this went by on the list, so the word is not in yet.

	I know Jesper created the link layer adaptation in the first place and probably has a soft spot n his heart for the HTB implementation thereof, but really he should switch HTB to use the size table instead of the rate table approach. This should allow for backward compatibility and will eliminate one of two ways to achieve basically the same. (And to soothe him, it looks pretty much that the stab code heavily borrowed from Jespers code)

> 
> I note I would really like to get around to profiling the "final" aqm code in the hope of squeezing more performance out of it. Presently it tops out at about 25/25Mbit. (it's not fq_codel but htb)

One reason I keep harping this cord is that should we need switching to a different disc (like hfsc) stab will just work… 

best
	Sebastian


> 
> ---------- Forwarded message ----------
> From: Jesper Dangaard Brouer <brouer at redhat.com>
> Date: Wed, Aug 14, 2013 at 2:47 PM
> Subject: [PATCH] net_sched: restore "linklayer atm" handling
> To: "David S. Miller" <davem at davemloft.net>, Dave Taht <dave.taht at gmail.com>, netdev at vger.kernel.org
> Cc: Jesper Dangaard Brouer <brouer at redhat.com>, Stephen Hemminger <shemminger at vyatta.com>
> 
> 
> commit 56b765b79 ("htb: improved accuracy at high rates")
> broke the "linklayer atm" handling.
> 
>  tc class add ... htb rate X ceil Y linklayer atm
> 
> The linklayer setting is implemented by modifying the rate table
> which is send to the kernel.  No direct parameter were
> transferred to the kernel indicating the linklayer setting.
> 
> The commit 56b765b79 ("htb: improved accuracy at high rates")
> removed the use of the rate table system.
> 
> To keep compatible with older iproute2 utils, this patch detects
> the linklayer by parsing the rate table.  It also supports future
> versions of iproute2 to send this linklayer parameter to the
> kernel directly. This is done by using the __reserved field in
> struct tc_ratespec, to convey the choosen linklayer option, but
> only using the lower 4 bits of this field.
> 
> Linklayer detection is limited to speeds below 100Mbit/s, because
> at high rates the rtab is gets too inaccurate, so bad that
> several fields contain the same values, this resembling the ATM
> detect.  Fields even start to contain "0" time to send, e.g. at
> 1000Mbit/s sending a 96 bytes packet cost "0", thus the rtab have
> been more broken than we first realized.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer at redhat.com>
> ---
> 
>  include/net/sch_generic.h      |    9 ++++++++-
>  include/uapi/linux/pkt_sched.h |   10 +++++++++-
>  net/sched/sch_api.c            |   41 ++++++++++++++++++++++++++++++++++++++++
>  net/sched/sch_generic.c        |    1 +
>  net/sched/sch_htb.c            |   13 +++++++++++++
>  5 files changed, 72 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 6eab633..e5ae0c5 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -683,13 +683,19 @@ struct psched_ratecfg {
>         u64     rate_bytes_ps; /* bytes per second */
>         u32     mult;
>         u16     overhead;
> +       u8      linklayer;
>         u8      shift;
>  };
> 
>  static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
>                                 unsigned int len)
>  {
> -       return ((u64)(len + r->overhead) * r->mult) >> r->shift;
> +       len += r->overhead;
> +
> +       if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
> +               return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift;
> +
> +       return ((u64)len * r->mult) >> r->shift;
>  }
> 
>  extern void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf);
> @@ -700,6 +706,7 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
>         memset(res, 0, sizeof(*res));
>         res->rate = r->rate_bytes_ps;
>         res->overhead = r->overhead;
> +       res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);
>  }
> 
>  #endif
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index dbd71b0..09d62b9 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -73,9 +73,17 @@ struct tc_estimator {
>  #define TC_H_ROOT      (0xFFFFFFFFU)
>  #define TC_H_INGRESS    (0xFFFFFFF1U)
> 
> +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */
> +enum tc_link_layer {
> +       TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */
> +       TC_LINKLAYER_ETHERNET,
> +       TC_LINKLAYER_ATM,
> +};
> +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */
> +
>  struct tc_ratespec {
>         unsigned char   cell_log;
> -       unsigned char   __reserved;
> +       __u8            linklayer; /* lower 4 bits */
>         unsigned short  overhead;
>         short           cell_align;
>         unsigned short  mpu;
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 281c1bd..51b968d 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -285,6 +285,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
>         return q;
>  }
> 
> +/* The linklayer setting were not transferred from iproute2, in older
> + * versions, and the rate tables lookup systems have been dropped in
> + * the kernel. To keep backward compatible with older iproute2 tc
> + * utils, we detect the linklayer setting by detecting if the rate
> + * table were modified.
> + *
> + * For linklayer ATM table entries, the rate table will be aligned to
> + * 48 bytes, thus some table entries will contain the same value.  The
> + * mpu (min packet unit) is also encoded into the old rate table, thus
> + * starting from the mpu, we find low and high table entries for
> + * mapping this cell.  If these entries contain the same value, when
> + * the rate tables have been modified for linklayer ATM.
> + *
> + * This is done by rounding mpu to the nearest 48 bytes cell/entry,
> + * and then roundup to the next cell, calc the table entry one below,
> + * and compare.
> + */
> +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
> +{
> +       int low       = roundup(r->mpu, 48);
> +       int high      = roundup(low+1, 48);
> +       int cell_low  = low >> r->cell_log;
> +       int cell_high = (high >> r->cell_log) - 1;
> +
> +       /* rtab is too inaccurate at rates > 100Mbit/s */
> +       if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
> +               pr_debug("TC linklayer: Giving up ATM detection\n");
> +               return TC_LINKLAYER_ETHERNET;
> +       }
> +
> +       if ((cell_high > cell_low) && (cell_high < 256)
> +           && (rtab[cell_low] == rtab[cell_high])) {
> +               pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
> +                        cell_low, cell_high, rtab[cell_high]);
> +               return TC_LINKLAYER_ATM;
> +       }
> +       return TC_LINKLAYER_ETHERNET;
> +}
> +
>  static struct qdisc_rate_table *qdisc_rtab_list;
> 
>  struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
> @@ -308,6 +347,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta
>                 rtab->rate = *r;
>                 rtab->refcnt = 1;
>                 memcpy(rtab->data, nla_data(tab), 1024);
> +               if (r->linklayer == TC_LINKLAYER_UNAWARE)
> +                       r->linklayer = __detect_linklayer(r, rtab->data);
>                 rtab->next = qdisc_rtab_list;
>                 qdisc_rtab_list = rtab;
>         }
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index eeb8276..48be3d5 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -909,6 +909,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r,
>         memset(r, 0, sizeof(*r));
>         r->overhead = conf->overhead;
>         r->rate_bytes_ps = conf->rate;
> +       r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
>         r->mult = 1;
>         /*
>          * The deal here is to replace a divide by a reciprocal one
> diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
> index 45e7515..c2178b1 100644
> --- a/net/sched/sch_htb.c
> +++ b/net/sched/sch_htb.c
> @@ -1329,6 +1329,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>         struct htb_sched *q = qdisc_priv(sch);
>         struct htb_class *cl = (struct htb_class *)*arg, *parent;
>         struct nlattr *opt = tca[TCA_OPTIONS];
> +       struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
>         struct nlattr *tb[TCA_HTB_MAX + 1];
>         struct tc_htb_opt *hopt;
> 
> @@ -1350,6 +1351,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>         if (!hopt->rate.rate || !hopt->ceil.rate)
>                 goto failure;
> 
> +       /* Keeping backward compatible with rate_table based iproute2 tc */
> +       if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) {
> +               rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
> +               if (rtab)
> +                       qdisc_put_rtab(rtab);
> +       }
> +       if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) {
> +               ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
> +               if (ctab)
> +                       qdisc_put_rtab(ctab);
> +       }
> +
>         if (!cl) {              /* new class */
>                 struct Qdisc *new_q;
>                 int prio;
> 
> 
> 
> 
> -- 
> Dave Täht
> 
> Fixing bufferbloat with cerowrt: http://www.teklibre.com/cerowrt/subscribe.html
> _______________________________________________
> Cerowrt-devel mailing list
> Cerowrt-devel at lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/cerowrt-devel




More information about the Cerowrt-devel mailing list