[PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue.

Dave Taht dave.taht at gmail.com
Mon Jun 10 13:20:57 EDT 2013


It is highly unlikely these patches would be accepted by upstream as
allocating a skb in the fast path is probably a no-no. However at least
part of them seem generically useful - instrumenting pfifo_fast in
particular - and certainly helpful for debugging in general.

I had hoped it would be possible to do this without mucking with locking,
tho. :(

In my own case I'd like to have a
qdisc_drop_with_stats(skb,sch,"fq_codel",reason) that could be compiled in
or out easily, sending the packet headers of the dropped packet to a
listener much as you do here. This would be useful for monitoring, have
potential as a routing metric, and make it easier to analyze the behavior
of various qdisc types, without having to poll and lock via tc.

Also as a drop takes place only when the card is producing delays, it would
be possible to get a bandwidth estimate over smaller intervals from it
easily, and be less invasive overall...

So thanks for digging into netlink so much and having a sample program. I
almost get it now...

A couple more comments below:

On Mon, Jun 10, 2013 at 5:39 AM, Toke Høiland-Jørgensen <toke at toke.dk>wrote:

Email syntax for a RFC PATCH is RFC PATCH, not PATCH RFC

---
>  include/uapi/linux/rtnetlink.h | 15 ++++++---
>  net/core/gen_stats.c           |  6 ++--
>  net/sched/Kconfig              |  8 +++++
>  net/sched/sch_generic.c        | 71
> ++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 93 insertions(+), 7 deletions(-)
>
> diff --git a/include/uapi/linux/rtnetlink.h
> b/include/uapi/linux/rtnetlink.h
> index 7a2144e..e9d91c5 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -68,6 +68,9 @@ enum {
>         RTM_GETQDISC,
>  #define RTM_GETQDISC   RTM_GETQDISC
>
> +       RTM_QDISC_STATS,
> +#define RTM_QDISC_STATS RTM_QDISC_STATS
> +
>


>         RTM_NEWTCLASS   = 40,
>  #define RTM_NEWTCLASS  RTM_NEWTCLASS
>         RTM_DELTCLASS,
> @@ -140,7 +143,7 @@ enum {
>  #define RTM_NR_FAMILIES        (RTM_NR_MSGTYPES >> 2)
>  #define RTM_FAM(cmd)   (((cmd) - RTM_BASE) >> 2)
>
> -/*
> +/*
>     Generic structure for encapsulation of optional route information.
>     It is reminiscent of sockaddr, but with sa_family replaced
>     with attribute type.
> @@ -180,7 +183,7 @@ struct rtmsg {
>
>         unsigned char           rtm_table;      /* Routing table id */
>         unsigned char           rtm_protocol;   /* Routing protocol; see
> below  */
> -       unsigned char           rtm_scope;      /* See below */
> +       unsigned char           rtm_scope;      /* See below */
>         unsigned char           rtm_type;       /* See below    */
>
>
unnecessary change.


>         unsigned                rtm_flags;
> @@ -450,7 +453,7 @@ struct ifinfomsg {
>  };
>
>  /********************************************************************
> - *             prefix information
> + *             prefix information
>   ****/
>
>
unnecessary change. try to touch as little as possible... There is an emacs
mode for linux indentation, btw
which helps avoid annoyingly messing up existing code.

 struct prefixmsg {
> @@ -464,7 +467,7 @@ struct prefixmsg {
>         unsigned char   prefix_pad3;
>  };
>
> -enum
> +enum
>  {
>         PREFIX_UNSPEC,
>         PREFIX_ADDRESS,
> @@ -571,6 +574,8 @@ enum rtnetlink_groups {
>  #define RTNLGRP_NEIGH          RTNLGRP_NEIGH
>         RTNLGRP_TC,
>  #define RTNLGRP_TC             RTNLGRP_TC
> +       RTNLGRP_TC_STATS,
> +#define RTNLGRP_TC_STATS       RTNLGRP_TC_STATS
>         RTNLGRP_IPV4_IFADDR,
>  #define RTNLGRP_IPV4_IFADDR    RTNLGRP_IPV4_IFADDR
>         RTNLGRP_IPV4_MROUTE,
>

It's not clear to me where this stuff enters userspace or not, and in
general it's
safer to append new enums/defines rather than insert them.


> @@ -625,7 +630,7 @@ struct tcamsg {
>  };
>  #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) +
> NLMSG_ALIGN(sizeof(struct tcamsg))))
>  #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */
> +#define TCA_ACT_TAB 1 /* attr type must be >=1 */
>  #define TCAA_MAX 1
>

Uneeeded change...


>
>  /* New extended info filters for IFLA_EXT_MASK */
> diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
> index ddedf21..68df614 100644
> --- a/net/core/gen_stats.c
> +++ b/net/core/gen_stats.c
> @@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int
> type, int tc_stats_type,
>  {
>         memset(d, 0, sizeof(*d));
>
> -       spin_lock_bh(lock);
> +       if(lock)
> +               spin_lock_bh(lock);
>

So this acted up on a SMP kernel?


>         d->lock = lock;
>         if (type)
>                 d->tail = (struct nlattr *)skb_tail_pointer(skb);
> @@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
>                         return -1;
>         }
>
> -       spin_unlock_bh(d->lock);
> +       if(d->lock)
> +               spin_unlock_bh(d->lock);
>         return 0;
>  }
>  EXPORT_SYMBOL(gnet_stats_finish_copy);
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 235e01a..4902812 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -308,6 +308,14 @@ config NET_SCH_PLUG
>           To compile this code as a module, choose M here: the
>           module will be called sch_plug.
>
> +config NET_SCH_BROADCAST_STATS
> +       bool "Enable Qdisc statistics broadcast"
> +       ---help---
> +
> +         Select this option if you want to enable qdisc stats broadcast
> through
> +          netlink multicast. Broadcast happens on packet dequeue, limited
> to the
> +          interval set by the <something> sysctl parameter.
> +
>  comment "Classification"
>

QDISC_BROADCAST_DEQUEUE_STATS maybe?

Elsewhere you've defined the sysctl. Would argue for sysfs...


>
>  config NET_CLS
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index eac7e0e..56490a1 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -28,6 +28,7 @@
>  #include <net/sch_generic.h>
>  #include <net/pkt_sched.h>
>  #include <net/dst.h>
> +#include <net/netlink.h>
>
>  /* Main transmission queue. */
>
> @@ -148,6 +149,73 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc
> *q,
>         return ret;
>  }
>
> +#ifdef CONFIG_NET_SCH_BROADCAST_STATS
> +int qdisc_broadcast_stats(struct Qdisc *q)
> +{
> +       struct tcmsg *tcm;
> +       struct nlmsghdr *nlh;
> +       struct gnet_dump d;
> +       struct sk_buff *skb;
> +       unsigned char *b;
> +
> +       if(!q->dev_queue || !q->dev_queue->dev)
> +               return 0;
>
>
Is there a way to also merely detect if there is a listener present?


> +       printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen
> %d\n",
> +               qdisc_dev(q)->name,
> +               qdisc_dev(q)->ifindex,
> +               q->ops->id,
> +               q->qstats.qlen);
>

Useful for debugging, but invasive for general use, far more heavyweight
than tossing
it into netlink.


> +       skb = alloc_skb(NLMSG_SPACE(1024), GFP_KERNEL);
>

NLMSG_SPACE seems arbitrary, and perhaps this needs to be GFP_ATOMIC?
(But I am totally not an expert down here)


> +       if(!skb)
> +               return -ENOBUFS;
> +       b = skb_tail_pointer(skb);
> +
> +       nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm),
> NLM_F_MULTI);
> +       if (!nlh)
> +               goto out_nlmsg_trim;
> +
> +       tcm = nlmsg_data(nlh);
> +       tcm->tcm_family = AF_UNSPEC;
> +       tcm->tcm__pad1 = 0;
> +       tcm->tcm__pad2 = 0;
> +       tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
> +       tcm->tcm_parent = q->parent;
> +       tcm->tcm_handle = q->handle;
> +       tcm->tcm_info = atomic_read(&q->refcnt);
> +
> +       if (nla_put_string(skb, TCA_KIND, q->ops->id))
> +               goto nla_put_failure;
> +
> +       if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
> +               goto nla_put_failure;
> +
> +       if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
> +               goto nla_put_failure;
> +
> +       if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
> +           gnet_stats_copy_queue(&d, &q->qstats) < 0)
> +               goto nla_put_failure;
> +
> +       if (gnet_stats_finish_copy(&d) < 0)
> +               goto nla_put_failure;
> +
> +       nlh->nlmsg_len = skb_tail_pointer(skb) - b;
> +
> +       rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0);
> +
> +       return 0;
> +
> +nla_put_failure:
> +out_nlmsg_trim:
> +       nlmsg_trim(skb, b);
> +       return -1;
> +
> +}
> +#endif
> +
> +
>  /*
>   * NOTE: Called under qdisc_lock(q) with locally disabled BH.
>   *
> @@ -173,6 +241,9 @@ static inline int qdisc_restart(struct Qdisc *q)
>         struct net_device *dev;
>         spinlock_t *root_lock;
>         struct sk_buff *skb;
> +#ifdef CONFIG_NET_SCH_BROADCAST_STATS
> +       qdisc_broadcast_stats(q);
> +#endif
>

Am thinking this could be a callback in the Qdisc_ops struct...


>         /* Dequeue packet */
>         skb = dequeue_skb(q);
> --
> 1.8.3
>
> _______________________________________________
> Bloat-devel mailing list
> Bloat-devel at lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/bloat-devel
>



-- 
Dave Täht

Fixing bufferbloat with cerowrt:
http://www.teklibre.com/cerowrt/subscribe.html
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.bufferbloat.net/pipermail/bloat-devel/attachments/20130610/b7d9db28/attachment-0002.html>


More information about the Bloat-devel mailing list