It is highly unlikely these patches would be accepted by upstream as allocating a skb in the fast path is probably a no-no. However at least part of them seem generically useful - instrumenting pfifo_fast in particular - and certainly helpful for debugging in general.

I had hoped it would be possible to do this without mucking with locking, tho. :(

In my own case I'd like to have a qdisc_drop_with_stats(skb,sch,"fq_codel",reason) that could be compiled in or out easily, sending the packet headers of the dropped packet to a listener much as you do here. This would be useful for monitoring, have potential as a routing metric, and make it easier to analyze the behavior of various qdisc types, without having to poll and lock via tc.

Also as a drop takes place only when the card is producing delays, it would be possible to get a bandwidth estimate over smaller intervals from it easily, and be less invasive overall...

So thanks for digging into netlink so much and having a sample program. I almost get it now...

A couple more comments below:

On Mon, Jun 10, 2013 at 5:39 AM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:

Email syntax for a RFC PATCH is RFC PATCH, not PATCH RFC

---
 include/uapi/linux/rtnetlink.h | 15 ++++++---
 net/core/gen_stats.c           |  6 ++--
 net/sched/Kconfig              |  8 +++++
 net/sched/sch_generic.c        | 71 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7a2144e..e9d91c5 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -68,6 +68,9 @@ enum {
        RTM_GETQDISC,
 #define RTM_GETQDISC   RTM_GETQDISC

+       RTM_QDISC_STATS,
+#define RTM_QDISC_STATS RTM_QDISC_STATS
+
 
        RTM_NEWTCLASS   = 40,
 #define RTM_NEWTCLASS  RTM_NEWTCLASS
        RTM_DELTCLASS,
@@ -140,7 +143,7 @@ enum {
 #define RTM_NR_FAMILIES        (RTM_NR_MSGTYPES >> 2)
 #define RTM_FAM(cmd)   (((cmd) - RTM_BASE) >> 2)

-/*
+/*
    Generic structure for encapsulation of optional route information.
    It is reminiscent of sockaddr, but with sa_family replaced
    with attribute type.
@@ -180,7 +183,7 @@ struct rtmsg {

        unsigned char           rtm_table;      /* Routing table id */
        unsigned char           rtm_protocol;   /* Routing protocol; see below  */
-       unsigned char           rtm_scope;      /* See below */
+       unsigned char           rtm_scope;      /* See below */
        unsigned char           rtm_type;       /* See below    */


unnecessary change.
 
        unsigned                rtm_flags;
@@ -450,7 +453,7 @@ struct ifinfomsg {
 };

 /********************************************************************
- *             prefix information
+ *             prefix information
  ****/


unnecessary change. try to touch as little as possible... There is an emacs mode for linux indentation, btw
which helps avoid annoyingly messing up existing code.

 struct prefixmsg {
@@ -464,7 +467,7 @@ struct prefixmsg {
        unsigned char   prefix_pad3;
 };

-enum
+enum
 {
        PREFIX_UNSPEC,
        PREFIX_ADDRESS,
@@ -571,6 +574,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_NEIGH          RTNLGRP_NEIGH
        RTNLGRP_TC,
 #define RTNLGRP_TC             RTNLGRP_TC
+       RTNLGRP_TC_STATS,
+#define RTNLGRP_TC_STATS       RTNLGRP_TC_STATS
        RTNLGRP_IPV4_IFADDR,
 #define RTNLGRP_IPV4_IFADDR    RTNLGRP_IPV4_IFADDR
        RTNLGRP_IPV4_MROUTE,

It's not clear to me where this stuff enters userspace or not, and in general it's
safer to append new enums/defines rather than insert them.
 
@@ -625,7 +630,7 @@ struct tcamsg {
 };
 #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */
+#define TCA_ACT_TAB 1 /* attr type must be >=1 */
 #define TCAA_MAX 1

Uneeeded change...
 

 /* New extended info filters for IFLA_EXT_MASK */
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ddedf21..68df614 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 {
        memset(d, 0, sizeof(*d));

-       spin_lock_bh(lock);
+       if(lock)
+               spin_lock_bh(lock);

So this acted up on a SMP kernel?
 
        d->lock = lock;
        if (type)
                d->tail = (struct nlattr *)skb_tail_pointer(skb);
@@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
                        return -1;
        }

-       spin_unlock_bh(d->lock);
+       if(d->lock)
+               spin_unlock_bh(d->lock);
        return 0;
 }
 EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01a..4902812 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -308,6 +308,14 @@ config NET_SCH_PLUG
          To compile this code as a module, choose M here: the
          module will be called sch_plug.

+config NET_SCH_BROADCAST_STATS
+       bool "Enable Qdisc statistics broadcast"
+       ---help---
+
+         Select this option if you want to enable qdisc stats broadcast through
+          netlink multicast. Broadcast happens on packet dequeue, limited to the
+          interval set by the <something> sysctl parameter.
+
 comment "Classification"

QDISC_BROADCAST_DEQUEUE_STATS maybe?

Elsewhere you've defined the sysctl. Would argue for sysfs...
 

 config NET_CLS
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index eac7e0e..56490a1 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
+#include <net/netlink.h>

 /* Main transmission queue. */

@@ -148,6 +149,73 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
        return ret;
 }

+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+int qdisc_broadcast_stats(struct Qdisc *q)
+{
+       struct tcmsg *tcm;
+       struct nlmsghdr *nlh;
+       struct gnet_dump d;
+       struct sk_buff *skb;
+       unsigned char *b;
+
+       if(!q->dev_queue || !q->dev_queue->dev)
+               return 0;


Is there a way to also merely detect if there is a listener present?
 
+       printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen %d\n",
+               qdisc_dev(q)->name,
+               qdisc_dev(q)->ifindex,
+               q->ops->id,
+               q->qstats.qlen);

Useful for debugging, but invasive for general use, far more heavyweight than tossing
it into netlink.
 
+       skb = alloc_skb(NLMSG_SPACE(1024), GFP_KERNEL);

NLMSG_SPACE seems arbitrary, and perhaps this needs to be GFP_ATOMIC?
(But I am totally not an expert down here)
 
+       if(!skb)
+               return -ENOBUFS;
+       b = skb_tail_pointer(skb);
+
+       nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), NLM_F_MULTI);
+       if (!nlh)
+               goto out_nlmsg_trim;
+
+       tcm = nlmsg_data(nlh);
+       tcm->tcm_family = AF_UNSPEC;
+       tcm->tcm__pad1 = 0;
+       tcm->tcm__pad2 = 0;
+       tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+       tcm->tcm_parent = q->parent;
+       tcm->tcm_handle = q->handle;
+       tcm->tcm_info = atomic_read(&q->refcnt);
+
+       if (nla_put_string(skb, TCA_KIND, q->ops->id))
+               goto nla_put_failure;
+
+       if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
+               goto nla_put_failure;
+
+       if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+               goto nla_put_failure;
+
+       if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+           gnet_stats_copy_queue(&d, &q->qstats) < 0)
+               goto nla_put_failure;
+
+       if (gnet_stats_finish_copy(&d) < 0)
+               goto nla_put_failure;
+
+       nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
+       rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0);
+
+       return 0;
+
+nla_put_failure:
+out_nlmsg_trim:
+       nlmsg_trim(skb, b);
+       return -1;
+
+}
+#endif
+
+
 /*
  * NOTE: Called under qdisc_lock(q) with locally disabled BH.
  *
@@ -173,6 +241,9 @@ static inline int qdisc_restart(struct Qdisc *q)
        struct net_device *dev;
        spinlock_t *root_lock;
        struct sk_buff *skb;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+       qdisc_broadcast_stats(q);
+#endif

Am thinking this could be a callback in the Qdisc_ops struct...
 
        /* Dequeue packet */
        skb = dequeue_skb(q);
--
1.8.3

_______________________________________________
Bloat-devel mailing list
Bloat-devel@lists.bufferbloat.net
https://lists.bufferbloat.net/listinfo/bloat-devel



--
Dave Täht

Fixing bufferbloat with cerowrt: http://www.teklibre.com/cerowrt/subscribe.html