It is highly unlikely these patches would be accepted by upstream as allocating a skb in the fast path is probably a no-no. However at least part of them seem generically useful - instrumenting pfifo_fast in particular - and certainly helpful for debugging in general. I had hoped it would be possible to do this without mucking with locking, tho. :( In my own case I'd like to have a qdisc_drop_with_stats(skb,sch,"fq_codel",reason) that could be compiled in or out easily, sending the packet headers of the dropped packet to a listener much as you do here. This would be useful for monitoring, have potential as a routing metric, and make it easier to analyze the behavior of various qdisc types, without having to poll and lock via tc. Also as a drop takes place only when the card is producing delays, it would be possible to get a bandwidth estimate over smaller intervals from it easily, and be less invasive overall... So thanks for digging into netlink so much and having a sample program. I almost get it now... A couple more comments below: On Mon, Jun 10, 2013 at 5:39 AM, Toke Høiland-Jørgensen wrote: Email syntax for a RFC PATCH is RFC PATCH, not PATCH RFC --- > include/uapi/linux/rtnetlink.h | 15 ++++++--- > net/core/gen_stats.c | 6 ++-- > net/sched/Kconfig | 8 +++++ > net/sched/sch_generic.c | 71 > ++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 93 insertions(+), 7 deletions(-) > > diff --git a/include/uapi/linux/rtnetlink.h > b/include/uapi/linux/rtnetlink.h > index 7a2144e..e9d91c5 100644 > --- a/include/uapi/linux/rtnetlink.h > +++ b/include/uapi/linux/rtnetlink.h > @@ -68,6 +68,9 @@ enum { > RTM_GETQDISC, > #define RTM_GETQDISC RTM_GETQDISC > > + RTM_QDISC_STATS, > +#define RTM_QDISC_STATS RTM_QDISC_STATS > + > > RTM_NEWTCLASS = 40, > #define RTM_NEWTCLASS RTM_NEWTCLASS > RTM_DELTCLASS, > @@ -140,7 +143,7 @@ enum { > #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) > #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) > > -/* > +/* > Generic structure for encapsulation of optional route information. > It is reminiscent of sockaddr, but with sa_family replaced > with attribute type. > @@ -180,7 +183,7 @@ struct rtmsg { > > unsigned char rtm_table; /* Routing table id */ > unsigned char rtm_protocol; /* Routing protocol; see > below */ > - unsigned char rtm_scope; /* See below */ > + unsigned char rtm_scope; /* See below */ > unsigned char rtm_type; /* See below */ > > unnecessary change. > unsigned rtm_flags; > @@ -450,7 +453,7 @@ struct ifinfomsg { > }; > > /******************************************************************** > - * prefix information > + * prefix information > ****/ > > unnecessary change. try to touch as little as possible... There is an emacs mode for linux indentation, btw which helps avoid annoyingly messing up existing code. struct prefixmsg { > @@ -464,7 +467,7 @@ struct prefixmsg { > unsigned char prefix_pad3; > }; > > -enum > +enum > { > PREFIX_UNSPEC, > PREFIX_ADDRESS, > @@ -571,6 +574,8 @@ enum rtnetlink_groups { > #define RTNLGRP_NEIGH RTNLGRP_NEIGH > RTNLGRP_TC, > #define RTNLGRP_TC RTNLGRP_TC > + RTNLGRP_TC_STATS, > +#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS > RTNLGRP_IPV4_IFADDR, > #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR > RTNLGRP_IPV4_MROUTE, > It's not clear to me where this stuff enters userspace or not, and in general it's safer to append new enums/defines rather than insert them. > @@ -625,7 +630,7 @@ struct tcamsg { > }; > #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + > NLMSG_ALIGN(sizeof(struct tcamsg)))) > #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) > -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ > +#define TCA_ACT_TAB 1 /* attr type must be >=1 */ > #define TCAA_MAX 1 > Uneeeded change... > > /* New extended info filters for IFLA_EXT_MASK */ > diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c > index ddedf21..68df614 100644 > --- a/net/core/gen_stats.c > +++ b/net/core/gen_stats.c > @@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int > type, int tc_stats_type, > { > memset(d, 0, sizeof(*d)); > > - spin_lock_bh(lock); > + if(lock) > + spin_lock_bh(lock); > So this acted up on a SMP kernel? > d->lock = lock; > if (type) > d->tail = (struct nlattr *)skb_tail_pointer(skb); > @@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d) > return -1; > } > > - spin_unlock_bh(d->lock); > + if(d->lock) > + spin_unlock_bh(d->lock); > return 0; > } > EXPORT_SYMBOL(gnet_stats_finish_copy); > diff --git a/net/sched/Kconfig b/net/sched/Kconfig > index 235e01a..4902812 100644 > --- a/net/sched/Kconfig > +++ b/net/sched/Kconfig > @@ -308,6 +308,14 @@ config NET_SCH_PLUG > To compile this code as a module, choose M here: the > module will be called sch_plug. > > +config NET_SCH_BROADCAST_STATS > + bool "Enable Qdisc statistics broadcast" > + ---help--- > + > + Select this option if you want to enable qdisc stats broadcast > through > + netlink multicast. Broadcast happens on packet dequeue, limited > to the > + interval set by the sysctl parameter. > + > comment "Classification" > QDISC_BROADCAST_DEQUEUE_STATS maybe? Elsewhere you've defined the sysctl. Would argue for sysfs... > > config NET_CLS > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c > index eac7e0e..56490a1 100644 > --- a/net/sched/sch_generic.c > +++ b/net/sched/sch_generic.c > @@ -28,6 +28,7 @@ > #include > #include > #include > +#include > > /* Main transmission queue. */ > > @@ -148,6 +149,73 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc > *q, > return ret; > } > > +#ifdef CONFIG_NET_SCH_BROADCAST_STATS > +int qdisc_broadcast_stats(struct Qdisc *q) > +{ > + struct tcmsg *tcm; > + struct nlmsghdr *nlh; > + struct gnet_dump d; > + struct sk_buff *skb; > + unsigned char *b; > + > + if(!q->dev_queue || !q->dev_queue->dev) > + return 0; > > Is there a way to also merely detect if there is a listener present? > + printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen > %d\n", > + qdisc_dev(q)->name, > + qdisc_dev(q)->ifindex, > + q->ops->id, > + q->qstats.qlen); > Useful for debugging, but invasive for general use, far more heavyweight than tossing it into netlink. > + skb = alloc_skb(NLMSG_SPACE(1024), GFP_KERNEL); > NLMSG_SPACE seems arbitrary, and perhaps this needs to be GFP_ATOMIC? (But I am totally not an expert down here) > + if(!skb) > + return -ENOBUFS; > + b = skb_tail_pointer(skb); > + > + nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), > NLM_F_MULTI); > + if (!nlh) > + goto out_nlmsg_trim; > + > + tcm = nlmsg_data(nlh); > + tcm->tcm_family = AF_UNSPEC; > + tcm->tcm__pad1 = 0; > + tcm->tcm__pad2 = 0; > + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; > + tcm->tcm_parent = q->parent; > + tcm->tcm_handle = q->handle; > + tcm->tcm_info = atomic_read(&q->refcnt); > + > + if (nla_put_string(skb, TCA_KIND, q->ops->id)) > + goto nla_put_failure; > + > + if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0) > + goto nla_put_failure; > + > + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) > + goto nla_put_failure; > + > + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || > + gnet_stats_copy_queue(&d, &q->qstats) < 0) > + goto nla_put_failure; > + > + if (gnet_stats_finish_copy(&d) < 0) > + goto nla_put_failure; > + > + nlh->nlmsg_len = skb_tail_pointer(skb) - b; > + > + rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0); > + > + return 0; > + > +nla_put_failure: > +out_nlmsg_trim: > + nlmsg_trim(skb, b); > + return -1; > + > +} > +#endif > + > + > /* > * NOTE: Called under qdisc_lock(q) with locally disabled BH. > * > @@ -173,6 +241,9 @@ static inline int qdisc_restart(struct Qdisc *q) > struct net_device *dev; > spinlock_t *root_lock; > struct sk_buff *skb; > +#ifdef CONFIG_NET_SCH_BROADCAST_STATS > + qdisc_broadcast_stats(q); > +#endif > Am thinking this could be a callback in the Qdisc_ops struct... > /* Dequeue packet */ > skb = dequeue_skb(q); > -- > 1.8.3 > > _______________________________________________ > Bloat-devel mailing list > Bloat-devel@lists.bufferbloat.net > https://lists.bufferbloat.net/listinfo/bloat-devel > -- Dave Täht Fixing bufferbloat with cerowrt: http://www.teklibre.com/cerowrt/subscribe.html