This is the second attempt at a patch to broadcast qdisc statistics via netlink on packet dequeue. Changes since the first version: - Cleaned up the patch itself. - New enum entries in rtnetlink.h are now at the bottom. - Clean up buffers properly on failure. - Check if any clients are listening to the netlink group and abort if none are. - Use the non-blocking netlink API (I think...) - Call the main stats broadcast function from sch_direct_xmit(). This finally makes things work the way they're supposed to (i.e. it gets called on every packet dequeue). A sample client implementation is available (https://github.com/tohojo/qstatsc). Preliminary testing on a virtual machine between host and guests gives similar network performance and similar CPU load with and without the patch. Still, I wouldn't recommend turning it on at 10GigE speeds. Comments greatly appreciated. -Toke ----- >8 ----------- >8 ----- Broadcast qdisc statistics via netlink multicast on packet dequeue (message type RTM_QDISC_STATS, multicast group RTNLGRP_TC_STATS), guarded by CONFIG_NET_SCH_BROADCAST_STATS. Add the qdisc_stats_broadcast_interval sysctl parameter (in microseconds; default 200000, i.e. 200 ms) and use it to limit how often each qdisc broadcasts its stats. --- include/net/netns/ipv4.h | 4 ++ include/net/sch_generic.h | 4 ++ include/uapi/linux/rtnetlink.h | 5 +++ net/core/gen_stats.c | 6 ++- net/ipv4/sysctl_net_ipv4.c | 13 ++++++ net/sched/Kconfig | 13 ++++++ net/sched/sch_api.c | 7 ++++ net/sched/sch_generic.c | 89 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 139 insertions(+), 2 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2ba9de8..ff69564 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -64,6 +64,10 @@ struct netns_ipv4 { int sysctl_tcp_ecn; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + int sysctl_qdisc_stats_broadcast_interval; +#endif + kgid_t sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f10818f..92f26cf 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -78,6 +78,10 @@ struct Qdisc { struct netdev_queue *dev_queue; struct Qdisc *next_sched; +#ifdef 
CONFIG_NET_SCH_BROADCAST_STATS + u64 last_stats_broadcast; +#endif + struct sk_buff *gso_skb; /* * For performance sake on SMP, we put highly modified fields at the end diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index da0a60e..edee5a9 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -132,6 +132,9 @@ enum { RTM_GETMDB = 86, #define RTM_GETMDB RTM_GETMDB + RTM_QDISC_STATS, +#define RTM_QDISC_STATS RTM_QDISC_STATS + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -613,6 +616,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF RTNLGRP_MDB, #define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_TC_STATS, +#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ddedf21..68df614 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, { memset(d, 0, sizeof(*d)); - spin_lock_bh(lock); + if(lock) + spin_lock_bh(lock); d->lock = lock; if (type) d->tail = (struct nlattr *)skb_tail_pointer(skb); @@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } - spin_unlock_bh(d->lock); + if(d->lock) + spin_unlock_bh(d->lock); return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 960fd29..896429c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -851,6 +851,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_tcp_mem, }, +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + { + .procname = "qdisc_stats_broadcast_interval", + .data = &init_net.ipv4.sysctl_qdisc_stats_broadcast_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { } }; @@ -880,6 +889,10 @@ static __net_init int ipv4_sysctl_init_net(struct net 
*net) &net->ipv4.sysctl_ping_group_range; table[7].data = &net->ipv4.sysctl_tcp_ecn; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + table[9].data = + &net->ipv4.sysctl_qdisc_stats_broadcast_interval; +#endif /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 235e01a..03958f0 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -308,6 +308,19 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. +config NET_SCH_BROADCAST_STATS + bool "Enable Qdisc statistics broadcast" + ---help--- + + Select this option if you want to enable qdisc stats broadcast through + netlink multicast. Broadcast happens on packet dequeue, limited to the + interval set by the qdisc_stats_broadcast_interval sysctl parameter. + + The statistics will be broadcast to the RTNLGRP_TC_STATS multicast group + and the message type is RTM_QDISC_STATS. + + See https://github.com/tohojo/qstatsc for a sample client implementation. + comment "Classification" config NET_CLS diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c297e2a..154b316 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1772,6 +1772,10 @@ static int __net_init psched_net_init(struct net *net) if (e == NULL) return -ENOMEM; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000; +#endif + return 0; } @@ -1782,6 +1786,9 @@ static void __net_exit psched_net_exit(struct net *net) #else static int __net_init psched_net_init(struct net *net) { +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000; +#endif return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eac7e0e..b220705 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -28,6 +28,7 @@ #include #include #include +#include /* Main transmission queue. 
*/ @@ -101,6 +102,88 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, return ret; } +#ifdef CONFIG_NET_SCH_BROADCAST_STATS +static inline u64 qdisc_stats_time(void) +{ + u64 ns = ktime_to_ns(ktime_get()); + do_div(ns, NSEC_PER_USEC); + return ns; +} + +static int qdisc_broadcast_stats(struct Qdisc *q) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + struct gnet_dump d; + struct sk_buff *skb; + struct net *net; + unsigned char *b; + u64 time; + + if(!q->dev_queue || !q->dev_queue->dev) + return 0; + + net = dev_net(qdisc_dev(q)); + + if(!netlink_has_listeners(net->rtnl, RTNLGRP_TC_STATS)) + return 0; + + time = qdisc_stats_time(); + if(time < q->last_stats_broadcast + + net->ipv4.sysctl_qdisc_stats_broadcast_interval) + return 0; + + skb = alloc_skb(NLMSG_SPACE(1024), GFP_ATOMIC); + if(!skb) + return -ENOBUFS; + b = skb_tail_pointer(skb); + + nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), NLM_F_MULTI); + if (!nlh) + goto out_free; + + tcm = nlmsg_data(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = q->parent; + tcm->tcm_handle = q->handle; + tcm->tcm_info = atomic_read(&q->refcnt); + + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; + + if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0) + goto nla_put_failure; + + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) + goto nla_put_failure; + + q->qstats.qlen = q->q.qlen; + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || + gnet_stats_copy_queue(&d, &q->qstats) < 0) + goto nla_put_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + + nlmsg_notify(net->rtnl, skb, 0, RTNLGRP_TC_STATS, 0, 0); + + q->last_stats_broadcast = time; + + return 0; + +nla_put_failure: +out_free: + kfree_skb(skb); + return -1; + +} +#endif + /* * Transmit one skb, and handle the return status as required. 
Holding the * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this @@ -115,6 +198,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, spinlock_t *root_lock) { int ret = NETDEV_TX_BUSY; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + qdisc_broadcast_stats(q); +#endif /* And release qdisc */ spin_unlock(root_lock); @@ -565,6 +651,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + sch->last_stats_broadcast = qdisc_stats_time(); +#endif dev_hold(dev); atomic_set(&sch->refcnt, 1); -- 1.8.3.1