[RFC PATCH v2] Broadcast qdisc statistics via netlink on packet dequeue.
Toke Høiland-Jørgensen
toke at toke.dk
Mon Jun 17 12:07:27 PDT 2013
This is the second attempt at a patch to broadcast qdisc statistics via
netlink on packet dequeue.
Changes since the first version:
- Cleaned up the patch itself.
- New enum entries in rtnetlink.h are now at the bottom.
- Clean up buffers properly on failure.
- Check if any clients are listening to the netlink group and abort if
none are.
- Use the non-blocking netlink API (I think...)
- Call the main stats broadcast function from sch_direct_xmit(). This
finally makes things work the way they're supposed to (i.e. the function
gets called on every packet dequeue).
Client implementation is at https://github.com/tohojo/qstatsc
Preliminary testing on a virtual machine between host and guests gives
similar network performance and similar CPU load with and without the
patch. Still, I wouldn't recommend turning it on at 10GigE speeds.
Comments greatly appreciated.
-Toke
----- >8 ----------- >8 -----
Add qdisc_stats_broadcast_interval sysctl parameter (in microseconds,
default 200000, i.e. 200 ms), and use it to rate-limit qdisc statistics
broadcasts.
---
include/net/netns/ipv4.h | 4 ++
include/net/sch_generic.h | 4 ++
include/uapi/linux/rtnetlink.h | 5 +++
net/core/gen_stats.c | 6 ++-
net/ipv4/sysctl_net_ipv4.c | 13 ++++++
net/sched/Kconfig | 13 ++++++
net/sched/sch_api.c | 7 ++++
net/sched/sch_generic.c | 89 ++++++++++++++++++++++++++++++++++++++++++
8 files changed, 139 insertions(+), 2 deletions(-)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2ba9de8..ff69564 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -64,6 +64,10 @@ struct netns_ipv4 {
int sysctl_tcp_ecn;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ int sysctl_qdisc_stats_broadcast_interval;
+#endif
+
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f10818f..92f26cf 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -78,6 +78,10 @@ struct Qdisc {
struct netdev_queue *dev_queue;
struct Qdisc *next_sched;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ u64 last_stats_broadcast;
+#endif
+
struct sk_buff *gso_skb;
/*
* For performance sake on SMP, we put highly modified fields at the end
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index da0a60e..edee5a9 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -132,6 +132,9 @@ enum {
RTM_GETMDB = 86,
#define RTM_GETMDB RTM_GETMDB
+ RTM_QDISC_STATS,
+#define RTM_QDISC_STATS RTM_QDISC_STATS
+
__RTM_MAX,
#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
};
@@ -613,6 +616,8 @@ enum rtnetlink_groups {
#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
RTNLGRP_MDB,
#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_TC_STATS,
+#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS
__RTNLGRP_MAX
};
#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ddedf21..68df614 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
{
memset(d, 0, sizeof(*d));
- spin_lock_bh(lock);
+ if(lock)
+ spin_lock_bh(lock);
d->lock = lock;
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb);
@@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
return -1;
}
- spin_unlock_bh(d->lock);
+ if(d->lock)
+ spin_unlock_bh(d->lock);
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29..896429c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -851,6 +851,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_tcp_mem,
},
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ {
+ .procname = "qdisc_stats_broadcast_interval",
+ .data = &init_net.ipv4.sysctl_qdisc_stats_broadcast_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
{ }
};
@@ -880,6 +889,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
&net->ipv4.sysctl_ping_group_range;
table[7].data =
&net->ipv4.sysctl_tcp_ecn;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ table[9].data =
+ &net->ipv4.sysctl_qdisc_stats_broadcast_interval;
+#endif
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01a..03958f0 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -308,6 +308,19 @@ config NET_SCH_PLUG
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_BROADCAST_STATS
+ bool "Enable Qdisc statistics broadcast"
+ ---help---
+
+ Select this option if you want to enable qdisc stats broadcast through
+ netlink multicast. Broadcast happens on packet dequeue, limited to the
+ interval set by the qdisc_stats_broadcast_interval sysctl parameter.
+
+ The statistics will be broadcast to the RTNLGRP_TC_STATS multicast group
+ and the message type is RTM_QDISC_STATS.
+
+ See https://github.com/tohojo/qstatsc for a sample client implementation.
+
comment "Classification"
config NET_CLS
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c297e2a..154b316 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1772,6 +1772,10 @@ static int __net_init psched_net_init(struct net *net)
if (e == NULL)
return -ENOMEM;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000;
+#endif
+
return 0;
}
@@ -1782,6 +1786,9 @@ static void __net_exit psched_net_exit(struct net *net)
#else
static int __net_init psched_net_init(struct net *net)
{
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000; /* usec, i.e. 200 ms; compared against qdisc_stats_time() */
+#endif
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index eac7e0e..b220705 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <net/netlink.h>
/* Main transmission queue. */
@@ -101,6 +102,88 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
return ret;
}
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+/*
+ * Timestamp used for rate-limiting stats broadcasts: the ktime_get()
+ * reading converted from nanoseconds to microseconds.
+ */
+static inline u64 qdisc_stats_time(void)
+{
+	u64 now = ktime_to_ns(ktime_get());
+
+	/* do_div() leaves the quotient in its first argument. */
+	do_div(now, NSEC_PER_USEC);
+
+	return now;
+}
+
+/*
+ * Broadcast the current statistics of qdisc @q to the RTNLGRP_TC_STATS
+ * netlink multicast group as one RTM_QDISC_STATS message.
+ *
+ * The message carries a struct tcmsg header followed by a TCA_KIND string
+ * and a TCA_STATS2 nest holding the qdisc-specific, basic and queue stats.
+ *
+ * Nothing is sent when @q has no device, when nobody is listening on the
+ * group, or when fewer than sysctl_qdisc_stats_broadcast_interval
+ * microseconds have passed since the last successful broadcast for @q.
+ *
+ * Called from sch_direct_xmit() before the root lock is dropped, so only
+ * non-sleeping allocations are used here.
+ *
+ * Returns 0 if the message was handed to netlink or deliberately skipped,
+ * -ENOBUFS if no skb could be allocated, and -EMSGSIZE if the message
+ * could not be assembled in the buffer.
+ */
+static int qdisc_broadcast_stats(struct Qdisc *q)
+{
+	struct tcmsg *tcm;
+	struct nlmsghdr *nlh;
+	struct gnet_dump d;
+	struct sk_buff *skb;
+	struct net *net;
+	u64 time;
+
+	if (!q->dev_queue || !q->dev_queue->dev)
+		return 0;
+
+	net = dev_net(qdisc_dev(q));
+
+	/* Cheap exit: don't build a message nobody will receive. */
+	if (!netlink_has_listeners(net->rtnl, RTNLGRP_TC_STATS))
+		return 0;
+
+	/* Rate limit; qdisc_stats_time() and the sysctl are in microseconds. */
+	time = qdisc_stats_time();
+	if (time < q->last_stats_broadcast +
+	    net->ipv4.sysctl_qdisc_stats_broadcast_interval)
+		return 0;
+
+	skb = alloc_skb(NLMSG_SPACE(1024), GFP_ATOMIC);
+	if (!skb)
+		return -ENOBUFS;
+
+	/*
+	 * NOTE(review): NLM_F_MULTI is kept from the original patch, but this
+	 * is a stand-alone notification with no terminating NLMSG_DONE.
+	 * Confirm whether the flag should be 0 before changing it, since
+	 * existing clients may depend on the current wire format.
+	 */
+	nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), NLM_F_MULTI);
+	if (!nlh)
+		goto out_free;
+
+	tcm = nlmsg_data(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm__pad1 = 0;
+	tcm->tcm__pad2 = 0;
+	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+	tcm->tcm_parent = q->parent;
+	tcm->tcm_handle = q->handle;
+	tcm->tcm_info = atomic_read(&q->refcnt);
+
+	if (nla_put_string(skb, TCA_KIND, q->ops->id))
+		goto out_free;
+
+	/*
+	 * No stats lock is passed (NULL); this relies on the NULL checks this
+	 * patch adds to gnet_stats_start_copy_compat() and
+	 * gnet_stats_finish_copy().
+	 * NOTE(review): verify that the gnet_stats_copy_*() error paths also
+	 * tolerate a NULL d->lock.
+	 */
+	if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
+		goto out_free;
+
+	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+		goto out_free;
+
+	/* Refresh the reported queue length from the live queue. */
+	q->qstats.qlen = q->q.qlen;
+	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
+		goto out_free;
+
+	if (gnet_stats_finish_copy(&d) < 0)
+		goto out_free;
+
+	/* Finalize nlmsg_len instead of computing it by hand. */
+	nlmsg_end(skb, nlh);
+
+	/*
+	 * The skb is handed off here and is not freed on this path. Pass an
+	 * explicit non-sleeping allocation mode rather than a bare 0.
+	 */
+	nlmsg_notify(net->rtnl, skb, 0, RTNLGRP_TC_STATS, 0, GFP_ATOMIC);
+
+	q->last_stats_broadcast = time;
+
+	return 0;
+
+out_free:
+	kfree_skb(skb);
+	return -EMSGSIZE;
+}
+#endif
+
/*
* Transmit one skb, and handle the return status as required. Holding the
* __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
@@ -115,6 +198,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
spinlock_t *root_lock)
{
int ret = NETDEV_TX_BUSY;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ qdisc_broadcast_stats(q);
+#endif
/* And release qdisc */
spin_unlock(root_lock);
@@ -565,6 +651,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ sch->last_stats_broadcast = qdisc_stats_time();
+#endif
dev_hold(dev);
atomic_set(&sch->refcnt, 1);
--
1.8.3.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 489 bytes
Desc: not available
URL: <https://lists.bufferbloat.net/pipermail/bloat-devel/attachments/20130617/1d8da425/attachment.pgp>
More information about the Bloat-devel
mailing list