From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail.tohojo.dk (mail.tohojo.dk [188.40.53.186]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by huchra.bufferbloat.net (Postfix) with ESMTPS id DDBE721F14C for ; Mon, 17 Jun 2013 12:07:34 -0700 (PDT) Received: from alrua-desktop.borgediget.toke.dk (unknown [10.42.3.5]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by mail.tohojo.dk (Postfix) with ESMTPSA id CB1A81EC0D98 for ; Mon, 17 Jun 2013 21:07:31 +0200 (CEST) Received: by alrua-desktop.borgediget.toke.dk (Postfix, from userid 1000) id B3DACF474; Mon, 17 Jun 2013 21:07:30 +0200 (CEST) From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= To: bloat-devel@lists.bufferbloat.net Subject: [RFC PATCH v2] Broadcast qdisc statistics via netlink on packet dequeue. In-Reply-To: <1370867989-7318-1-git-send-email-toke@toke.dk> References: <1370867989-7318-1-git-send-email-toke@toke.dk> Date: Mon, 17 Jun 2013 21:07:27 +0200 Message-ID: <874ncwtv2o.fsf@toke.dk> User-Agent: Gnus/5.130008 (Ma Gnus v0.8) Emacs/24.3 (gnu/linux) MIME-Version: 1.0 Content-Type: multipart/signed; boundary="=-=-="; micalg=pgp-sha1; protocol="application/pgp-signature" X-BeenThere: bloat-devel@lists.bufferbloat.net X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Developers working on AQM, device drivers, and networking stacks" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 17 Jun 2013 19:07:35 -0000 --=-=-= Content-Type: text/plain Content-Transfer-Encoding: quoted-printable This is the second attempt at a patch to broadcast qdisc statistics via netlink on packet dequeue. Changes since the first version: =2D Cleaned up the patch itself. =2D New enum entries in rtnetlink.h are now at the bottom. =2D Clean up buffers properly on failure. =2D Check if any clients are listening to the netlink group and abort if none are. =2D Use the non-blocking netlink API (I think...) =2D Call the main stats broadcast function from sch_direct_xmit(). Finally makes things work the way they're supposed to (i.e. it gets called on every packet dequeue). Client implementation is at https://github.com/tohojo/qstatsc Preliminary testing on a virtual machine between host and guests gives similar network performance and similar CPU load with and without the patch. Still, I wouldn't recommend turning it on at 10GigE speeds. Comments greatly appreciated. =2DToke =2D---- >8 ----------- >8 ----- Add qdisc_stats_broadcast_interval sysctl parameter, and use it to limit stats broadcast interval. =2D-- include/net/netns/ipv4.h | 4 ++ include/net/sch_generic.h | 4 ++ include/uapi/linux/rtnetlink.h | 5 +++ net/core/gen_stats.c | 6 ++- net/ipv4/sysctl_net_ipv4.c | 13 ++++++ net/sched/Kconfig | 13 ++++++ net/sched/sch_api.c | 7 ++++ net/sched/sch_generic.c | 89 ++++++++++++++++++++++++++++++++++++++= ++++ 8 files changed, 139 insertions(+), 2 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2ba9de8..ff69564 100644 =2D-- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -64,6 +64,10 @@ struct netns_ipv4 { =20 int sysctl_tcp_ecn; =20 +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + int sysctl_qdisc_stats_broadcast_interval; +#endif + kgid_t sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; =20 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f10818f..92f26cf 100644 =2D-- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -78,6 +78,10 @@ struct Qdisc { struct netdev_queue *dev_queue; struct Qdisc *next_sched; =20 +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + u64 last_stats_broadcast; +#endif + struct sk_buff *gso_skb; /* * For performance sake on SMP, we put highly modified fields at the end diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index da0a60e..edee5a9 100644 =2D-- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -132,6 +132,9 @@ enum { RTM_GETMDB =3D 86, #define RTM_GETMDB RTM_GETMDB =20 + RTM_QDISC_STATS, +#define RTM_QDISC_STATS RTM_QDISC_STATS + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -613,6 +616,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF RTNLGRP_MDB, #define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_TC_STATS, +#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ddedf21..68df614 100644 =2D-- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int typ= e, int tc_stats_type, { memset(d, 0, sizeof(*d)); =20 =2D spin_lock_bh(lock); + if(lock) + spin_lock_bh(lock); d->lock =3D lock; if (type) d->tail =3D (struct nlattr *)skb_tail_pointer(skb); @@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } =20 =2D spin_unlock_bh(d->lock); + if(d->lock) + spin_unlock_bh(d->lock); return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 960fd29..896429c 100644 =2D-- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -851,6 +851,15 @@ static struct ctl_table ipv4_net_table[] =3D { .mode =3D 0644, .proc_handler =3D ipv4_tcp_mem, }, +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + { + .procname =3D "qdisc_stats_broadcast_interval", + .data =3D &init_net.ipv4.sysctl_qdisc_stats_broadcast_interval, + .maxlen =3D sizeof(int), + .mode =3D 0644, + .proc_handler =3D proc_dointvec + }, +#endif { } }; =20 @@ -880,6 +889,10 @@ static __net_init int ipv4_sysctl_init_net(struct net = *net) &net->ipv4.sysctl_ping_group_range; table[7].data =3D &net->ipv4.sysctl_tcp_ecn; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + table[9].data =3D + &net->ipv4.sysctl_qdisc_stats_broadcast_interval; +#endif =20 /* Don't export sysctls to unprivileged users */ if (net->user_ns !=3D &init_user_ns) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 235e01a..03958f0 100644 =2D-- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -308,6 +308,19 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. =20 +config NET_SCH_BROADCAST_STATS + bool "Enable Qdisc statistics broadcast" + ---help--- + + Select this option if you want to enable qdisc stats broadcast through + netlink multicast. Broadcast happens on packet dequeue, limited = to the + interval set by the qdisc_stats_broadcast_interval sysctl parame= ter. + + The statistics will be broadcast to the RTNLGRP_TC_STATS multica= st group + and the message type is RTM_QDISC_STATS. + + See https://github.com/tohojo/qstatsc for a sample client implem= entation. + comment "Classification" =20 config NET_CLS diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c297e2a..154b316 100644 =2D-- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1772,6 +1772,10 @@ static int __net_init psched_net_init(struct net *ne= t) if (e =3D=3D NULL) return -ENOMEM; =20 +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + net->ipv4.sysctl_qdisc_stats_broadcast_interval =3D 200000; +#endif + return 0; } =20 @@ -1782,6 +1786,9 @@ static void __net_exit psched_net_exit(struct net *ne= t) #else static int __net_init psched_net_init(struct net *net) { +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + net->ipv4.sysctl_qdisc_stats_broadcast_interval =3D 200000; +#endif return 0; } =20 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eac7e0e..b220705 100644 =2D-- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -28,6 +28,7 @@ #include #include #include +#include =20 /* Main transmission queue. */ =20 @@ -101,6 +102,88 @@ static inline int handle_dev_cpu_collision(struct sk_b= uff *skb, return ret; } =20 +#ifdef CONFIG_NET_SCH_BROADCAST_STATS +static inline u64 qdisc_stats_time(void) +{ + u64 ns =3D ktime_to_ns(ktime_get()); + do_div(ns, NSEC_PER_USEC); + return ns; +} + +static int qdisc_broadcast_stats(struct Qdisc *q) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + struct gnet_dump d; + struct sk_buff *skb; + struct net *net; + unsigned char *b; + u64 time; + + if(!q->dev_queue || !q->dev_queue->dev) + return 0; + + net =3D dev_net(qdisc_dev(q)); + + if(!netlink_has_listeners(net->rtnl, RTNLGRP_TC_STATS)) + return 0; + + time =3D qdisc_stats_time(); + if(time < q->last_stats_broadcast + + net->ipv4.sysctl_qdisc_stats_broadcast_interval) + return 0; + + skb =3D alloc_skb(NLMSG_SPACE(1024), GFP_ATOMIC); + if(!skb) + return -ENOBUFS; + b =3D skb_tail_pointer(skb); + + nlh =3D nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), NLM_F_MULTI); + if (!nlh) + goto out_free; + + tcm =3D nlmsg_data(nlh); + tcm->tcm_family =3D AF_UNSPEC; + tcm->tcm__pad1 =3D 0; + tcm->tcm__pad2 =3D 0; + tcm->tcm_ifindex =3D qdisc_dev(q)->ifindex; + tcm->tcm_parent =3D q->parent; + tcm->tcm_handle =3D q->handle; + tcm->tcm_info =3D atomic_read(&q->refcnt); + + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; + + if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0) + goto nla_put_failure; + + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) + goto nla_put_failure; + + q->qstats.qlen =3D q->q.qlen; + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || + gnet_stats_copy_queue(&d, &q->qstats) < 0) + goto nla_put_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto nla_put_failure; + + nlh->nlmsg_len =3D skb_tail_pointer(skb) - b; + + nlmsg_notify(net->rtnl, skb, 0, RTNLGRP_TC_STATS, 0, 0); + + q->last_stats_broadcast =3D time; + + return 0; + +nla_put_failure: +out_free: + kfree_skb(skb); + return -1; + +} +#endif + /* * Transmit one skb, and handle the return status as required. Holding the * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this @@ -115,6 +198,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *= q, spinlock_t *root_lock) { int ret =3D NETDEV_TX_BUSY; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + qdisc_broadcast_stats(q); +#endif =20 /* And release qdisc */ spin_unlock(root_lock); @@ -565,6 +651,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queu= e, sch->enqueue =3D ops->enqueue; sch->dequeue =3D ops->dequeue; sch->dev_queue =3D dev_queue; +#ifdef CONFIG_NET_SCH_BROADCAST_STATS + sch->last_stats_broadcast =3D qdisc_stats_time(); +#endif dev_hold(dev); atomic_set(&sch->refcnt, 1); =20 =2D-=20 1.8.3.1 --=-=-= Content-Type: application/pgp-signature; name="signature.asc" -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.20 (GNU/Linux) iQEcBAEBAgAGBQJRv15vAAoJEENeEGz1+utPehwH/jctecPBdHX2N769EA9GWG+n 38HubMGud59kfXfog7z1FgZOOME6gXs6Qj55AaT6azhcNRUGzkroWPZPTAHELFZ8 OtpGbOCpBQRpfBXsdfrmuMqQVFbE9THUbhLY+qksfMYxV3lBM0QfaOvLi/AEFnFp l/eL0NIxnJVpLBLmAMUEBtHz7FjUwZmNSzONcJlZQsqscXHLrlN2gf5KNDOJMJKN GHWW2+6MbPtsooWquOI9aWJWPxDbMJr5yxpSK1FTBKT8VYyHx/WH1c7PEyYA3Suf w6FUFQBuvYmmZTec68Rl9nns/lZrsdaTvZC31BVjzIYgS4nhSl7OH9W7nRVEGUY= =2ayh -----END PGP SIGNATURE----- --=-=-=--