* [PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue.
2013-06-10 12:39 [PATCH RFC 0/3] Broadcasting qdisc statistics via netlink Toke Høiland-Jørgensen
@ 2013-06-10 12:39 ` Toke Høiland-Jørgensen
2013-06-10 17:20 ` Dave Taht
2013-06-10 12:39 ` [PATCH RFC 2/3] Add qdisc_stats_broadcast_interval sysctl parameter, and use it to limit stats broadcast interval Toke Høiland-Jørgensen
` (2 subsequent siblings)
3 siblings, 1 reply; 7+ messages in thread
From: Toke Høiland-Jørgensen @ 2013-06-10 12:39 UTC (permalink / raw)
To: bloat-devel
---
include/uapi/linux/rtnetlink.h | 15 ++++++---
net/core/gen_stats.c | 6 ++--
net/sched/Kconfig | 8 +++++
net/sched/sch_generic.c | 71 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 93 insertions(+), 7 deletions(-)
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7a2144e..e9d91c5 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -68,6 +68,9 @@ enum {
RTM_GETQDISC,
#define RTM_GETQDISC RTM_GETQDISC
+ RTM_QDISC_STATS,
+#define RTM_QDISC_STATS RTM_QDISC_STATS
+
RTM_NEWTCLASS = 40,
#define RTM_NEWTCLASS RTM_NEWTCLASS
RTM_DELTCLASS,
@@ -140,7 +143,7 @@ enum {
#define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2)
#define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2)
-/*
+/*
Generic structure for encapsulation of optional route information.
It is reminiscent of sockaddr, but with sa_family replaced
with attribute type.
@@ -180,7 +183,7 @@ struct rtmsg {
unsigned char rtm_table; /* Routing table id */
unsigned char rtm_protocol; /* Routing protocol; see below */
- unsigned char rtm_scope; /* See below */
+ unsigned char rtm_scope; /* See below */
unsigned char rtm_type; /* See below */
unsigned rtm_flags;
@@ -450,7 +453,7 @@ struct ifinfomsg {
};
/********************************************************************
- * prefix information
+ * prefix information
****/
struct prefixmsg {
@@ -464,7 +467,7 @@ struct prefixmsg {
unsigned char prefix_pad3;
};
-enum
+enum
{
PREFIX_UNSPEC,
PREFIX_ADDRESS,
@@ -571,6 +574,8 @@ enum rtnetlink_groups {
#define RTNLGRP_NEIGH RTNLGRP_NEIGH
RTNLGRP_TC,
#define RTNLGRP_TC RTNLGRP_TC
+ RTNLGRP_TC_STATS,
+#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS
RTNLGRP_IPV4_IFADDR,
#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
RTNLGRP_IPV4_MROUTE,
@@ -625,7 +630,7 @@ struct tcamsg {
};
#define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
#define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */
+#define TCA_ACT_TAB 1 /* attr type must be >=1 */
#define TCAA_MAX 1
/* New extended info filters for IFLA_EXT_MASK */
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ddedf21..68df614 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
{
memset(d, 0, sizeof(*d));
- spin_lock_bh(lock);
+ if(lock)
+ spin_lock_bh(lock);
d->lock = lock;
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb);
@@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
return -1;
}
- spin_unlock_bh(d->lock);
+ if(d->lock)
+ spin_unlock_bh(d->lock);
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01a..4902812 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -308,6 +308,14 @@ config NET_SCH_PLUG
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_BROADCAST_STATS
+ bool "Enable Qdisc statistics broadcast"
+ ---help---
+
+ Select this option if you want to enable qdisc stats broadcast through
+ netlink multicast. Broadcast happens on packet dequeue, limited to the
+ interval set by the <something> sysctl parameter.
+
comment "Classification"
config NET_CLS
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index eac7e0e..56490a1 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <net/netlink.h>
/* Main transmission queue. */
@@ -148,6 +149,73 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
return ret;
}
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+int qdisc_broadcast_stats(struct Qdisc *q)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ struct gnet_dump d;
+ struct sk_buff *skb;
+ unsigned char *b;
+
+ if(!q->dev_queue || !q->dev_queue->dev)
+ return 0;
+
+ printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen %d\n",
+ qdisc_dev(q)->name,
+ qdisc_dev(q)->ifindex,
+ q->ops->id,
+ q->qstats.qlen);
+
+ skb = alloc_skb(NLMSG_SPACE(1024), GFP_KERNEL);
+ if(!skb)
+ return -ENOBUFS;
+ b = skb_tail_pointer(skb);
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), NLM_F_MULTI);
+ if (!nlh)
+ goto out_nlmsg_trim;
+
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = q->parent;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = atomic_read(&q->refcnt);
+
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
+ goto nla_put_failure;
+
+ if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+ gnet_stats_copy_queue(&d, &q->qstats) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
+ rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0);
+
+ return 0;
+
+nla_put_failure:
+out_nlmsg_trim:
+ nlmsg_trim(skb, b);
+ return -1;
+
+}
+#endif
+
+
/*
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
*
@@ -173,6 +241,9 @@ static inline int qdisc_restart(struct Qdisc *q)
struct net_device *dev;
spinlock_t *root_lock;
struct sk_buff *skb;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ qdisc_broadcast_stats(q);
+#endif
/* Dequeue packet */
skb = dequeue_skb(q);
--
1.8.3
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue.
2013-06-10 12:39 ` [PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue Toke Høiland-Jørgensen
@ 2013-06-10 17:20 ` Dave Taht
2013-06-11 7:15 ` Toke Høiland-Jørgensen
0 siblings, 1 reply; 7+ messages in thread
From: Dave Taht @ 2013-06-10 17:20 UTC (permalink / raw)
To: Toke Høiland-Jørgensen; +Cc: bloat-devel
[-- Attachment #1: Type: text/plain, Size: 9634 bytes --]
It is highly unlikely these patches would be accepted by upstream as
allocating a skb in the fast path is probably a no-no. However at least
part of them seem generically useful - instrumenting pfifo_fast in
particular - and certainly helpful for debugging in general.
I had hoped it would be possible to do this without mucking with locking,
tho. :(
In my own case I'd like to have a
qdisc_drop_with_stats(skb,sch,"fq_codel",reason) that could be compiled in
or out easily, sending the packet headers of the dropped packet to a
listener much as you do here. This would be useful for monitoring, have
potential as a routing metric, and make it easier to analyze the behavior
of various qdisc types, without having to poll and lock via tc.
Also as a drop takes place only when the card is producing delays, it would
be possible to get a bandwidth estimate over smaller intervals from it
easily, and be less invasive overall...
So thanks for digging into netlink so much and having a sample program. I
almost get it now...
A couple more comments below:
On Mon, Jun 10, 2013 at 5:39 AM, Toke Høiland-Jørgensen <toke@toke.dk>wrote:
Email syntax for a RFC PATCH is RFC PATCH, not PATCH RFC
---
> include/uapi/linux/rtnetlink.h | 15 ++++++---
> net/core/gen_stats.c | 6 ++--
> net/sched/Kconfig | 8 +++++
> net/sched/sch_generic.c | 71
> ++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 93 insertions(+), 7 deletions(-)
>
> diff --git a/include/uapi/linux/rtnetlink.h
> b/include/uapi/linux/rtnetlink.h
> index 7a2144e..e9d91c5 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -68,6 +68,9 @@ enum {
> RTM_GETQDISC,
> #define RTM_GETQDISC RTM_GETQDISC
>
> + RTM_QDISC_STATS,
> +#define RTM_QDISC_STATS RTM_QDISC_STATS
> +
>
> RTM_NEWTCLASS = 40,
> #define RTM_NEWTCLASS RTM_NEWTCLASS
> RTM_DELTCLASS,
> @@ -140,7 +143,7 @@ enum {
> #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2)
> #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2)
>
> -/*
> +/*
> Generic structure for encapsulation of optional route information.
> It is reminiscent of sockaddr, but with sa_family replaced
> with attribute type.
> @@ -180,7 +183,7 @@ struct rtmsg {
>
> unsigned char rtm_table; /* Routing table id */
> unsigned char rtm_protocol; /* Routing protocol; see
> below */
> - unsigned char rtm_scope; /* See below */
> + unsigned char rtm_scope; /* See below */
> unsigned char rtm_type; /* See below */
>
>
unnecessary change.
> unsigned rtm_flags;
> @@ -450,7 +453,7 @@ struct ifinfomsg {
> };
>
> /********************************************************************
> - * prefix information
> + * prefix information
> ****/
>
>
unnecessary change. try to touch as little as possible... There is an emacs
mode for linux indentation, btw
which helps avoid annoyingly messing up existing code.
struct prefixmsg {
> @@ -464,7 +467,7 @@ struct prefixmsg {
> unsigned char prefix_pad3;
> };
>
> -enum
> +enum
> {
> PREFIX_UNSPEC,
> PREFIX_ADDRESS,
> @@ -571,6 +574,8 @@ enum rtnetlink_groups {
> #define RTNLGRP_NEIGH RTNLGRP_NEIGH
> RTNLGRP_TC,
> #define RTNLGRP_TC RTNLGRP_TC
> + RTNLGRP_TC_STATS,
> +#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS
> RTNLGRP_IPV4_IFADDR,
> #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
> RTNLGRP_IPV4_MROUTE,
>
It's not clear to me where this stuff enters userspace or not, and in
general it's
safer to append new enums/defines rather than insert them.
> @@ -625,7 +630,7 @@ struct tcamsg {
> };
> #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) +
> NLMSG_ALIGN(sizeof(struct tcamsg))))
> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */
> +#define TCA_ACT_TAB 1 /* attr type must be >=1 */
> #define TCAA_MAX 1
>
Uneeeded change...
>
> /* New extended info filters for IFLA_EXT_MASK */
> diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
> index ddedf21..68df614 100644
> --- a/net/core/gen_stats.c
> +++ b/net/core/gen_stats.c
> @@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int
> type, int tc_stats_type,
> {
> memset(d, 0, sizeof(*d));
>
> - spin_lock_bh(lock);
> + if(lock)
> + spin_lock_bh(lock);
>
So this acted up on a SMP kernel?
> d->lock = lock;
> if (type)
> d->tail = (struct nlattr *)skb_tail_pointer(skb);
> @@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
> return -1;
> }
>
> - spin_unlock_bh(d->lock);
> + if(d->lock)
> + spin_unlock_bh(d->lock);
> return 0;
> }
> EXPORT_SYMBOL(gnet_stats_finish_copy);
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 235e01a..4902812 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -308,6 +308,14 @@ config NET_SCH_PLUG
> To compile this code as a module, choose M here: the
> module will be called sch_plug.
>
> +config NET_SCH_BROADCAST_STATS
> + bool "Enable Qdisc statistics broadcast"
> + ---help---
> +
> + Select this option if you want to enable qdisc stats broadcast
> through
> + netlink multicast. Broadcast happens on packet dequeue, limited
> to the
> + interval set by the <something> sysctl parameter.
> +
> comment "Classification"
>
QDISC_BROADCAST_DEQUEUE_STATS maybe?
Elsewhere you've defined the sysctl. Would argue for sysfs...
>
> config NET_CLS
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index eac7e0e..56490a1 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -28,6 +28,7 @@
> #include <net/sch_generic.h>
> #include <net/pkt_sched.h>
> #include <net/dst.h>
> +#include <net/netlink.h>
>
> /* Main transmission queue. */
>
> @@ -148,6 +149,73 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc
> *q,
> return ret;
> }
>
> +#ifdef CONFIG_NET_SCH_BROADCAST_STATS
> +int qdisc_broadcast_stats(struct Qdisc *q)
> +{
> + struct tcmsg *tcm;
> + struct nlmsghdr *nlh;
> + struct gnet_dump d;
> + struct sk_buff *skb;
> + unsigned char *b;
> +
> + if(!q->dev_queue || !q->dev_queue->dev)
> + return 0;
>
>
Is there a way to also merely detect if there is a listener present?
> + printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen
> %d\n",
> + qdisc_dev(q)->name,
> + qdisc_dev(q)->ifindex,
> + q->ops->id,
> + q->qstats.qlen);
>
Useful for debugging, but invasive for general use, far more heavyweight
than tossing
it into netlink.
> + skb = alloc_skb(NLMSG_SPACE(1024), GFP_KERNEL);
>
NLMSG_SPACE seems arbitrary, and perhaps this needs to be GFP_ATOMIC?
(But I am totally not an expert down here)
> + if(!skb)
> + return -ENOBUFS;
> + b = skb_tail_pointer(skb);
> +
> + nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm),
> NLM_F_MULTI);
> + if (!nlh)
> + goto out_nlmsg_trim;
> +
> + tcm = nlmsg_data(nlh);
> + tcm->tcm_family = AF_UNSPEC;
> + tcm->tcm__pad1 = 0;
> + tcm->tcm__pad2 = 0;
> + tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
> + tcm->tcm_parent = q->parent;
> + tcm->tcm_handle = q->handle;
> + tcm->tcm_info = atomic_read(&q->refcnt);
> +
> + if (nla_put_string(skb, TCA_KIND, q->ops->id))
> + goto nla_put_failure;
> +
> + if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
> + goto nla_put_failure;
> +
> + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
> + goto nla_put_failure;
> +
> + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
> + gnet_stats_copy_queue(&d, &q->qstats) < 0)
> + goto nla_put_failure;
> +
> + if (gnet_stats_finish_copy(&d) < 0)
> + goto nla_put_failure;
> +
> + nlh->nlmsg_len = skb_tail_pointer(skb) - b;
> +
> + rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0);
> +
> + return 0;
> +
> +nla_put_failure:
> +out_nlmsg_trim:
> + nlmsg_trim(skb, b);
> + return -1;
> +
> +}
> +#endif
> +
> +
> /*
> * NOTE: Called under qdisc_lock(q) with locally disabled BH.
> *
> @@ -173,6 +241,9 @@ static inline int qdisc_restart(struct Qdisc *q)
> struct net_device *dev;
> spinlock_t *root_lock;
> struct sk_buff *skb;
> +#ifdef CONFIG_NET_SCH_BROADCAST_STATS
> + qdisc_broadcast_stats(q);
> +#endif
>
Am thinking this could be a callback in the Qdisc_ops struct...
> /* Dequeue packet */
> skb = dequeue_skb(q);
> --
> 1.8.3
>
> _______________________________________________
> Bloat-devel mailing list
> Bloat-devel@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/bloat-devel
>
--
Dave Täht
Fixing bufferbloat with cerowrt:
http://www.teklibre.com/cerowrt/subscribe.html
[-- Attachment #2: Type: text/html, Size: 12299 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue.
2013-06-10 17:20 ` Dave Taht
@ 2013-06-11 7:15 ` Toke Høiland-Jørgensen
0 siblings, 0 replies; 7+ messages in thread
From: Toke Høiland-Jørgensen @ 2013-06-11 7:15 UTC (permalink / raw)
To: Dave Taht; +Cc: bloat-devel
[-- Attachment #1: Type: text/plain, Size: 4077 bytes --]
Dave Taht <dave.taht@gmail.com> writes:
> It is highly unlikely these patches would be accepted by upstream as
> allocating a skb in the fast path is probably a no-no. However at
> least part of them seem generically useful - instrumenting pfifo_fast
> in particular - and certainly helpful for debugging in general.
Right, well, I never envisioned it was something people would turn on if
they wanted to maximise performance.
> I had hoped it would be possible to do this without mucking with
> locking, tho. :(
Well I'm not really doing any locking as it is; the whole thing is
called from a point where the qdisc lock is already held. The only
"mucking" I'm doing is making the gnet_stats stuff optionally *not* lock
anything. Not so sure that is a good idea, but it seemed better than
unlocking everything before calling into gnet_stats*. Of course an
alternative would be to just not use that, but thought reusing the
existing code would be worthwhile.
> In my own case I'd like to have a qdisc_drop_with_stats
> (skb,sch,"fq_codel",reason) that could be compiled in or out easily,
> sending the packet headers of the dropped packet to a listener much as
> you do here. This would be useful for monitoring, have potential as a
> routing metric, and make it easier to analyze the behavior of various
> qdisc types, without having to poll and lock via tc.
Yeah, that should be quite doable starting from what I have, I think.
What would "reason" be?
> Email syntax for a RFC PATCH is RFC PATCH, not PATCH RFC
Noted.
> unnecessary change. try to touch as little as possible... There is an
> emacs mode for linux indentation, btw which helps avoid annoyingly
> messing up existing code.
Yeah, punted on cleaning up the whitespace changes; will go over those
again.
> It's not clear to me where this stuff enters userspace or not, and in
> general it's safer to append new enums/defines rather than insert
> them.
It's a number used for assigning the netlink multicast group; userspace
includes the header file and subscribes to the same group number. Put it
there to group related stuff together...
> So this acted up on a SMP kernel?
Not really; as noted above, gen_stats by default locks the qdisc before
accessing it, but since I'm now calling it from somewhere where it's
already locked, I had to get it to not do that.
I'm not entirely sure it's exactly the same lock, but think so...
> QDISC_BROADCAST_DEQUEUE_STATS maybe?
Oh yeah, completely forgot to fix up the compile option after adding the
sysctl...
> Elsewhere you've defined the sysctl. Would argue for sysfs...
Eh?
> Is there a way to also merely detect if there is a listener present?
Dunno. I assume that since the netlink stuff has multicast semantics it
would just do something reasonable (such as discard the packet) if
there's no listeners. I can see how detecting it would cut down on some
overhead though; I'll see if I can find something. My alternative was to
abort if the sysctl parameter is set to some magic value (0? negative
number?).
> Useful for debugging, but invasive for general use, far more
> heavyweight than tossing it into netlink.
Yeah, that wasn't supposed to stay in there.
> NLMSG_SPACE seems arbitrary, and perhaps this needs to be GFP_ATOMIC?
> (But I am totally not an expert down here)
Right, will investigate.
> Am thinking this could be a callback in the Qdisc_ops struct...
Well right now it's the other way around; this is called for every
qdisc, and dumps the general qdisc stats along with anything that the
qdisc itself defines (through the dump_stats callback in qdisc_ops).
It's the same way the stuff that tc talks to works, and makes explicit
support from each qdisc optional.
Also, another unresolved issue with this patch: if there's more than one
qdisc on an interface (e.g. a tree of HTBs), it will probably be called
for each one dequeueing a packet; not sure what to do about that; maybe
just limit it to the root qdisc?
-Toke
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 489 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC 2/3] Add qdisc_stats_broadcast_interval sysctl parameter, and use it to limit stats broadcast interval.
2013-06-10 12:39 [PATCH RFC 0/3] Broadcasting qdisc statistics via netlink Toke Høiland-Jørgensen
2013-06-10 12:39 ` [PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue Toke Høiland-Jørgensen
@ 2013-06-10 12:39 ` Toke Høiland-Jørgensen
2013-06-10 12:39 ` [PATCH RFC 3/3] Make pfifo_fast track qlen stats Toke Høiland-Jørgensen
2013-06-17 19:07 ` [RFC PATCH v2] Broadcast qdisc statistics via netlink on packet dequeue Toke Høiland-Jørgensen
3 siblings, 0 replies; 7+ messages in thread
From: Toke Høiland-Jørgensen @ 2013-06-10 12:39 UTC (permalink / raw)
To: bloat-devel
---
include/net/netns/ipv4.h | 4 ++++
include/net/sch_generic.h | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 13 +++++++++++++
net/sched/sch_api.c | 7 +++++++
net/sched/sch_generic.c | 18 ++++++++++++++++++
5 files changed, 46 insertions(+)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2ba9de8..ff69564 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -64,6 +64,10 @@ struct netns_ipv4 {
int sysctl_tcp_ecn;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ int sysctl_qdisc_stats_broadcast_interval;
+#endif
+
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f10818f..92f26cf 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -78,6 +78,10 @@ struct Qdisc {
struct netdev_queue *dev_queue;
struct Qdisc *next_sched;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ u64 last_stats_broadcast;
+#endif
+
struct sk_buff *gso_skb;
/*
* For performance sake on SMP, we put highly modified fields at the end
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29..896429c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -851,6 +851,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_tcp_mem,
},
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ {
+ .procname = "qdisc_stats_broadcast_interval",
+ .data = &init_net.ipv4.sysctl_qdisc_stats_broadcast_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
{ }
};
@@ -880,6 +889,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
&net->ipv4.sysctl_ping_group_range;
table[7].data =
&net->ipv4.sysctl_tcp_ecn;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ table[9].data =
+ &net->ipv4.sysctl_qdisc_stats_broadcast_interval;
+#endif
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c297e2a..154b316 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1772,6 +1772,10 @@ static int __net_init psched_net_init(struct net *net)
if (e == NULL)
return -ENOMEM;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000;
+#endif
+
return 0;
}
@@ -1782,6 +1786,9 @@ static void __net_exit psched_net_exit(struct net *net)
#else
static int __net_init psched_net_init(struct net *net)
{
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000;
+#endif
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 56490a1..1b4e16d 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -150,6 +150,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
}
#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+static inline u64 qdisc_stats_time(void)
+{
+ u64 ns = ktime_to_ns(ktime_get());
+ do_div(ns, NSEC_PER_USEC);
+ return ns;
+}
+
int qdisc_broadcast_stats(struct Qdisc *q)
{
struct tcmsg *tcm;
@@ -157,10 +164,16 @@ int qdisc_broadcast_stats(struct Qdisc *q)
struct gnet_dump d;
struct sk_buff *skb;
unsigned char *b;
+ u64 time;
if(!q->dev_queue || !q->dev_queue->dev)
return 0;
+ time = qdisc_stats_time();
+ if(time < q->last_stats_broadcast +
+ dev_net(qdisc_dev(q))->ipv4.sysctl_qdisc_stats_broadcast_interval)
+ return 0;
+
printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen %d\n",
qdisc_dev(q)->name,
qdisc_dev(q)->ifindex,
@@ -205,6 +218,8 @@ int qdisc_broadcast_stats(struct Qdisc *q)
rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0);
+ q->last_stats_broadcast = time;
+
return 0;
nla_put_failure:
@@ -636,6 +651,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ sch->last_stats_broadcast = qdisc_stats_time();
+#endif
dev_hold(dev);
atomic_set(&sch->refcnt, 1);
--
1.8.3
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC 3/3] Make pfifo_fast track qlen stats.
2013-06-10 12:39 [PATCH RFC 0/3] Broadcasting qdisc statistics via netlink Toke Høiland-Jørgensen
2013-06-10 12:39 ` [PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue Toke Høiland-Jørgensen
2013-06-10 12:39 ` [PATCH RFC 2/3] Add qdisc_stats_broadcast_interval sysctl parameter, and use it to limit stats broadcast interval Toke Høiland-Jørgensen
@ 2013-06-10 12:39 ` Toke Høiland-Jørgensen
2013-06-17 19:07 ` [RFC PATCH v2] Broadcast qdisc statistics via netlink on packet dequeue Toke Høiland-Jørgensen
3 siblings, 0 replies; 7+ messages in thread
From: Toke Høiland-Jørgensen @ 2013-06-10 12:39 UTC (permalink / raw)
To: bloat-devel
---
net/sched/sch_generic.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1b4e16d..0ca0d06 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -522,6 +522,8 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
+ qdisc->qstats.qlen++;
+ qdisc->qstats.backlog += qdisc_pkt_len(skb);
return __qdisc_enqueue_tail(skb, qdisc, list);
}
@@ -538,6 +540,8 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
qdisc->q.qlen--;
+ qdisc->qstats.qlen--;
+ qdisc->qstats.backlog -= qdisc_pkt_len(skb);
if (skb_queue_empty(list))
priv->bitmap &= ~(1 << band);
--
1.8.3
^ permalink raw reply [flat|nested] 7+ messages in thread
* [RFC PATCH v2] Broadcast qdisc statistics via netlink on packet dequeue.
2013-06-10 12:39 [PATCH RFC 0/3] Broadcasting qdisc statistics via netlink Toke Høiland-Jørgensen
` (2 preceding siblings ...)
2013-06-10 12:39 ` [PATCH RFC 3/3] Make pfifo_fast track qlen stats Toke Høiland-Jørgensen
@ 2013-06-17 19:07 ` Toke Høiland-Jørgensen
3 siblings, 0 replies; 7+ messages in thread
From: Toke Høiland-Jørgensen @ 2013-06-17 19:07 UTC (permalink / raw)
To: bloat-devel
[-- Attachment #1: Type: text/plain, Size: 9312 bytes --]
This is the second attempt at a patch to broadcast qdisc statistics via
netlink on packet dequeue.
Changes since the first version:
- Cleaned up the patch itself.
- New enum entries in rtnetlink.h are now at the bottom.
- Clean up buffers properly on failure.
- Check if any clients are listening to the netlink group and abort if
none are.
- Use the non-blocking netlink API (I think...)
- Call the main stats broadcast function from sch_direct_xmit(). Finally
makes things work the way they're supposed to (i.e. it gets called on
every packet dequeue).
Client implementation is at https://github.com/tohojo/qstatsc
Preliminary testing on a virtual machine between host and guests gives
similar network performance and similar CPU load with and without the
patch. Still, I wouldn't recommend turning it on at 10GigE speeds.
Comments greatly appreciated.
-Toke
----- >8 ----------- >8 -----
Add qdisc_stats_broadcast_interval sysctl parameter, and use it to limit
stats broadcast interval.
---
include/net/netns/ipv4.h | 4 ++
include/net/sch_generic.h | 4 ++
include/uapi/linux/rtnetlink.h | 5 +++
net/core/gen_stats.c | 6 ++-
net/ipv4/sysctl_net_ipv4.c | 13 ++++++
net/sched/Kconfig | 13 ++++++
net/sched/sch_api.c | 7 ++++
net/sched/sch_generic.c | 89 ++++++++++++++++++++++++++++++++++++++++++
8 files changed, 139 insertions(+), 2 deletions(-)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2ba9de8..ff69564 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -64,6 +64,10 @@ struct netns_ipv4 {
int sysctl_tcp_ecn;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ int sysctl_qdisc_stats_broadcast_interval;
+#endif
+
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f10818f..92f26cf 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -78,6 +78,10 @@ struct Qdisc {
struct netdev_queue *dev_queue;
struct Qdisc *next_sched;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ u64 last_stats_broadcast;
+#endif
+
struct sk_buff *gso_skb;
/*
* For performance sake on SMP, we put highly modified fields at the end
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index da0a60e..edee5a9 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -132,6 +132,9 @@ enum {
RTM_GETMDB = 86,
#define RTM_GETMDB RTM_GETMDB
+ RTM_QDISC_STATS,
+#define RTM_QDISC_STATS RTM_QDISC_STATS
+
__RTM_MAX,
#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
};
@@ -613,6 +616,8 @@ enum rtnetlink_groups {
#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
RTNLGRP_MDB,
#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_TC_STATS,
+#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS
__RTNLGRP_MAX
};
#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ddedf21..68df614 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
{
memset(d, 0, sizeof(*d));
- spin_lock_bh(lock);
+ if(lock)
+ spin_lock_bh(lock);
d->lock = lock;
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb);
@@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
return -1;
}
- spin_unlock_bh(d->lock);
+ if(d->lock)
+ spin_unlock_bh(d->lock);
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29..896429c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -851,6 +851,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_tcp_mem,
},
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ {
+ .procname = "qdisc_stats_broadcast_interval",
+ .data = &init_net.ipv4.sysctl_qdisc_stats_broadcast_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
{ }
};
@@ -880,6 +889,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
&net->ipv4.sysctl_ping_group_range;
table[7].data =
&net->ipv4.sysctl_tcp_ecn;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ table[9].data =
+ &net->ipv4.sysctl_qdisc_stats_broadcast_interval;
+#endif
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01a..03958f0 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -308,6 +308,19 @@ config NET_SCH_PLUG
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_BROADCAST_STATS
+ bool "Enable Qdisc statistics broadcast"
+ ---help---
+
+ Select this option if you want to enable qdisc stats broadcast through
+ netlink multicast. Broadcast happens on packet dequeue, limited to the
+ interval set by the qdisc_stats_broadcast_interval sysctl parameter.
+
+ The statistics will be broadcast to the RTNLGRP_TC_STATS multicast group
+ and the message type is RTM_QDISC_STATS.
+
+ See https://github.com/tohojo/qstatsc for a sample client implementation.
+
comment "Classification"
config NET_CLS
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c297e2a..154b316 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1772,6 +1772,10 @@ static int __net_init psched_net_init(struct net *net)
if (e == NULL)
return -ENOMEM;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000;
+#endif
+
return 0;
}
@@ -1782,6 +1786,9 @@ static void __net_exit psched_net_exit(struct net *net)
#else
static int __net_init psched_net_init(struct net *net)
{
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval = 200000;
+#endif
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index eac7e0e..b220705 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <net/netlink.h>
/* Main transmission queue. */
@@ -101,6 +102,88 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
return ret;
}
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+static inline u64 qdisc_stats_time(void)
+{
+ u64 ns = ktime_to_ns(ktime_get());
+ do_div(ns, NSEC_PER_USEC);
+ return ns;
+}
+
+static int qdisc_broadcast_stats(struct Qdisc *q)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ struct gnet_dump d;
+ struct sk_buff *skb;
+ struct net *net;
+ unsigned char *b;
+ u64 time;
+
+ if(!q->dev_queue || !q->dev_queue->dev)
+ return 0;
+
+ net = dev_net(qdisc_dev(q));
+
+ if(!netlink_has_listeners(net->rtnl, RTNLGRP_TC_STATS))
+ return 0;
+
+ time = qdisc_stats_time();
+ if(time < q->last_stats_broadcast +
+ net->ipv4.sysctl_qdisc_stats_broadcast_interval)
+ return 0;
+
+ skb = alloc_skb(NLMSG_SPACE(1024), GFP_ATOMIC);
+ if(!skb)
+ return -ENOBUFS;
+ b = skb_tail_pointer(skb);
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm), NLM_F_MULTI);
+ if (!nlh)
+ goto out_free;
+
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = q->parent;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = atomic_read(&q->refcnt);
+
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
+ goto nla_put_failure;
+
+ if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+ goto nla_put_failure;
+
+ q->qstats.qlen = q->q.qlen;
+ if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+ gnet_stats_copy_queue(&d, &q->qstats) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
+ nlmsg_notify(net->rtnl, skb, 0, RTNLGRP_TC_STATS, 0, 0);
+
+ q->last_stats_broadcast = time;
+
+ return 0;
+
+nla_put_failure:
+out_free:
+ kfree_skb(skb);
+ return -1;
+
+}
+#endif
+
/*
* Transmit one skb, and handle the return status as required. Holding the
* __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
@@ -115,6 +198,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
spinlock_t *root_lock)
{
int ret = NETDEV_TX_BUSY;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ qdisc_broadcast_stats(q);
+#endif
/* And release qdisc */
spin_unlock(root_lock);
@@ -565,6 +651,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
+#ifdef CONFIG_NET_SCH_BROADCAST_STATS
+ sch->last_stats_broadcast = qdisc_stats_time();
+#endif
dev_hold(dev);
atomic_set(&sch->refcnt, 1);
--
1.8.3.1
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 489 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread