[PATCH RFC 1/3] Broadcast qdisc statistics via netlink on packet dequeue.
Dave Taht
dave.taht at gmail.com
Mon Jun 10 10:20:57 PDT 2013
It is highly unlikely these patches would be accepted by upstream as
allocating a skb in the fast path is probably a no-no. However at least
part of them seem generically useful - instrumenting pfifo_fast in
particular - and certainly helpful for debugging in general.
I had hoped it would be possible to do this without mucking with locking,
tho. :(
In my own case I'd like to have a
qdisc_drop_with_stats(skb,sch,"fq_codel",reason) that could be compiled in
or out easily, sending the packet headers of the dropped packet to a
listener much as you do here. This would be useful for monitoring, have
potential as a routing metric, and make it easier to analyze the behavior
of various qdisc types, without having to poll and lock via tc.
Also as a drop takes place only when the card is producing delays, it would
be possible to get a bandwidth estimate over smaller intervals from it
easily, and be less invasive overall...
So thanks for digging into netlink so much and having a sample program. I
almost get it now...
A couple more comments below:
On Mon, Jun 10, 2013 at 5:39 AM, Toke Høiland-Jørgensen <toke at toke.dk>wrote:
Email syntax for a RFC PATCH is RFC PATCH, not PATCH RFC
---
> include/uapi/linux/rtnetlink.h | 15 ++++++---
> net/core/gen_stats.c | 6 ++--
> net/sched/Kconfig | 8 +++++
> net/sched/sch_generic.c | 71
> ++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 93 insertions(+), 7 deletions(-)
>
> diff --git a/include/uapi/linux/rtnetlink.h
> b/include/uapi/linux/rtnetlink.h
> index 7a2144e..e9d91c5 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -68,6 +68,9 @@ enum {
> RTM_GETQDISC,
> #define RTM_GETQDISC RTM_GETQDISC
>
> + RTM_QDISC_STATS,
> +#define RTM_QDISC_STATS RTM_QDISC_STATS
> +
>
> RTM_NEWTCLASS = 40,
> #define RTM_NEWTCLASS RTM_NEWTCLASS
> RTM_DELTCLASS,
> @@ -140,7 +143,7 @@ enum {
> #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2)
> #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2)
>
> -/*
> +/*
> Generic structure for encapsulation of optional route information.
> It is reminiscent of sockaddr, but with sa_family replaced
> with attribute type.
> @@ -180,7 +183,7 @@ struct rtmsg {
>
> unsigned char rtm_table; /* Routing table id */
> unsigned char rtm_protocol; /* Routing protocol; see
> below */
> - unsigned char rtm_scope; /* See below */
> + unsigned char rtm_scope; /* See below */
> unsigned char rtm_type; /* See below */
>
>
unnecessary change.
> unsigned rtm_flags;
> @@ -450,7 +453,7 @@ struct ifinfomsg {
> };
>
> /********************************************************************
> - * prefix information
> + * prefix information
> ****/
>
>
unnecessary change. try to touch as little as possible... There is an emacs
mode for linux indentation, btw
which helps avoid annoyingly messing up existing code.
struct prefixmsg {
> @@ -464,7 +467,7 @@ struct prefixmsg {
> unsigned char prefix_pad3;
> };
>
> -enum
> +enum
> {
> PREFIX_UNSPEC,
> PREFIX_ADDRESS,
> @@ -571,6 +574,8 @@ enum rtnetlink_groups {
> #define RTNLGRP_NEIGH RTNLGRP_NEIGH
> RTNLGRP_TC,
> #define RTNLGRP_TC RTNLGRP_TC
> + RTNLGRP_TC_STATS,
> +#define RTNLGRP_TC_STATS RTNLGRP_TC_STATS
> RTNLGRP_IPV4_IFADDR,
> #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
> RTNLGRP_IPV4_MROUTE,
>
It's not clear to me where this stuff enters userspace or not, and in
general it's
safer to append new enums/defines rather than insert them.
> @@ -625,7 +630,7 @@ struct tcamsg {
> };
> #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) +
> NLMSG_ALIGN(sizeof(struct tcamsg))))
> #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
> -#define TCA_ACT_TAB 1 /* attr type must be >=1 */
> +#define TCA_ACT_TAB 1 /* attr type must be >=1 */
> #define TCAA_MAX 1
>
Uneeeded change...
>
> /* New extended info filters for IFLA_EXT_MASK */
> diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
> index ddedf21..68df614 100644
> --- a/net/core/gen_stats.c
> +++ b/net/core/gen_stats.c
> @@ -61,7 +61,8 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int
> type, int tc_stats_type,
> {
> memset(d, 0, sizeof(*d));
>
> - spin_lock_bh(lock);
> + if(lock)
> + spin_lock_bh(lock);
>
So this acted up on a SMP kernel?
> d->lock = lock;
> if (type)
> d->tail = (struct nlattr *)skb_tail_pointer(skb);
> @@ -245,7 +246,8 @@ gnet_stats_finish_copy(struct gnet_dump *d)
> return -1;
> }
>
> - spin_unlock_bh(d->lock);
> + if(d->lock)
> + spin_unlock_bh(d->lock);
> return 0;
> }
> EXPORT_SYMBOL(gnet_stats_finish_copy);
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 235e01a..4902812 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -308,6 +308,14 @@ config NET_SCH_PLUG
> To compile this code as a module, choose M here: the
> module will be called sch_plug.
>
> +config NET_SCH_BROADCAST_STATS
> + bool "Enable Qdisc statistics broadcast"
> + ---help---
> +
> + Select this option if you want to enable qdisc stats broadcast
> through
> + netlink multicast. Broadcast happens on packet dequeue, limited
> to the
> + interval set by the <something> sysctl parameter.
> +
> comment "Classification"
>
QDISC_BROADCAST_DEQUEUE_STATS maybe?
Elsewhere you've defined the sysctl. Would argue for sysfs...
>
> config NET_CLS
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index eac7e0e..56490a1 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -28,6 +28,7 @@
> #include <net/sch_generic.h>
> #include <net/pkt_sched.h>
> #include <net/dst.h>
> +#include <net/netlink.h>
>
> /* Main transmission queue. */
>
> @@ -148,6 +149,73 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc
> *q,
> return ret;
> }
>
> +#ifdef CONFIG_NET_SCH_BROADCAST_STATS
> +int qdisc_broadcast_stats(struct Qdisc *q)
> +{
> + struct tcmsg *tcm;
> + struct nlmsghdr *nlh;
> + struct gnet_dump d;
> + struct sk_buff *skb;
> + unsigned char *b;
> +
> + if(!q->dev_queue || !q->dev_queue->dev)
> + return 0;
>
>
Is there a way to also merely detect if there is a listener present?
> + printk(KERN_DEBUG "Packet dequeue, ifname %s(%d), qdisc %s, qlen
> %d\n",
> + qdisc_dev(q)->name,
> + qdisc_dev(q)->ifindex,
> + q->ops->id,
> + q->qstats.qlen);
>
Useful for debugging, but invasive for general use, far more heavyweight
than tossing
it into netlink.
> + skb = alloc_skb(NLMSG_SPACE(1024), GFP_KERNEL);
>
NLMSG_SPACE seems arbitrary, and perhaps this needs to be GFP_ATOMIC?
(But I am totally not an expert down here)
> + if(!skb)
> + return -ENOBUFS;
> + b = skb_tail_pointer(skb);
> +
> + nlh = nlmsg_put(skb, 0, 0, RTM_QDISC_STATS, sizeof(*tcm),
> NLM_F_MULTI);
> + if (!nlh)
> + goto out_nlmsg_trim;
> +
> + tcm = nlmsg_data(nlh);
> + tcm->tcm_family = AF_UNSPEC;
> + tcm->tcm__pad1 = 0;
> + tcm->tcm__pad2 = 0;
> + tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
> + tcm->tcm_parent = q->parent;
> + tcm->tcm_handle = q->handle;
> + tcm->tcm_info = atomic_read(&q->refcnt);
> +
> + if (nla_put_string(skb, TCA_KIND, q->ops->id))
> + goto nla_put_failure;
> +
> + if (gnet_stats_start_copy(skb, TCA_STATS2, NULL, &d) < 0)
> + goto nla_put_failure;
> +
> + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
> + goto nla_put_failure;
> +
> + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
> + gnet_stats_copy_queue(&d, &q->qstats) < 0)
> + goto nla_put_failure;
> +
> + if (gnet_stats_finish_copy(&d) < 0)
> + goto nla_put_failure;
> +
> + nlh->nlmsg_len = skb_tail_pointer(skb) - b;
> +
> + rtnetlink_send(skb, qdisc_dev(q)->nd_net, 0, RTNLGRP_TC_STATS, 0);
> +
> + return 0;
> +
> +nla_put_failure:
> +out_nlmsg_trim:
> + nlmsg_trim(skb, b);
> + return -1;
> +
> +}
> +#endif
> +
> +
> /*
> * NOTE: Called under qdisc_lock(q) with locally disabled BH.
> *
> @@ -173,6 +241,9 @@ static inline int qdisc_restart(struct Qdisc *q)
> struct net_device *dev;
> spinlock_t *root_lock;
> struct sk_buff *skb;
> +#ifdef CONFIG_NET_SCH_BROADCAST_STATS
> + qdisc_broadcast_stats(q);
> +#endif
>
Am thinking this could be a callback in the Qdisc_ops struct...
> /* Dequeue packet */
> skb = dequeue_skb(q);
> --
> 1.8.3
>
> _______________________________________________
> Bloat-devel mailing list
> Bloat-devel at lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/bloat-devel
>
--
Dave Täht
Fixing bufferbloat with cerowrt:
http://www.teklibre.com/cerowrt/subscribe.html
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.bufferbloat.net/pipermail/bloat-devel/attachments/20130610/b7d9db28/attachment-0001.html>
More information about the Bloat-devel
mailing list