Fwd: [net-next 11/13] igb: Make Tx budget for NAPI user adjustable

Dave Taht dave.taht at gmail.com
Sat Sep 17 13:10:44 EDT 2011


I am encouraged... more discussion on the netdev list...
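
The heart of the patch is that igb_clean_tx_irq() now reclaims
completed Tx packets under an explicit per-poll budget
(q_vector->tx_work_limit, default 128) instead of scanning up to the
full ring. A condensed sketch of that pattern, using hypothetical
stand-ins for the ring helpers rather than the actual driver code:

#include <stdbool.h>

/* Hypothetical stand-ins for the driver's ring state; illustration only. */
struct tx_ring;
extern bool tx_packet_completed(struct tx_ring *ring); /* EOP desc has DD set */
extern void reclaim_tx_packet(struct tx_ring *ring);   /* unmap, free, count */

/* Reclaim at most 'budget' completed packets per NAPI poll. */
static bool clean_tx_bounded(struct tx_ring *ring, unsigned int budget)
{
        for (; budget; budget--) {
                /* stop early if hardware has not finished the next packet */
                if (!tx_packet_completed(ring))
                        break;
                reclaim_tx_packet(ring);
        }

        /* Leftover budget means we ran out of completed work, not out of
         * budget, so the poll routine may treat Tx cleanup as complete. */
        return !!budget;
}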

---------- Forwarded message ----------
From: Jeff Kirsher <jeffrey.t.kirsher at intel.com>
Date: Sat, Sep 17, 2011 at 1:04 AM
Subject: [net-next 11/13] igb: Make Tx budget for NAPI user adjustable
To: davem at davemloft.net
Cc: Alexander Duyck <alexander.h.duyck at intel.com>, netdev at vger.kernel.org,
gospo at redhat.com, Jeff Kirsher <jeffrey.t.kirsher at intel.com>


From: Alexander Duyck <alexander.h.duyck at intel.com>

This change makes the NAPI budget limit for transmit adjustable.
Doing so makes it possible to tune the value for optimal performance
with applications such as routing.

Signed-off-by: Alexander Duyck <alexander.h.duyck at intel.com>
Tested-by: Aaron Brown <aaron.f.brown at intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher at intel.com>
---
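
The new limit is exposed through the existing ethtool coalesce
interface (tx_max_coalesced_frames_irq), so it can be tuned from
userspace with no new ioctl, e.g. "ethtool -C eth0 tx-frames-irq 64".
A minimal sketch of making the same setting programmatically; the
interface name "eth0" and the value 64 are placeholders:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
        struct ethtool_coalesce ecoal = { .cmd = ETHTOOL_GCOALESCE };
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        ifr.ifr_data = (void *)&ecoal;

        /* read current settings so only the Tx work limit changes */
        if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
                perror("ETHTOOL_GCOALESCE");
                return 1;
        }

        /* igb_set_coalesce() copies this into adapter->tx_work_limit */
        ecoal.cmd = ETHTOOL_SCOALESCE;
        ecoal.tx_max_coalesced_frames_irq = 64;

        if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
                perror("ETHTOOL_SCOALESCE");
                return 1;
        }

        printf("tx work limit set to %u\n", ecoal.tx_max_coalesced_frames_irq);
        close(fd);
        return 0;
}
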
 drivers/net/ethernet/intel/igb/igb.h         |    3 +
 drivers/net/ethernet/intel/igb/igb_ethtool.c |    6 +
 drivers/net/ethernet/intel/igb/igb_main.c    |  136 ++++++++++++++++----------
 3 files changed, 92 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index b725937..cfa590c8 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -47,6 +47,7 @@ struct igb_adapter;

 /* TX/RX descriptor defines */
 #define IGB_DEFAULT_TXD                  256
+#define IXGBE_DEFAULT_TX_WORK           128
 #define IGB_MIN_TXD                       80
 #define IGB_MAX_TXD                     4096

@@ -177,6 +178,7 @@ struct igb_q_vector {

       u32 eims_value;
       u16 cpu;
+       u16 tx_work_limit;

       u16 itr_val;
       u8 set_itr;
@@ -266,6 +268,7 @@ struct igb_adapter {
       u16 rx_itr;

       /* TX */
+       u16 tx_work_limit;
       u32 tx_timeout_count;
       int num_tx_queues;
       struct igb_ring *tx_ring[16];
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index f231d82..f6da820 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1989,6 +1989,9 @@ static int igb_set_coalesce(struct net_device *netdev,
       if ((adapter->flags & IGB_FLAG_QUEUE_PAIRS) && ec->tx_coalesce_usecs)
               return -EINVAL;

+       if (ec->tx_max_coalesced_frames_irq)
+               adapter->tx_work_limit = ec->tx_max_coalesced_frames_irq;
+
       /* If ITR is disabled, disable DMAC */
       if (ec->rx_coalesce_usecs == 0) {
               if (adapter->flags & IGB_FLAG_DMAC)
@@ -2011,6 +2014,7 @@ static int igb_set_coalesce(struct net_device *netdev,

       for (i = 0; i < adapter->num_q_vectors; i++) {
               struct igb_q_vector *q_vector = adapter->q_vector[i];
+               q_vector->tx_work_limit = adapter->tx_work_limit;
               if (q_vector->rx_ring)
                       q_vector->itr_val = adapter->rx_itr_setting;
               else
@@ -2033,6 +2037,8 @@ static int igb_get_coalesce(struct net_device *netdev,
       else
               ec->rx_coalesce_usecs = adapter->rx_itr_setting >> 2;

+       ec->tx_max_coalesced_frames_irq = adapter->tx_work_limit;
+
       if (!(adapter->flags & IGB_FLAG_QUEUE_PAIRS)) {
               if (adapter->tx_itr_setting <= 3)
                       ec->tx_coalesce_usecs = adapter->tx_itr_setting;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 7ad25e8..989e774 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -136,8 +136,8 @@ static irqreturn_t igb_msix_ring(int irq, void *);
 static void igb_update_dca(struct igb_q_vector *);
 static void igb_setup_dca(struct igb_adapter *);
 #endif /* CONFIG_IGB_DCA */
-static bool igb_clean_tx_irq(struct igb_q_vector *);
 static int igb_poll(struct napi_struct *, int);
+static bool igb_clean_tx_irq(struct igb_q_vector *);
 static bool igb_clean_rx_irq(struct igb_q_vector *, int);
 static int igb_ioctl(struct net_device *, struct ifreq *, int cmd);
 static void igb_tx_timeout(struct net_device *);
@@ -1120,6 +1120,7 @@ static void igb_map_tx_ring_to_vector(struct igb_adapter *adapter,
       q_vector->tx_ring = adapter->tx_ring[ring_idx];
       q_vector->tx_ring->q_vector = q_vector;
       q_vector->itr_val = adapter->tx_itr_setting;
+       q_vector->tx_work_limit = adapter->tx_work_limit;
       if (q_vector->itr_val && q_vector->itr_val <= 3)
               q_vector->itr_val = IGB_START_ITR;
 }
@@ -2388,11 +2389,17 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)

       pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word);

+       /* set default ring sizes */
       adapter->tx_ring_count = IGB_DEFAULT_TXD;
       adapter->rx_ring_count = IGB_DEFAULT_RXD;
+
+       /* set default ITR values */
       adapter->rx_itr_setting = IGB_DEFAULT_ITR;
       adapter->tx_itr_setting = IGB_DEFAULT_ITR;

+       /* set default work limits */
+       adapter->tx_work_limit = IXGBE_DEFAULT_TX_WORK;
+
       adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN +
                                 VLAN_HLEN;
       adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
@@ -5496,7 +5503,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
               igb_update_dca(q_vector);
 #endif
       if (q_vector->tx_ring)
-               clean_complete = !!igb_clean_tx_irq(q_vector);
+               clean_complete = igb_clean_tx_irq(q_vector);

       if (q_vector->rx_ring)
               clean_complete &= igb_clean_rx_irq(q_vector, budget);
@@ -5578,64 +5585,69 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 {
       struct igb_adapter *adapter = q_vector->adapter;
       struct igb_ring *tx_ring = q_vector->tx_ring;
-       struct net_device *netdev = tx_ring->netdev;
-       struct e1000_hw *hw = &adapter->hw;
-       struct igb_buffer *buffer_info;
-       union e1000_adv_tx_desc *tx_desc, *eop_desc;
+       struct igb_buffer *tx_buffer;
+       union e1000_adv_tx_desc *tx_desc;
       unsigned int total_bytes = 0, total_packets = 0;
-       unsigned int i, eop, count = 0;
-       bool cleaned = false;
+       unsigned int budget = q_vector->tx_work_limit;
+       u16 i = tx_ring->next_to_clean;

-       i = tx_ring->next_to_clean;
-       eop = tx_ring->buffer_info[i].next_to_watch;
-       eop_desc = IGB_TX_DESC(tx_ring, eop);
+       if (test_bit(__IGB_DOWN, &adapter->state))
+               return true;

-       while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&
-              (count < tx_ring->count)) {
-               rmb();  /* read buffer_info after eop_desc status */
-               for (cleaned = false; !cleaned; count++) {
-                       tx_desc = IGB_TX_DESC(tx_ring, i);
-                       buffer_info = &tx_ring->buffer_info[i];
-                       cleaned = (i == eop);
+       tx_buffer = &tx_ring->buffer_info[i];
+       tx_desc = IGB_TX_DESC(tx_ring, i);

-                       if (buffer_info->skb) {
-                               total_bytes += buffer_info->bytecount;
-                               /* gso_segs is currently only valid for tcp */
-                               total_packets += buffer_info->gso_segs;
-                               igb_tx_hwtstamp(q_vector, buffer_info);
-                       }
+       for (; budget; budget--) {
+               u16 eop = tx_buffer->next_to_watch;
+               union e1000_adv_tx_desc *eop_desc;
+
+               eop_desc = IGB_TX_DESC(tx_ring, eop);
+
+               /* if DD is not set pending work has not been completed */
+               if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))
+                       break;
+
+               /* prevent any other reads prior to eop_desc being verified */
+               rmb();

-                       igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
+               do {
                       tx_desc->wb.status = 0;
+                       if (likely(tx_desc == eop_desc)) {
+                               eop_desc = NULL;
+
+                               total_bytes += tx_buffer->bytecount;
+                               total_packets += tx_buffer->gso_segs;
+                               igb_tx_hwtstamp(q_vector, tx_buffer);
+                       }
+
+                       igb_unmap_and_free_tx_resource(tx_ring, tx_buffer);

+                       tx_buffer++;
+                       tx_desc++;
                       i++;
-                       if (i == tx_ring->count)
+                       if (unlikely(i == tx_ring->count)) {
                               i = 0;
-               }
-               eop = tx_ring->buffer_info[i].next_to_watch;
-               eop_desc = IGB_TX_DESC(tx_ring, eop);
+                               tx_buffer = tx_ring->buffer_info;
+                               tx_desc = IGB_TX_DESC(tx_ring, 0);
+                       }
+               } while (eop_desc);
       }

       tx_ring->next_to_clean = i;
+       u64_stats_update_begin(&tx_ring->tx_syncp);
+       tx_ring->tx_stats.bytes += total_bytes;
+       tx_ring->tx_stats.packets += total_packets;
+       u64_stats_update_end(&tx_ring->tx_syncp);
+       tx_ring->total_bytes += total_bytes;
+       tx_ring->total_packets += total_packets;

-       if (unlikely(count &&
-                    netif_carrier_ok(netdev) &&
-                    igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {
-               /* Make sure that anybody stopping the queue after this
-                * sees the new next_to_clean.
-                */
-               smp_mb();
-               if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
-                   !(test_bit(__IGB_DOWN, &adapter->state))) {
-                       netif_wake_subqueue(netdev, tx_ring->queue_index);
+       if (tx_ring->detect_tx_hung) {
+               struct e1000_hw *hw = &adapter->hw;
+               u16 eop = tx_ring->buffer_info[i].next_to_watch;
+               union e1000_adv_tx_desc *eop_desc;

-                       u64_stats_update_begin(&tx_ring->tx_syncp);
-                       tx_ring->tx_stats.restart_queue++;
-                       u64_stats_update_end(&tx_ring->tx_syncp);
-               }
-       }
+               eop_desc = IGB_TX_DESC(tx_ring, eop);

-       if (tx_ring->detect_tx_hung) {
               /* Detect a transmit hang in hardware, this serializes the
                * check with the clearing of time_stamp and movement of i */
               tx_ring->detect_tx_hung = false;
@@ -5666,16 +5678,34 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
                               eop,
                               jiffies,
                               eop_desc->wb.status);
-                       netif_stop_subqueue(netdev, tx_ring->queue_index);
+                       netif_stop_subqueue(tx_ring->netdev,
+                                           tx_ring->queue_index);
+
+               /* we are about to reset, no point in enabling stuff */
+                       return true;
               }
       }
-       tx_ring->total_bytes += total_bytes;
-       tx_ring->total_packets += total_packets;
-       u64_stats_update_begin(&tx_ring->tx_syncp);
-       tx_ring->tx_stats.bytes += total_bytes;
-       tx_ring->tx_stats.packets += total_packets;
-       u64_stats_update_end(&tx_ring->tx_syncp);
-       return count < tx_ring->count;
+
+       if (unlikely(total_packets &&
+                    netif_carrier_ok(tx_ring->netdev) &&
+                    igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {
+               /* Make sure that anybody stopping the queue after this
+                * sees the new next_to_clean.
+                */
+               smp_mb();
+               if (__netif_subqueue_stopped(tx_ring->netdev,
+                                            tx_ring->queue_index) &&
+                   !(test_bit(__IGB_DOWN, &adapter->state))) {
+                       netif_wake_subqueue(tx_ring->netdev,
+                                           tx_ring->queue_index);
+
+                       u64_stats_update_begin(&tx_ring->tx_syncp);
+                       tx_ring->tx_stats.restart_queue++;
+                       u64_stats_update_end(&tx_ring->tx_syncp);
+               }
+       }
+
+       return !!budget;
 }

 static inline void igb_rx_checksum(struct igb_ring *ring,
--
1.7.6


-- 
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
http://the-edge.blogspot.com