I am encouraged... more discussion on the netdev list...<br><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Jeff Kirsher</b> <span dir="ltr"><<a href="mailto:jeffrey.t.kirsher@intel.com">jeffrey.t.kirsher@intel.com</a>></span><br>
Date: Sat, Sep 17, 2011 at 1:04 AM<br>Subject: [net-next 11/13] igb: Make Tx budget for NAPI user adjustable<br>To: <a href="mailto:davem@davemloft.net">davem@davemloft.net</a><br>Cc: Alexander Duyck <<a href="mailto:alexander.h.duyck@intel.com">alexander.h.duyck@intel.com</a>>, <a href="mailto:netdev@vger.kernel.org">netdev@vger.kernel.org</a>, <a href="mailto:gospo@redhat.com">gospo@redhat.com</a>, Jeff Kirsher <<a href="mailto:jeffrey.t.kirsher@intel.com">jeffrey.t.kirsher@intel.com</a>><br>
<br><br>From: Alexander Duyck <<a href="mailto:alexander.h.duyck@intel.com">alexander.h.duyck@intel.com</a>><br>
<br>
This change is meant to make the NAPI budget limits for transmit<br>
adjustable.  By doing this it is possible to tune the value for optimal<br>
performance with applications such as routing.<br>
<br>
Signed-off-by: Alexander Duyck <<a href="mailto:alexander.h.duyck@intel.com">alexander.h.duyck@intel.com</a>><br>
Tested-by:  Aaron Brown  <<a href="mailto:aaron.f.brown@intel.com">aaron.f.brown@intel.com</a>><br>
Signed-off-by: Jeff Kirsher <<a href="mailto:jeffrey.t.kirsher@intel.com">jeffrey.t.kirsher@intel.com</a>><br>
---<br>
 drivers/net/ethernet/intel/igb/igb.h         |    3 +<br>
 drivers/net/ethernet/intel/igb/igb_ethtool.c |    6 +<br>
 drivers/net/ethernet/intel/igb/igb_main.c    |  136 ++++++++++++++++----------<br>
 3 files changed, 92 insertions(+), 53 deletions(-)<br>
<br>
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h<br>
index b725937..cfa590c8 100644<br>
--- a/drivers/net/ethernet/intel/igb/igb.h<br>
+++ b/drivers/net/ethernet/intel/igb/igb.h<br>
@@ -47,6 +47,7 @@ struct igb_adapter;<br>
<br>
 /* TX/RX descriptor defines */<br>
 #define IGB_DEFAULT_TXD                  256<br>
+#define IXGBE_DEFAULT_TX_WORK           128<br>
 #define IGB_MIN_TXD                       80<br>
 #define IGB_MAX_TXD                     4096<br>
<br>
@@ -177,6 +178,7 @@ struct igb_q_vector {<br>
<br>
        u32 eims_value;<br>
        u16 cpu;<br>
+       u16 tx_work_limit;<br>
<br>
        u16 itr_val;<br>
        u8 set_itr;<br>
@@ -266,6 +268,7 @@ struct igb_adapter {<br>
        u16 rx_itr;<br>
<br>
        /* TX */<br>
+       u16 tx_work_limit;<br>
        u32 tx_timeout_count;<br>
        int num_tx_queues;<br>
        struct igb_ring *tx_ring[16];<br>
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c<br>
index f231d82..f6da820 100644<br>
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c<br>
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c<br>
@@ -1989,6 +1989,9 @@ static int igb_set_coalesce(struct net_device *netdev,<br>
        if ((adapter->flags & IGB_FLAG_QUEUE_PAIRS) && ec->tx_coalesce_usecs)<br>
                return -EINVAL;<br>
<br>
+       if (ec->tx_max_coalesced_frames_irq)<br>
+               adapter->tx_work_limit = ec->tx_max_coalesced_frames_irq;<br>
+<br>
        /* If ITR is disabled, disable DMAC */<br>
        if (ec->rx_coalesce_usecs == 0) {<br>
                if (adapter->flags & IGB_FLAG_DMAC)<br>
@@ -2011,6 +2014,7 @@ static int igb_set_coalesce(struct net_device *netdev,<br>
<br>
        for (i = 0; i < adapter->num_q_vectors; i++) {<br>
                struct igb_q_vector *q_vector = adapter->q_vector[i];<br>
+               q_vector->tx_work_limit = adapter->tx_work_limit;<br>
                if (q_vector->rx_ring)<br>
                        q_vector->itr_val = adapter->rx_itr_setting;<br>
                else<br>
@@ -2033,6 +2037,8 @@ static int igb_get_coalesce(struct net_device *netdev,<br>
        else<br>
                ec->rx_coalesce_usecs = adapter->rx_itr_setting >> 2;<br>
<br>
+       ec->tx_max_coalesced_frames_irq = adapter->tx_work_limit;<br>
+<br>
        if (!(adapter->flags & IGB_FLAG_QUEUE_PAIRS)) {<br>
                if (adapter->tx_itr_setting <= 3)<br>
                        ec->tx_coalesce_usecs = adapter->tx_itr_setting;<br>
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c<br>
index 7ad25e8..989e774 100644<br>
--- a/drivers/net/ethernet/intel/igb/igb_main.c<br>
+++ b/drivers/net/ethernet/intel/igb/igb_main.c<br>
@@ -136,8 +136,8 @@ static irqreturn_t igb_msix_ring(int irq, void *);<br>
 static void igb_update_dca(struct igb_q_vector *);<br>
 static void igb_setup_dca(struct igb_adapter *);<br>
 #endif /* CONFIG_IGB_DCA */<br>
-static bool igb_clean_tx_irq(struct igb_q_vector *);<br>
 static int igb_poll(struct napi_struct *, int);<br>
+static bool igb_clean_tx_irq(struct igb_q_vector *);<br>
 static bool igb_clean_rx_irq(struct igb_q_vector *, int);<br>
 static int igb_ioctl(struct net_device *, struct ifreq *, int cmd);<br>
 static void igb_tx_timeout(struct net_device *);<br>
@@ -1120,6 +1120,7 @@ static void igb_map_tx_ring_to_vector(struct igb_adapter *adapter,<br>
        q_vector->tx_ring = adapter->tx_ring[ring_idx];<br>
        q_vector->tx_ring->q_vector = q_vector;<br>
        q_vector->itr_val = adapter->tx_itr_setting;<br>
+       q_vector->tx_work_limit = adapter->tx_work_limit;<br>
        if (q_vector->itr_val && q_vector->itr_val <= 3)<br>
                q_vector->itr_val = IGB_START_ITR;<br>
 }<br>
@@ -2388,11 +2389,17 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)<br>
<br>
        pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word);<br>
<br>
+       /* set default ring sizes */<br>
        adapter->tx_ring_count = IGB_DEFAULT_TXD;<br>
        adapter->rx_ring_count = IGB_DEFAULT_RXD;<br>
+<br>
+       /* set default ITR values */<br>
        adapter->rx_itr_setting = IGB_DEFAULT_ITR;<br>
        adapter->tx_itr_setting = IGB_DEFAULT_ITR;<br>
<br>
+       /* set default work limits */<br>
+       adapter->tx_work_limit = IXGBE_DEFAULT_TX_WORK;<br>
+<br>
        adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN +<br>
                                  VLAN_HLEN;<br>
        adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;<br>
@@ -5496,7 +5503,7 @@ static int igb_poll(struct napi_struct *napi, int budget)<br>
                igb_update_dca(q_vector);<br>
 #endif<br>
        if (q_vector->tx_ring)<br>
-               clean_complete = !!igb_clean_tx_irq(q_vector);<br>
+               clean_complete = igb_clean_tx_irq(q_vector);<br>
<br>
        if (q_vector->rx_ring)<br>
                clean_complete &= igb_clean_rx_irq(q_vector, budget);<br>
@@ -5578,64 +5585,69 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)<br>
 {<br>
        struct igb_adapter *adapter = q_vector->adapter;<br>
        struct igb_ring *tx_ring = q_vector->tx_ring;<br>
-       struct net_device *netdev = tx_ring->netdev;<br>
-       struct e1000_hw *hw = &adapter->hw;<br>
-       struct igb_buffer *buffer_info;<br>
-       union e1000_adv_tx_desc *tx_desc, *eop_desc;<br>
+       struct igb_buffer *tx_buffer;<br>
+       union e1000_adv_tx_desc *tx_desc;<br>
        unsigned int total_bytes = 0, total_packets = 0;<br>
-       unsigned int i, eop, count = 0;<br>
-       bool cleaned = false;<br>
+       unsigned int budget = q_vector->tx_work_limit;<br>
+       u16 i = tx_ring->next_to_clean;<br>
<br>
-       i = tx_ring->next_to_clean;<br>
-       eop = tx_ring->buffer_info[i].next_to_watch;<br>
-       eop_desc = IGB_TX_DESC(tx_ring, eop);<br>
+       if (test_bit(__IGB_DOWN, &adapter->state))<br>
+               return true;<br>
<br>
-       while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&<br>
-              (count < tx_ring->count)) {<br>
-               rmb();  /* read buffer_info after eop_desc status */<br>
-               for (cleaned = false; !cleaned; count++) {<br>
-                       tx_desc = IGB_TX_DESC(tx_ring, i);<br>
-                       buffer_info = &tx_ring->buffer_info[i];<br>
-                       cleaned = (i == eop);<br>
+       tx_buffer = &tx_ring->buffer_info[i];<br>
+       tx_desc = IGB_TX_DESC(tx_ring, i);<br>
<br>
-                       if (buffer_info->skb) {<br>
-                               total_bytes += buffer_info->bytecount;<br>
-                               /* gso_segs is currently only valid for tcp */<br>
-                               total_packets += buffer_info->gso_segs;<br>
-                               igb_tx_hwtstamp(q_vector, buffer_info);<br>
-                       }<br>
+       for (; budget; budget--) {<br>
+               u16 eop = tx_buffer->next_to_watch;<br>
+               union e1000_adv_tx_desc *eop_desc;<br>
+<br>
+               eop_desc = IGB_TX_DESC(tx_ring, eop);<br>
+<br>
+               /* if DD is not set pending work has not been completed */<br>
+               if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))<br>
+                       break;<br>
+<br>
+               /* prevent any other reads prior to eop_desc being verified */<br>
+               rmb();<br>
<br>
-                       igb_unmap_and_free_tx_resource(tx_ring, buffer_info);<br>
+               do {<br>
                        tx_desc->wb.status = 0;<br>
+                       if (likely(tx_desc == eop_desc)) {<br>
+                               eop_desc = NULL;<br>
+<br>
+                               total_bytes += tx_buffer->bytecount;<br>
+                               total_packets += tx_buffer->gso_segs;<br>
+                               igb_tx_hwtstamp(q_vector, tx_buffer);<br>
+                       }<br>
+<br>
+                       igb_unmap_and_free_tx_resource(tx_ring, tx_buffer);<br>
<br>
+                       tx_buffer++;<br>
+                       tx_desc++;<br>
                        i++;<br>
-                       if (i == tx_ring->count)<br>
+                       if (unlikely(i == tx_ring->count)) {<br>
                                i = 0;<br>
-               }<br>
-               eop = tx_ring->buffer_info[i].next_to_watch;<br>
-               eop_desc = IGB_TX_DESC(tx_ring, eop);<br>
+                               tx_buffer = tx_ring->buffer_info;<br>
+                               tx_desc = IGB_TX_DESC(tx_ring, 0);<br>
+                       }<br>
+               } while (eop_desc);<br>
        }<br>
<br>
        tx_ring->next_to_clean = i;<br>
+       u64_stats_update_begin(&tx_ring->tx_syncp);<br>
+       tx_ring->tx_stats.bytes += total_bytes;<br>
+       tx_ring->tx_stats.packets += total_packets;<br>
+       u64_stats_update_end(&tx_ring->tx_syncp);<br>
+       tx_ring->total_bytes += total_bytes;<br>
+       tx_ring->total_packets += total_packets;<br>
<br>
-       if (unlikely(count &&<br>
-                    netif_carrier_ok(netdev) &&<br>
-                    igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {<br>
-               /* Make sure that anybody stopping the queue after this<br>
-                * sees the new next_to_clean.<br>
-                */<br>
-               smp_mb();<br>
-               if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&<br>
-                   !(test_bit(__IGB_DOWN, &adapter->state))) {<br>
-                       netif_wake_subqueue(netdev, tx_ring->queue_index);<br>
+       if (tx_ring->detect_tx_hung) {<br>
+               struct e1000_hw *hw = &adapter->hw;<br>
+               u16 eop = tx_ring->buffer_info[i].next_to_watch;<br>
+               union e1000_adv_tx_desc *eop_desc;<br>
<br>
-                       u64_stats_update_begin(&tx_ring->tx_syncp);<br>
-                       tx_ring->tx_stats.restart_queue++;<br>
-                       u64_stats_update_end(&tx_ring->tx_syncp);<br>
-               }<br>
-       }<br>
+               eop_desc = IGB_TX_DESC(tx_ring, eop);<br>
<br>
-       if (tx_ring->detect_tx_hung) {<br>
                /* Detect a transmit hang in hardware, this serializes the<br>
                 * check with the clearing of time_stamp and movement of i */<br>
                tx_ring->detect_tx_hung = false;<br>
@@ -5666,16 +5678,34 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)<br>
                                eop,<br>
                                jiffies,<br>
                                eop_desc->wb.status);<br>
-                       netif_stop_subqueue(netdev, tx_ring->queue_index);<br>
+                       netif_stop_subqueue(tx_ring->netdev,<br>
+                                           tx_ring->queue_index);<br>
+<br>
+                       /* we are about to reset, no point in enabling stuff */<br>
+                       return true;<br>
                }<br>
        }<br>
-       tx_ring->total_bytes += total_bytes;<br>
-       tx_ring->total_packets += total_packets;<br>
-       u64_stats_update_begin(&tx_ring->tx_syncp);<br>
-       tx_ring->tx_stats.bytes += total_bytes;<br>
-       tx_ring->tx_stats.packets += total_packets;<br>
-       u64_stats_update_end(&tx_ring->tx_syncp);<br>
-       return count < tx_ring->count;<br>
+<br>
+       if (unlikely(total_packets &&<br>
+                    netif_carrier_ok(tx_ring->netdev) &&<br>
+                    igb_desc_unused(tx_ring) >= IGB_TX_QUEUE_WAKE)) {<br>
+               /* Make sure that anybody stopping the queue after this<br>
+                * sees the new next_to_clean.<br>
+                */<br>
+               smp_mb();<br>
+               if (__netif_subqueue_stopped(tx_ring->netdev,<br>
+                                            tx_ring->queue_index) &&<br>
+                   !(test_bit(__IGB_DOWN, &adapter->state))) {<br>
+                       netif_wake_subqueue(tx_ring->netdev,<br>
+                                           tx_ring->queue_index);<br>
+<br>
+                       u64_stats_update_begin(&tx_ring->tx_syncp);<br>
+                       tx_ring->tx_stats.restart_queue++;<br>
+                       u64_stats_update_end(&tx_ring->tx_syncp);<br>
+               }<br>
+       }<br>
+<br>
+       return !!budget;<br>
 }<br>
<br>
 static inline void igb_rx_checksum(struct igb_ring *ring,<br>
--<br>
1.7.6<br>
<font color="#888888"><br>
--<br>
To unsubscribe from this list: send the line "unsubscribe netdev" in<br>
the body of a message to <a href="mailto:majordomo@vger.kernel.org">majordomo@vger.kernel.org</a><br>
More majordomo info at  <a href="http://vger.kernel.org/majordomo-info.html" target="_blank">http://vger.kernel.org/majordomo-info.html</a><br>
</font></div><br><br clear="all"><br>-- <br>Dave Täht<br>SKYPE: davetaht<br>US Tel: 1-239-829-5608<br><a href="http://the-edge.blogspot.com" target="_blank">http://the-edge.blogspot.com</a> <br>