[Devel] [PATCH RHEL7 COMMIT] ms/ixgbe: Refactor busy poll socket code to address multiple issues

Konstantin Khorenko khorenko at virtuozzo.com
Mon Dec 28 05:48:03 PST 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.9.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.9.18
------>
commit 58c37d8dec10fe0ab5b90ce96ebfa2f2ecc4175d
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Mon Dec 28 17:48:03 2015 +0400

    ms/ixgbe: Refactor busy poll socket code to address multiple issues
    
    In netpoll_send_skb_on_dev we do not trigger the warning, so IRQs are
    disabled. And in ixgbe_poll->ixgbe_qv_unlock_napi, with IRQs disabled,
    we use spin_unlock_bh, which is a bug: internally it calls
    local_irq_enable, which will re-enable hardware interrupts.
    
    So port the patch which changes the locking here to atomic_cmpxchg.
    
    https://jira.sw.ru/browse/PSBM-40330
    
    WARNING:
    Dec  1 18:03:10 p9 kernel: netpoll: netconsole: local port 6666
    Dec  1 18:03:10 p9 kernel: netpoll: netconsole: local IPv4 address 10.94.2.172
    Dec  1 18:03:10 p9 kernel: netpoll: netconsole: interface 'br0'
    Dec  1 18:03:10 p9 kernel: netpoll: netconsole: remote port 6666
    Dec  1 18:03:10 p9 kernel: netpoll: netconsole: remote IPv4 address 10.29.0.70
    Dec  1 18:03:10 p9 kernel: netpoll: netconsole: remote ethernet address 7c:95:f3:2e:05:64
    Dec  1 18:03:10 p9 kernel: ------------[ cut here ]------------
    Dec  1 18:03:10 p9 kernel: WARNING: at kernel/softirq.c:162 local_bh_enable_ip+0x7a/0xa0()
    Dec  1 18:03:10 p9 kernel: Modules linked in: netconsole(+) xt_CHECKSUM iptable_mangle iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT tun ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables 8021q garp mrp intel_powerclamp coretemp kvm_intel kvm snd_pcm snd_timer snd iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul ioatdma glue_helper soundcore ablk_helper cryptd mei_me pcspkr ses sb_edac enclosure edac_core mei i2c_i801 ipmi_si shpchp lpc_ich mfd_core wmi ipmi_msghandler dm_mod ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_common mgag200 isci syscopyarea sysfillrect sysimgblt drm_kms_helper ttm libsas ixgbe drm igb scsi_transport_sas ahci libahci megaraid_sas libata mdio
    Dec  1 18:03:10 p9 kernel: ptp pps_core dca i2c_algo_bit i2c_core ip6_vzprivnet ip6_vznetstat pio_kaio pio_nfs pio_direct pfmt_raw pfmt_ploop1 ploop ip_vznetstat ip_vzprivnet vziolimit vzevent vzlist vzstat vznetstat vznetdev vzmon vzdev bridge stp llc
    Dec  1 18:03:10 p9 kernel: CPU: 31 PID: 3649 Comm: modprobe ve: 0 Not tainted 3.10.0-229.7.2.ovz.9.14-00004-gcc670bd-dirty #2 9.14
    Dec  1 18:03:10 p9 kernel: Hardware name: DEPO Computers X9DRi-LN4+/X9DR3-LN4+/X9DRi-LN4+/X9DR3-LN4+, BIOS 3.2 03/04/2015
    Dec  1 18:03:10 p9 kernel: 00000000000000a2 000000001e836364 ffff883fd0aeb908 ffffffff816222a3
    Dec  1 18:03:10 p9 kernel: ffff883fd0aeb940 ffffffff8107025b ffff883fc90d2040 ffff883fc90d2368
    Dec  1 18:03:10 p9 kernel: 0000000000000005 0000000000000001 00000000fffffe05 ffff883fd0aeb950
    Dec  1 18:03:10 p9 kernel: Call Trace:
    Dec  1 18:03:10 p9 kernel: [<ffffffff816222a3>] dump_stack+0x19/0x1b
    Dec  1 18:03:10 p9 kernel: [<ffffffff8107025b>] warn_slowpath_common+0x6b/0xa0
    Dec  1 18:03:10 p9 kernel: [<ffffffff8107039a>] warn_slowpath_null+0x1a/0x20
    Dec  1 18:03:10 p9 kernel: [<ffffffff81079bea>] local_bh_enable_ip+0x7a/0xa0
    Dec  1 18:03:10 p9 kernel: [<ffffffff816285cb>] _raw_spin_unlock_bh+0x1b/0x70
    Dec  1 18:03:10 p9 kernel: [<ffffffffa02e123d>] ixgbe_poll+0x54d/0x980 [ixgbe]
    Dec  1 18:03:10 p9 kernel: [<ffffffff8153d1b6>] netpoll_poll_dev+0x1b6/0xa80
    Dec  1 18:03:10 p9 kernel: [<ffffffffa02dcf4b>] ? ixgbe_select_queue+0xbb/0x120 [ixgbe]
    Dec  1 18:03:10 p9 kernel: [<ffffffff8153ddc0>] netpoll_send_skb_on_dev+0x340/0x480
    Dec  1 18:03:10 p9 kernel: [<ffffffff8119cf78>] ? __list_lru_walk_one.isra.3+0x148/0x170
    Dec  1 18:03:10 p9 kernel: [<ffffffffa00129d6>] __br_deliver+0x106/0x130 [bridge]
    Dec  1 18:03:10 p9 kernel: [<ffffffffa0012e83>] br_deliver+0x63/0x70 [bridge]
    Dec  1 18:03:10 p9 kernel: [<ffffffffa0010250>] br_dev_xmit+0x240/0x2a0 [bridge]
    Dec  1 18:03:10 p9 kernel: [<ffffffff8153dcf6>] netpoll_send_skb_on_dev+0x276/0x480
    Dec  1 18:03:10 p9 kernel: [<ffffffff8153e22b>] netpoll_send_udp+0x27b/0x390
    Dec  1 18:03:10 p9 kernel: [<ffffffffa05d383f>] write_msg+0xcf/0x140 [netconsole]
    Dec  1 18:03:10 p9 kernel: [<ffffffff81070f6f>] call_console_drivers.constprop.21+0x9f/0xf0
    Dec  1 18:03:10 p9 kernel: [<ffffffff810718b3>] console_unlock+0x293/0x410
    Dec  1 18:03:10 p9 kernel: [<ffffffff810726de>] register_console+0x14e/0x3e0
    Dec  1 18:03:10 p9 kernel: [<ffffffffa05d81a0>] init_netconsole+0x1a0/0x1000 [netconsole]
    Dec  1 18:03:10 p9 kernel: [<ffffffffa05d8000>] ? 0xffffffffa05d7fff
    Dec  1 18:03:10 p9 kernel: [<ffffffff810020e8>] do_one_initcall+0xb8/0x230
    Dec  1 18:03:10 p9 kernel: [<ffffffff810f3996>] load_module+0x1656/0x1cc0
    Dec  1 18:03:10 p9 kernel: [<ffffffff8130c7a0>] ? ddebug_proc_write+0xf0/0xf0
    Dec  1 18:03:10 p9 kernel: [<ffffffff810ef6aa>] ? copy_module_from_fd.isra.42+0x5a/0x150
    Dec  1 18:03:10 p9 kernel: [<ffffffff810f41de>] SyS_finit_module+0xbe/0xf0
    Dec  1 18:03:10 p9 kernel: [<ffffffff816318c9>] system_call_fastpath+0x16/0x1b
    Dec  1 18:03:10 p9 kernel: ---[ end trace 44dd3910a6b1ff80 ]---
    Dec  1 18:03:10 p9 kernel: Tainting kernel with flag 0x9
    
    Port ms commit:
      commit adc810900a703ee78fe88fd65e086d359fec04b2
      Author: Alexander Duyck <alexander.h.duyck at intel.com>
      Date:   Sat Jul 26 02:42:44 2014 +0000
    
        This change addresses several issues in the current ixgbe implementation of
        busy poll sockets.
    
        First was the fact that it was possible for frames to be delivered out of
        order if they were held in GRO.  This is addressed by flushing the GRO buffers
        before releasing the q_vector back to the idle state.
    
        The other issue was the fact that we were having to take a spinlock on
        changing the state to and from idle.  To resolve this I have replaced the
        state value with an atomic and use atomic_cmpxchg to change the value from
        idle, and a simple atomic set to restore it back to idle after we have
        acquired it.  This allows us to only use a locked operation on acquiring the
        vector without a need for a locked operation to release it.
    
        Signed-off-by: Alexander Duyck <alexander.h.duyck at intel.com>
        Tested-by: Phil Schmitt <phillip.j.schmitt at intel.com>
        Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher at intel.com>
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    Acked-by: Andrew Vagin <avagin at virtuozzo.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h     | 112 ++++++++++-----------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c |   5 ++
 2 files changed, 45 insertions(+), 72 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 211c30e..00da363 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -371,119 +371,87 @@ struct ixgbe_q_vector {
 	char name[IFNAMSIZ + 9];
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int state;
-#define IXGBE_QV_STATE_IDLE        0
-#define IXGBE_QV_STATE_NAPI	   1     /* NAPI owns this QV */
-#define IXGBE_QV_STATE_POLL	   2     /* poll owns this QV */
-#define IXGBE_QV_STATE_DISABLED	   4     /* QV is disabled */
-#define IXGBE_QV_OWNED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL)
-#define IXGBE_QV_LOCKED (IXGBE_QV_OWNED | IXGBE_QV_STATE_DISABLED)
-#define IXGBE_QV_STATE_NAPI_YIELD  8     /* NAPI yielded this QV */
-#define IXGBE_QV_STATE_POLL_YIELD  16    /* poll yielded this QV */
-#define IXGBE_QV_YIELD (IXGBE_QV_STATE_NAPI_YIELD | IXGBE_QV_STATE_POLL_YIELD)
-#define IXGBE_QV_USER_PEND (IXGBE_QV_STATE_POLL | IXGBE_QV_STATE_POLL_YIELD)
-	spinlock_t lock;
 #endif  /* CONFIG_NET_RX_BUSY_POLL */
+	atomic_t state;
 
 	/* for dynamic allocation of rings associated with this q_vector */
 	struct ixgbe_ring ring[0] ____cacheline_internodealigned_in_smp;
 };
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
+enum ixgbe_qv_state_t {
+	IXGBE_QV_STATE_IDLE = 0,
+	IXGBE_QV_STATE_NAPI,
+	IXGBE_QV_STATE_POLL,
+	IXGBE_QV_STATE_DISABLE
+};
+
 static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector)
 {
-
-	spin_lock_init(&q_vector->lock);
-	q_vector->state = IXGBE_QV_STATE_IDLE;
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* called from the device poll routine to get ownership of a q_vector */
 static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if (q_vector->state & IXGBE_QV_LOCKED) {
-		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
-		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
-		rc = false;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_NAPI);
 #ifdef LL_EXTENDED_STATS
+	if (rc != IXGBE_QV_STATE_IDLE)
 		q_vector->tx.ring->stats.yields++;
 #endif
-	} else {
-		/* we don't care if someone yielded */
-		q_vector->state = IXGBE_QV_STATE_NAPI;
-	}
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
 /* returns true is someone tried to get the qv while napi had it */
-static inline bool ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
+static inline void ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
 {
-	int rc = false;
-	spin_lock_bh(&q_vector->lock);
-	WARN_ON(q_vector->state & (IXGBE_QV_STATE_POLL |
-			       IXGBE_QV_STATE_NAPI_YIELD));
-
-	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
-		rc = true;
-	/* will reset state to idle, unless QV is disabled */
-	q_vector->state &= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	WARN_ON(atomic_read(&q_vector->state) != IXGBE_QV_STATE_NAPI);
+
+	/* flush any outstanding Rx frames */
+	if (q_vector->napi.gro_list)
+		napi_gro_flush(&q_vector->napi, false);
+
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* called from ixgbe_low_latency_poll() */
 static inline bool ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if ((q_vector->state & IXGBE_QV_LOCKED)) {
-		q_vector->state |= IXGBE_QV_STATE_POLL_YIELD;
-		rc = false;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_POLL);
 #ifdef LL_EXTENDED_STATS
-		q_vector->rx.ring->stats.yields++;
+	if (rc != IXGBE_QV_STATE_IDLE)
+		q_vector->tx.ring->stats.yields++;
 #endif
-	} else {
-		/* preserve yield marks */
-		q_vector->state |= IXGBE_QV_STATE_POLL;
-	}
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
 /* returns true if someone tried to get the qv while it was locked */
-static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
+static inline void ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
 {
-	int rc = false;
-	spin_lock_bh(&q_vector->lock);
-	WARN_ON(q_vector->state & (IXGBE_QV_STATE_NAPI));
-
-	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
-		rc = true;
-	/* will reset state to idle, unless QV is disabled */
-	q_vector->state &= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	WARN_ON(atomic_read(&q_vector->state) != IXGBE_QV_STATE_POLL);
+
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* true if a socket is polling, even if it did not get the lock */
 static inline bool ixgbe_qv_ll_polling(struct ixgbe_q_vector *q_vector)
 {
-	WARN_ON(!(q_vector->state & IXGBE_QV_OWNED));
-	return q_vector->state & IXGBE_QV_USER_PEND;
+	return atomic_read(&q_vector->state) == IXGBE_QV_STATE_POLL;
 }
 
 /* false if QV is currently owned */
 static inline bool ixgbe_qv_disable(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if (q_vector->state & IXGBE_QV_OWNED)
-		rc = false;
-	q_vector->state |= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-
-	return rc;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_DISABLE);
+
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index dcd2d1ec..b42e44d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -808,6 +808,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
 		       ixgbe_poll, 64);
 	napi_hash_add(&q_vector->napi);
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* initialize busy poll */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_DISABLE);
+
+#endif
 	/* tie q_vector and adapter together */
 	adapter->q_vector[v_idx] = q_vector;
 	q_vector->adapter = adapter;


More information about the Devel mailing list