[Devel] [PATCH RHEL7 COMMIT] ms/blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait

Fri Oct 14 19:50:17 MSK 2022

The commit is pushed to "branch-rh7-3.10.0-1160.76.1.vz7.189.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.76.1.vz7.189.4
------>
commit 9b97c13c145a9cd9c53fba49d8037cb2c978d8d2
Author: Anchal Agarwal <anchalag at amazon.com>
Date:   Thu Sep 29 14:30:01 2022 +0300

    ms/blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait
    
    I am currently running a large bare metal instance (i3.metal)
    on EC2 with 72 cores, 512GB of RAM and NVME drives, with a
    4.18 kernel. I have a workload that simulates a database
    workload and I am running into lockup issues when writeback
    throttling is enabled,with the hung task detector also
    kicking in.
    
    Crash dumps show that most CPUs (up to 50 of them) are
    all trying to get the wbt wait queue lock while trying to add
    themselves to it in __wbt_wait (see stack traces below).
    
    [    0.948118] CPU: 45 PID: 0 Comm: swapper/45 Not tainted 4.14.51-62.38.amzn1.x86_64 #1
    [    0.948119] Hardware name: Amazon EC2 i3.metal/Not Specified, BIOS 1.0 10/16/2017
    [    0.948120] task: ffff883f7878c000 task.stack: ffffc9000c69c000
    [    0.948124] RIP: 0010:native_queued_spin_lock_slowpath+0xf8/0x1a0
    [    0.948125] RSP: 0018:ffff883f7fcc3dc8 EFLAGS: 00000046
    [    0.948126] RAX: 0000000000000000 RBX: ffff887f7709ca68 RCX: ffff883f7fce2a00
    [    0.948128] RDX: 000000000000001c RSI: 0000000000740001 RDI: ffff887f7709ca68
    [    0.948129] RBP: 0000000000000002 R08: 0000000000b80000 R09: 0000000000000000
    [    0.948130] R10: ffff883f7fcc3d78 R11: 000000000de27121 R12: 0000000000000002
    [    0.948131] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000
    [    0.948132] FS:  0000000000000000(0000) GS:ffff883f7fcc0000(0000) knlGS:0000000000000000
    [    0.948134] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [    0.948135] CR2: 000000c424c77000 CR3: 0000000002010005 CR4: 00000000003606e0
    [    0.948136] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    [    0.948137] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    [    0.948138] Call Trace:
    [    0.948139]  <IRQ>
    [    0.948142]  do_raw_spin_lock+0xad/0xc0
    [    0.948145]  _raw_spin_lock_irqsave+0x44/0x4b
    [    0.948149]  ? __wake_up_common_lock+0x53/0x90
    [    0.948150]  __wake_up_common_lock+0x53/0x90
    [    0.948155]  wbt_done+0x7b/0xa0
    [    0.948158]  blk_mq_free_request+0xb7/0x110
    [    0.948161]  __blk_mq_complete_request+0xcb/0x140
    [    0.948166]  nvme_process_cq+0xce/0x1a0 [nvme]
    [    0.948169]  nvme_irq+0x23/0x50 [nvme]
    [    0.948173]  __handle_irq_event_percpu+0x46/0x300
    [    0.948176]  handle_irq_event_percpu+0x20/0x50
    [    0.948179]  handle_irq_event+0x34/0x60
    [    0.948181]  handle_edge_irq+0x77/0x190
    [    0.948185]  handle_irq+0xaf/0x120
    [    0.948188]  do_IRQ+0x53/0x110
    [    0.948191]  common_interrupt+0x87/0x87
    [    0.948192]  </IRQ>
    ....
    [    0.311136] CPU: 4 PID: 9737 Comm: run_linux_amd64 Not tainted 4.14.51-62.38.amzn1.x86_64 #1
    [    0.311137] Hardware name: Amazon EC2 i3.metal/Not Specified, BIOS 1.0 10/16/2017
    [    0.311138] task: ffff883f6e6a8000 task.stack: ffffc9000f1ec000
    [    0.311141] RIP: 0010:native_queued_spin_lock_slowpath+0xf5/0x1a0
    [    0.311142] RSP: 0018:ffffc9000f1efa28 EFLAGS: 00000046
    [    0.311144] RAX: 0000000000000000 RBX: ffff887f7709ca68 RCX: ffff883f7f722a00
    [    0.311145] RDX: 0000000000000035 RSI: 0000000000d80001 RDI: ffff887f7709ca68
    [    0.311146] RBP: 0000000000000202 R08: 0000000000140000 R09: 0000000000000000
    [    0.311147] R10: ffffc9000f1ef9d8 R11: 000000001a249fa0 R12: ffff887f7709ca68
    [    0.311148] R13: ffffc9000f1efad0 R14: 0000000000000000 R15: ffff887f7709ca00
    [    0.311149] FS:  000000c423f30090(0000) GS:ffff883f7f700000(0000) knlGS:0000000000000000
    [    0.311150] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [    0.311151] CR2: 00007feefcea4000 CR3: 0000007f7016e001 CR4: 00000000003606e0
    [    0.311152] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    [    0.311153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    [    0.311154] Call Trace:
    [    0.311157]  do_raw_spin_lock+0xad/0xc0
    [    0.311160]  _raw_spin_lock_irqsave+0x44/0x4b
    [    0.311162]  ? prepare_to_wait_exclusive+0x28/0xb0
    [    0.311164]  prepare_to_wait_exclusive+0x28/0xb0
    [    0.311167]  wbt_wait+0x127/0x330
    [    0.311169]  ? finish_wait+0x80/0x80
    [    0.311172]  ? generic_make_request+0xda/0x3b0
    [    0.311174]  blk_mq_make_request+0xd6/0x7b0
    [    0.311176]  ? blk_queue_enter+0x24/0x260
    [    0.311178]  ? generic_make_request+0xda/0x3b0
    [    0.311181]  generic_make_request+0x10c/0x3b0
    [    0.311183]  ? submit_bio+0x5c/0x110
    [    0.311185]  submit_bio+0x5c/0x110
    [    0.311197]  ? __ext4_journal_stop+0x36/0xa0 [ext4]
    [    0.311210]  ext4_io_submit+0x48/0x60 [ext4]
    [    0.311222]  ext4_writepages+0x810/0x11f0 [ext4]
    [    0.311229]  ? do_writepages+0x3c/0xd0
    [    0.311239]  ? ext4_mark_inode_dirty+0x260/0x260 [ext4]
    [    0.311240]  do_writepages+0x3c/0xd0
    [    0.311243]  ? _raw_spin_unlock+0x24/0x30
    [    0.311245]  ? wbc_attach_and_unlock_inode+0x165/0x280
    [    0.311248]  ? __filemap_fdatawrite_range+0xa3/0xe0
    [    0.311250]  __filemap_fdatawrite_range+0xa3/0xe0
    [    0.311253]  file_write_and_wait_range+0x34/0x90
    [    0.311264]  ext4_sync_file+0x151/0x500 [ext4]
    [    0.311267]  do_fsync+0x38/0x60
    [    0.311270]  SyS_fsync+0xc/0x10
    [    0.311272]  do_syscall_64+0x6f/0x170
    [    0.311274]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
    
    In the original patch, wbt_done is waking up all the exclusive
    processes in the wait queue, which can cause a thundering herd
    if there is a large number of writer threads in the queue. The
    original intention of the code seems to be to wake up one thread
    only however, it uses wake_up_all() in __wbt_done(), and then
    uses the following check in __wbt_wait to have only one thread
    actually get out of the wait loop:
    
    if (waitqueue_active(&rqw->wait) &&
                rqw->wait.head.next != &wait->entry)
                    return false;
    
    The problem with this is that the wait entry in wbt_wait is
    define with DEFINE_WAIT, which uses the autoremove wakeup function.
    That means that the above check is invalid - the wait entry will
    have been removed from the queue already by the time we hit the
    check in the loop.
    
    Secondly, auto-removing the wait entries also means that the wait
    queue essentially gets reordered "randomly" (e.g. threads re-add
    themselves in the order they got to run after being woken up).
    Additionally, new requests entering wbt_wait might overtake requests
    that were queued earlier, because the wait queue will be
    (temporarily) empty after the wake_up_all, so the waitqueue_active
    check will not stop them. This can cause certain threads to starve
    under high load.
    
    The fix is to leave the woken up requests in the queue and remove
    them in finish_wait() once the current thread breaks out of the
    wait loop in __wbt_wait. This will ensure new requests always
    end up at the back of the queue, and they won't overtake requests
    that are already in the wait queue. With that change, the loop
    in wbt_wait is also in line with many other wait loops in the kernel.
    Waking up just one thread drastically reduces lock contention, as
    does moving the wait queue add/remove out of the loop.
    
    A significant drop in lockdep's lock contention numbers is seen when
    running the test application on the patched kernel.
    
    Signed-off-by: Anchal Agarwal <anchalag at amazon.com>
    Signed-off-by: Frank van der Linden <fllinden at amazon.com>
    Signed-off-by: Jens Axboe <axboe at kernel.dk>
    
    Changes porting to vz7:
    - add rq_wait_inc_below helper
    
    https://jira.sw.ru/browse/PSBM-141883
    (cherry picked from commit 2887e41b910bb14fd847cf01ab7a5993db989d88)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    =================
    Patchset description:
    blk-wbt: Fix hardlockup in wbt_done()
    
    We have a hard lockup detected in this stack:
    
     #13 [ffff9103fe603af8] __enqueue_entity at ffffffffb8ce64c5
     #14 [ffff9103fe603b00] enqueue_entity at ffffffffb8cee27a
     #15 [ffff9103fe603b50] enqueue_task_fair at ffffffffb8ceea9c
     #16 [ffff9103fe603ba0] activate_task at ffffffffb8cdd029
     #17 [ffff9103fe603bc8] ttwu_do_activate at ffffffffb8cdd491
     #18 [ffff9103fe603bf0] try_to_wake_up at ffffffffb8ce124a
     #19 [ffff9103fe603c40] default_wake_function at ffffffffb8ce1552
     #20 [ffff9103fe603c50] autoremove_wake_function at ffffffffb8ccb178
     #21 [ffff9103fe603c78] __wake_up_common at ffffffffb8cd7752
     #22 [ffff9103fe603cd0] __wake_up_common_lock at ffffffffb8cd7873
     #23 [ffff9103fe603d40] __wake_up at ffffffffb8cd78c3
     #24 [ffff9103fe603d50] __wbt_done at ffffffffb8fb6573
     #25 [ffff9103fe603d60] wbt_done at ffffffffb8fb65f2
     #26 [ffff9103fe603d80] __blk_mq_finish_request at ffffffffb8f8daa1
     #27 [ffff9103fe603db8] blk_mq_finish_request at ffffffffb8f8db6a
     #28 [ffff9103fe603dc8] blk_mq_sched_put_request at ffffffffb8f93ee0
     #29 [ffff9103fe603de8] blk_mq_end_request at ffffffffb8f8d1a4
     #30 [ffff9103fe603e08] nvme_complete_rq at ffffffffc033dcfc [nvme_core]
     #31 [ffff9103fe603e18] nvme_pci_complete_rq at ffffffffc038be70 [nvme]
     #32 [ffff9103fe603e40] __blk_mq_complete_request at ffffffffb8f8d316
     #33 [ffff9103fe603e68] blk_mq_complete_request at ffffffffb8f8d3c7
     #34 [ffff9103fe603e78] nvme_irq at ffffffffc038c0b2 [nvme]
     #35 [ffff9103fe603eb0] __handle_irq_event_percpu at ffffffffb8d66bb4
     #36 [ffff9103fe603ef8] handle_irq_event_percpu at ffffffffb8d66d62
     #37 [ffff9103fe603f28] handle_irq_event at ffffffffb8d66dec
     #38 [ffff9103fe603f50] handle_edge_irq at ffffffffb8d69c0f
     #39 [ffff9103fe603f70] handle_irq at ffffffffb8c30524
     #40 [ffff9103fe603fb8] do_IRQ at ffffffffb93d898d
    
    which is exactly the same as ubuntu problem here:
    
    https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1810998
    
    this is because we have writeback throttling ported which does not work
    well in some cases.
    
    In launchpad bug it helped to port these patches from mainstream:
    
      * CPU hard lockup with rigorous writes to NVMe drive (LP: #1810998)
        - blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait
        - blk-wbt: move disable check into get_limit()
        - blk-wbt: use wq_has_sleeper() for wq active check
        - blk-wbt: fix has-sleeper queueing check
        - blk-wbt: abstract out end IO completion handler
        - blk-wbt: improve waking of tasks
    
    which fixes similar lockup issues in wbt.
    
    More over I've found some more small and useful patches which fix races
    (missed wakeups) in this code, so I've also put them in the patchset.
    
    Anchal Agarwal (1):
      blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait
    
    Herbert Xu (1):
      net: Generalise wq_has_sleeper helper
    
    Jens Axboe (5):
      blk-wbt: move disable check into get_limit()
      blk-wbt: use wq_has_sleeper() for wq active check
      blk-wbt: fix has-sleeper queueing check
      blk-wbt: abstract out end IO completion handler
      blk-wbt: improve waking of tasks
    
    Josef Bacik (5):
      wait: add wq_has_single_sleeper helper
      rq-qos: fix missed wake-ups in rq_qos_throttle
      rq-qos: don't reset has_sleepers on spurious wakeups
      rq-qos: set ourself TASK_UNINTERRUPTIBLE after we schedule
      rq-qos: use a mb for got_token
    
    https://jira.sw.ru/browse/PSBM-141883
    
    Ported-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 block/blk-wbt.c | 60 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 67f0c9e6451f..600046a47ed8 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -81,6 +81,11 @@ static bool atomic_inc_below(atomic_t *v, int below)
 	return true;
 }
 
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
+{
+	return atomic_inc_below(&rq_wait->inflight, limit);
+}
+
 static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
 {
 	if (rwb_enabled(rwb)) {
@@ -158,7 +163,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 		int diff = limit - inflight;
 
 		if (!inflight || diff >= rwb->wb_background / 2)
-			wake_up_all(&rqw->wait);
+			wake_up(&rqw->wait);
 	}
 }
 
@@ -499,30 +504,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	return limit;
 }
 
-static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
-			     wait_queue_t *wait, unsigned long rw)
-{
-	/*
-	 * inc it here even if disabled, since we'll dec it at completion.
-	 * this only happens if the task was sleeping in __wbt_wait(),
-	 * and someone turned it off at the same time.
-	 */
-	if (!rwb_enabled(rwb)) {
-		atomic_inc(&rqw->inflight);
-		return true;
-	}
-
-	/*
-	 * If the waitqueue is already active and we are not the next
-	 * in line to be woken up, wait for our turn.
-	 */
-	if (waitqueue_active(&rqw->wait) &&
-	    rqw->wait.task_list.next != &wait->task_list)
-		return false;
-
-	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
-}
-
 /*
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
@@ -530,16 +511,32 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
 static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
-	DEFINE_WAIT(wait);
+	DECLARE_WAITQUEUE(wait, current);
 
-	if (may_queue(rwb, rqw, &wait, rw))
+	/*
+	* inc it here even if disabled, since we'll dec it at completion.
+	* this only happens if the task was sleeping in __wbt_wait(),
+	* and someone turned it off at the same time.
+	*/
+	if (!rwb_enabled(rwb)) {
+		atomic_inc(&rqw->inflight);
+		return;
+	}
+
+	if (!waitqueue_active(&rqw->wait)
+		&& rq_wait_inc_below(rqw, get_limit(rwb, rw)))
 		return;
 
+	add_wait_queue_exclusive(&rqw->wait, &wait);
 	do {
-		prepare_to_wait_exclusive(&rqw->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+
+		if (!rwb_enabled(rwb)) {
+			atomic_inc(&rqw->inflight);
+			break;
+		}
 
-		if (may_queue(rwb, rqw, &wait, rw))
+		if (rq_wait_inc_below(rqw, get_limit(rwb, rw)))
 			break;
 
 		if (lock)
@@ -551,7 +548,8 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 			spin_lock_irq(lock);
 	} while (1);
 
-	finish_wait(&rqw->wait, &wait);
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&rqw->wait, &wait);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)