[Devel] [PATCH RHEL7 COMMIT] ms/blk-wbt: improve waking of tasks

Konstantin Khorenko khorenko at virtuozzo.com
Fri Oct 14 19:50:18 MSK 2022


The commit is pushed to "branch-rh7-3.10.0-1160.76.1.vz7.189.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.76.1.vz7.189.4
------>
commit f39f017e090342e34cb60e3247c27d08931eee6c
Author: Jens Axboe <axboe at kernel.dk>
Date:   Thu Sep 29 14:30:07 2022 +0300

    ms/blk-wbt: improve waking of tasks
    
    We have two potential issues:
    
    1) After commit 2887e41b910b, we only wake one process at a time when
       we finish an IO. We really want to wake up as many tasks as can
       queue IO. Before this commit, we woke up everyone, which could cause
       a thundering herd issue.
    
    2) A task can potentially consume two wakeups, causing us to (in
       practice) miss a wakeup.
    
    Fix both by providing our own wakeup function, which stops
    __wake_up_common() from waking up more tasks if we fail to get a
    queueing token. With the strict ordering we have on the wait list, this
    wakes the right tasks and the right amount of tasks.
    
    Based on a patch from Jianchao Wang <jianchao.w.wang at oracle.com>.
    
    Tested-by: Agarwal, Anchal <anchalag at amazon.com>
    Signed-off-by: Jens Axboe <axboe at kernel.dk>
    
    Changes when porting to vz7:
    - s/wait_queue_entry/__wait_queue/
    - s/entry/task_list/
    - add wb_acct argument to __wbt_wait
    
    https://jira.sw.ru/browse/PSBM-141883
    (cherry picked from commit 38cfb5a45ee013bfab5d1ae4c4738815e744b440)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    =================
    Patchset description:
    blk-wbt: Fix hardlockup in wbt_done()
    
    We have a hard lockup detected in this stack:
    
     #13 [ffff9103fe603af8] __enqueue_entity at ffffffffb8ce64c5
     #14 [ffff9103fe603b00] enqueue_entity at ffffffffb8cee27a
     #15 [ffff9103fe603b50] enqueue_task_fair at ffffffffb8ceea9c
     #16 [ffff9103fe603ba0] activate_task at ffffffffb8cdd029
     #17 [ffff9103fe603bc8] ttwu_do_activate at ffffffffb8cdd491
     #18 [ffff9103fe603bf0] try_to_wake_up at ffffffffb8ce124a
     #19 [ffff9103fe603c40] default_wake_function at ffffffffb8ce1552
     #20 [ffff9103fe603c50] autoremove_wake_function at ffffffffb8ccb178
     #21 [ffff9103fe603c78] __wake_up_common at ffffffffb8cd7752
     #22 [ffff9103fe603cd0] __wake_up_common_lock at ffffffffb8cd7873
     #23 [ffff9103fe603d40] __wake_up at ffffffffb8cd78c3
     #24 [ffff9103fe603d50] __wbt_done at ffffffffb8fb6573
     #25 [ffff9103fe603d60] wbt_done at ffffffffb8fb65f2
     #26 [ffff9103fe603d80] __blk_mq_finish_request at ffffffffb8f8daa1
     #27 [ffff9103fe603db8] blk_mq_finish_request at ffffffffb8f8db6a
     #28 [ffff9103fe603dc8] blk_mq_sched_put_request at ffffffffb8f93ee0
     #29 [ffff9103fe603de8] blk_mq_end_request at ffffffffb8f8d1a4
     #30 [ffff9103fe603e08] nvme_complete_rq at ffffffffc033dcfc [nvme_core]
     #31 [ffff9103fe603e18] nvme_pci_complete_rq at ffffffffc038be70 [nvme]
     #32 [ffff9103fe603e40] __blk_mq_complete_request at ffffffffb8f8d316
     #33 [ffff9103fe603e68] blk_mq_complete_request at ffffffffb8f8d3c7
     #34 [ffff9103fe603e78] nvme_irq at ffffffffc038c0b2 [nvme]
     #35 [ffff9103fe603eb0] __handle_irq_event_percpu at ffffffffb8d66bb4
     #36 [ffff9103fe603ef8] handle_irq_event_percpu at ffffffffb8d66d62
     #37 [ffff9103fe603f28] handle_irq_event at ffffffffb8d66dec
     #38 [ffff9103fe603f50] handle_edge_irq at ffffffffb8d69c0f
     #39 [ffff9103fe603f70] handle_irq at ffffffffb8c30524
     #40 [ffff9103fe603fb8] do_IRQ at ffffffffb93d898d
    
    which is exactly the same as ubuntu problem here:
    
    https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1810998
    
    This is because we have writeback throttling ported, which does not work
    well in some cases.
    
    In the Launchpad bug it helped to port these patches from mainline:
    
      * CPU hard lockup with rigorous writes to NVMe drive (LP: #1810998)
        - blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait
        - blk-wbt: move disable check into get_limit()
        - blk-wbt: use wq_has_sleeper() for wq active check
        - blk-wbt: fix has-sleeper queueing check
        - blk-wbt: abstract out end IO completion handler
        - blk-wbt: improve waking of tasks
    
    which fixes similar lockup issues in wbt.
    
    Moreover, I've found some more small and useful patches which fix races
    (missed wakeups) in this code, so I've also put them in the patchset.
    
    Anchal Agarwal (1):
      blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait
    
    Herbert Xu (1):
      net: Generalise wq_has_sleeper helper
    
    Jens Axboe (5):
      blk-wbt: move disable check into get_limit()
      blk-wbt: use wq_has_sleeper() for wq active check
      blk-wbt: fix has-sleeper queueing check
      blk-wbt: abstract out end IO completion handler
      blk-wbt: improve waking of tasks
    
    Josef Bacik (5):
      wait: add wq_has_single_sleeper helper
      rq-qos: fix missed wake-ups in rq_qos_throttle
      rq-qos: don't reset has_sleepers on spurious wakeups
      rq-qos: set ourself TASK_UNINTERRUPTIBLE after we schedule
      rq-qos: use a mb for got_token
    
    https://jira.sw.ru/browse/PSBM-141883
    
    Ported-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 block/blk-wbt.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 12 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 8719a820bfb7..49d11e089c97 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -159,7 +159,7 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
 		int diff = limit - inflight;
 
 		if (!inflight || diff >= rwb->wb_background / 2)
-			wake_up(&rqw->wait);
+			wake_up_all(&rqw->wait);
 	}
 }
 
@@ -518,26 +518,76 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	return limit;
 }
 
+struct wbt_wait_data {
+	struct __wait_queue wq;
+	struct task_struct *task;
+	struct rq_wb *rwb;
+	struct rq_wait *rqw;
+	unsigned long rw;
+	bool got_token;
+};
+
+static int wbt_wake_function(struct __wait_queue *curr, unsigned int mode,
+			     int wake_flags, void *key)
+{
+	struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data,
+							wq);
+
+	/*
+	 * If we fail to get a budget, return -1 to interrupt the wake up
+	 * loop in __wake_up_common.
+	 */
+	if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
+		return -1;
+
+	data->got_token = true;
+	list_del_init(&curr->task_list);
+	wake_up_process(data->task);
+	return 1;
+}
+
 /*
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
  */
-static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
+		       unsigned long rw, spinlock_t *lock)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
-	DECLARE_WAITQUEUE(wait, current);
+	struct wbt_wait_data data = {
+		.wq = {
+			.func = wbt_wake_function,
+			.task_list = LIST_HEAD_INIT(data.wq.task_list),
+		},
+		.task = current,
+		.rwb = rwb,
+		.rqw = rqw,
+		.rw = rw,
+	};
 	bool has_sleeper;
 
 	has_sleeper = wq_has_sleeper(&rqw->wait);
 	if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
 		return;
 
-	add_wait_queue_exclusive(&rqw->wait, &wait);
+	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
 	do {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (data.got_token)
+			break;
 
-		if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+		if (!has_sleeper &&
+		    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
+			finish_wait(&rqw->wait, &data.wq);
+
+			/*
+			 * We raced with wbt_wake_function() getting a token,
+			 * which means we now have two. Put our local token
+			 * and wake anyone else potentially waiting for one.
+			 */
+			if (data.got_token)
+				wbt_rqw_done(rwb, rqw, wb_acct);
 			break;
+		}
 
 		if (lock)
 			spin_unlock_irq(lock);
@@ -550,8 +600,7 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 		has_sleeper = false;
 	} while (1);
 
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&rqw->wait, &wait);
+	finish_wait(&rqw->wait, &data.wq);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -595,14 +644,14 @@ unsigned int wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 		return ret;
 	}
 
-	__wbt_wait(rwb, bio->bi_rw, lock);
+	if (current_is_kswapd())
+		ret |= WBT_KSWAPD;
+
+	__wbt_wait(rwb, ret, bio->bi_rw, lock);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
 
-	if (current_is_kswapd())
-		ret |= WBT_KSWAPD;
-
 	return ret | WBT_TRACKED;
 }
 


More information about the Devel mailing list