[Devel] [PATCH RH7 07/12] blk-wbt: improve waking of tasks

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Thu Sep 29 14:30:07 MSK 2022


From: Jens Axboe <axboe at kernel.dk>

We have two potential issues:

1) After commit 2887e41b910b, we only wake one process at the time when
   we finish an IO. We really want to wake up as many tasks as can
   queue IO. Before this commit, we woke up everyone, which could cause
   a thundering herd issue.

2) A task can potentially consume two wakeups, causing us to (in
   practice) miss a wakeup.

Fix both by providing our own wakeup function, which stops
__wake_up_common() from waking up more tasks if we fail to get a
queueing token. With the strict ordering we have on the wait list, this
wakes the right tasks and the right amount of tasks.

Based on a patch from Jianchao Wang <jianchao.w.wang at oracle.com>.

Tested-by: Agarwal, Anchal <anchalag at amazon.com>
Signed-off-by: Jens Axboe <axboe at kernel.dk>

Changes when porting to vz7:
- s/wait_queue_entry/__wait_queue/
- s/entry/task_list/
- add wb_acct argument to __wbt_wait

https://jira.sw.ru/browse/PSBM-141883
(cherry picked from commit 38cfb5a45ee013bfab5d1ae4c4738815e744b440)
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 block/blk-wbt.c | 73 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 12 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 8719a820bfb7..49d11e089c97 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -159,7 +159,7 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
 		int diff = limit - inflight;
 
 		if (!inflight || diff >= rwb->wb_background / 2)
-			wake_up(&rqw->wait);
+			wake_up_all(&rqw->wait);
 	}
 }
 
@@ -518,26 +518,76 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	return limit;
 }
 
+struct wbt_wait_data {
+	struct __wait_queue wq;
+	struct task_struct *task;
+	struct rq_wb *rwb;
+	struct rq_wait *rqw;
+	unsigned long rw;
+	bool got_token;
+};
+
+static int wbt_wake_function(struct __wait_queue *curr, unsigned int mode,
+			     int wake_flags, void *key)
+{
+	struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data,
+							wq);
+
+	/*
+	 * If we fail to get a budget, return -1 to interrupt the wake up
+	 * loop in __wake_up_common.
+	 */
+	if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
+		return -1;
+
+	data->got_token = true;
+	list_del_init(&curr->task_list);
+	wake_up_process(data->task);
+	return 1;
+}
+
 /*
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
  */
-static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
+		       unsigned long rw, spinlock_t *lock)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
-	DECLARE_WAITQUEUE(wait, current);
+	struct wbt_wait_data data = {
+		.wq = {
+			.func = wbt_wake_function,
+			.task_list = LIST_HEAD_INIT(data.wq.task_list),
+		},
+		.task = current,
+		.rwb = rwb,
+		.rqw = rqw,
+		.rw = rw,
+	};
 	bool has_sleeper;
 
 	has_sleeper = wq_has_sleeper(&rqw->wait);
 	if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
 		return;
 
-	add_wait_queue_exclusive(&rqw->wait, &wait);
+	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
 	do {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (data.got_token)
+			break;
 
-		if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+		if (!has_sleeper &&
+		    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
+			finish_wait(&rqw->wait, &data.wq);
+
+			/*
+			 * We raced with wbt_wake_function() getting a token,
+			 * which means we now have two. Put our local token
+			 * and wake anyone else potentially waiting for one.
+			 */
+			if (data.got_token)
+				wbt_rqw_done(rwb, rqw, wb_acct);
 			break;
+		}
 
 		if (lock)
 			spin_unlock_irq(lock);
@@ -550,8 +600,7 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 		has_sleeper = false;
 	} while (1);
 
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&rqw->wait, &wait);
+	finish_wait(&rqw->wait, &data.wq);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -595,14 +644,14 @@ unsigned int wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 		return ret;
 	}
 
-	__wbt_wait(rwb, bio->bi_rw, lock);
+	if (current_is_kswapd())
+		ret |= WBT_KSWAPD;
+
+	__wbt_wait(rwb, ret, bio->bi_rw, lock);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
 
-	if (current_is_kswapd())
-		ret |= WBT_KSWAPD;
-
 	return ret | WBT_TRACKED;
 }
 
-- 
2.37.1



More information about the Devel mailing list