[Devel] [PATCH RH8 08/10] ploop: Split pios from kwork context

Kirill Tkhai ktkhai at virtuozzo.com
Wed Jun 30 19:46:52 MSK 2021


...instead of doing this in ploop_clone_and_map().
Otherwise, since dm-rq is not marked as BLK_MQ_F_BLOCKING,
we may call ploop_clone_and_map() from a stack such as the following:

[  325.472261] CT: 7d0ce903-d493-464f-8786-d57bebc77b37: stopped
[  329.991603] BUG: sleeping function called from invalid context at mm/slab.h:496
[  329.995398] in_atomic(): 1, irqs_disabled(): 0, pid: 6647, name: jbd2/dm-50135-8
[  329.999080] 1 lock held by jbd2/dm-50135-8/6647:
[  330.002290]  #0: ffffffffa6269e60 (rcu_read_lock){....}, at: hctx_lock+0x6d/0x180
[  330.006298] CPU: 2 PID: 6647 Comm: jbd2/dm-50135-8 ve: / Kdump: loaded Tainted: G        W        --------- -  - 4.18.0-240.1.1.vz8.5.50+debug #1 5.50
[  330.013953] Hardware name: Virtuozzo OpenStack Compute, BIOS 1.11.0-2.vz7.1 04/01/2014
[  330.018055] Call Trace:
[  330.021059]  dump_stack+0x9a/0xf0
[  330.024068]  ___might_sleep.cold.70+0x13d/0x178
[  330.027531]  slab_pre_alloc_hook+0x6a/0x90
[  330.030763]  __kmalloc+0x5c/0x320
[  330.033687]  ? create_bvec_from_rq+0x1ab/0x9f0 [ploop]
[  330.037158]  create_bvec_from_rq+0x1ab/0x9f0 [ploop]
[  330.040452]  ploop_clone_and_map+0x166/0x5c0 [ploop]
[  330.044382]  dm_mq_queue_rq+0x358/0x1030 [dm_mod]
[  330.048139]  ? dm_softirq_done+0x830/0x830 [dm_mod]
[  330.051668]  ? __sbitmap_queue_get+0xb7/0x230
[  330.054961]  ? sched_clock+0x5/0x10
[  330.057947]  ? __blk_mq_get_driver_tag+0x193/0x730
[  330.061721]  blk_mq_dispatch_rq_list+0x287/0x1f40
[  330.065562]  ? elv_rb_del+0x3b/0x80
[  330.069061]  ? blk_mq_dequeue_from_ctx+0x500/0x500
[  330.072840]  ? dd_dispatch_request+0x20f/0x930
[  330.076442]  blk_mq_do_dispatch_sched+0x2d8/0x4c0
[  330.080288]  ? blk_mq_sched_free_hctx_data+0x1b0/0x1b0
[  330.084154]  ? trace_hardirqs_on+0x10/0x10
[  330.087570]  __blk_mq_sched_dispatch_requests+0x2fd/0x4c0
[  330.091476]  ? blk_mq_sched_restart+0x50/0x50
[  330.095008]  ? sched_clock+0x5/0x10
[  330.098065]  ? sched_clock_cpu+0x18/0x1e0
[  330.101411]  blk_mq_sched_dispatch_requests+0xae/0x100
[  330.105135]  __blk_mq_run_hw_queue+0x169/0x250
[  330.108330]  ? __blk_mq_requeue_request+0x640/0x640
[  330.111691]  ? lock_downgrade+0x6f0/0x6f0
[  330.114688]  ? lock_acquire+0x14f/0x3b0
[  330.117756]  ? hctx_lock+0x6d/0x180
[  330.120819]  __blk_mq_delay_run_hw_queue+0x35c/0x690
[  330.124265]  blk_mq_run_hw_queue+0x140/0x280
[  330.127789]  ? blk_mq_delay_run_hw_queues+0x130/0x130
[  330.131495]  blk_mq_sched_insert_requests+0x1bd/0x4d0
[  330.134584]  ? kvm_sched_clock_read+0x14/0x30
[  330.137682]  blk_mq_flush_plug_list+0x6f8/0xb50
[  330.141069]  ? blk_mq_insert_requests+0x5d0/0x5d0
[  330.144501]  ? kvm_sched_clock_read+0x14/0x30
[  330.147826]  ? sched_clock+0x5/0x10
[  330.150736]  ? sched_clock_cpu+0x18/0x1e0
[  330.153734]  blk_flush_plug_list+0x27a/0x410
[  330.156834]  ? blk_rq_bio_prep+0x370/0x370
[  330.160029]  ? do_raw_read_unlock+0x40/0x70
[  330.163177]  blk_finish_plug+0x47/0x8a
[  330.166206]  jbd2_journal_commit_transaction+0x2fc7/0x67f0 [jbd2]
[  330.169917]  ? journal_submit_commit_record+0xa10/0xa10 [jbd2]
[  330.174095]  ? find_held_lock+0x3a/0x1c0
[  330.177559]  ? kvm_sched_clock_read+0x14/0x30
[  330.180656]  ? sched_clock+0x5/0x10
[  330.183392]  ? find_held_lock+0x3a/0x1c0
[  330.186306]  ? _raw_spin_unlock_irqrestore+0x46/0x60
[  330.189533]  ? trace_hardirqs_on_caller+0x39d/0x580
[  330.192471]  ? del_timer+0x100/0x100
[  330.195073]  kjournald2+0x1df/0x7a0 [jbd2]
[  330.197545]  ? __bpf_trace_jbd2_end_commit+0x10/0x10 [jbd2]
[  330.200693]  ? finish_wait+0x280/0x280
[  330.203351]  ? __kthread_parkme+0xb6/0x180
[  330.206086]  ? __bpf_trace_jbd2_end_commit+0x10/0x10 [jbd2]
[  330.209207]  kthread+0x30e/0x3d0
[  330.211609]  ? kthread_create_fn+0x70/0x70
[  330.214183]  ret_from_fork+0x3a/0x50

https://jira.sw.ru/browse/PSBM-131208
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 drivers/md/dm-ploop-cmd.c |    2 -
 drivers/md/dm-ploop-map.c |  157 +++++++++++++++++++++++----------------------
 drivers/md/dm-ploop.h     |    5 +
 3 files changed, 84 insertions(+), 80 deletions(-)

diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index 5104bdff10f8..87517ed7ef1a 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -139,7 +139,7 @@ static void ploop_resume_submitting_pios(struct ploop *ploop)
 	list_splice_tail_init(&ploop->suspended_pios, &list);
 	spin_unlock_irq(&ploop->deferred_lock);
 
-	submit_pios(ploop, &list);
+	dispatch_pios(ploop, NULL, &list);
 }
 
 /* Find existing BAT clu pointing to dst_clu */
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 7f467db8d6b1..ca7f841cb8b8 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -1559,6 +1559,59 @@ static struct bio_vec *create_bvec_from_rq(struct request *rq)
 	return bvec;
 }
 
+static void prepare_one_embedded_pio(struct ploop *ploop, struct pio *pio,
+				     struct list_head *deferred_pios)
+{
+	struct ploop_rq *prq = embedded_pio_to_prq(pio);
+	struct request *rq = prq->rq;
+	struct bio_vec *bvec = NULL;
+	LIST_HEAD(list);
+	int ret;
+
+	if (rq->bio != rq->biotail) {
+		if (req_op(rq) == REQ_OP_DISCARD)
+			goto skip_bvec;
+		/*
+		 * Transform a set of bvec arrays related to bios
+		 * into a single bvec array (which we can iterate).
+		 */
+		bvec = create_bvec_from_rq(rq);
+		if (!bvec)
+			goto err_nomem;
+		prq->bvec = bvec;
+skip_bvec:
+		pio->bi_iter.bi_sector = blk_rq_pos(rq);
+		pio->bi_iter.bi_size = blk_rq_bytes(rq);
+		pio->bi_iter.bi_idx = 0;
+		pio->bi_iter.bi_bvec_done = 0;
+	} else {
+		/* Single bio already provides bvec array */
+		bvec = rq->bio->bi_io_vec;
+
+		pio->bi_iter = rq->bio->bi_iter;
+	}
+	pio->bi_io_vec = bvec;
+
+	pio->queue_list_id = PLOOP_LIST_DEFERRED;
+	ret = split_pio_to_list(ploop, pio, deferred_pios);
+	if (ret)
+		goto err_nomem;
+
+	return;
+err_nomem:
+	pio->bi_status = BLK_STS_IOERR;
+	pio_endio(pio);
+}
+
+static void prepare_embedded_pios(struct ploop *ploop, struct list_head *pios,
+				  struct list_head *deferred_pios)
+{
+	struct pio *pio;
+
+	while ((pio = pio_list_pop(pios)) != NULL)
+		prepare_one_embedded_pio(ploop, pio, deferred_pios);
+}
+
 static void process_deferred_pios(struct ploop *ploop, struct list_head *pios)
 {
 	struct pio *pio;
@@ -1662,6 +1715,7 @@ static void submit_metadata_writeback(struct ploop *ploop)
 void do_ploop_work(struct work_struct *ws)
 {
 	struct ploop *ploop = container_of(ws, struct ploop, worker);
+	LIST_HEAD(embedded_pios);
 	LIST_HEAD(deferred_pios);
 	LIST_HEAD(discard_pios);
 	LIST_HEAD(cow_pios);
@@ -1671,12 +1725,15 @@ void do_ploop_work(struct work_struct *ws)
 	current->flags |= PF_IO_THREAD;
 
 	spin_lock_irq(&ploop->deferred_lock);
-	list_splice_init(&ploop->resubmit_pios, &resubmit_pios);
+	list_splice_init(&ploop->pios[PLOOP_LIST_PREPARE], &embedded_pios);
 	list_splice_init(&ploop->pios[PLOOP_LIST_DEFERRED], &deferred_pios);
 	list_splice_init(&ploop->pios[PLOOP_LIST_DISCARD], &discard_pios);
 	list_splice_init(&ploop->pios[PLOOP_LIST_COW], &cow_pios);
+	list_splice_init(&ploop->resubmit_pios, &resubmit_pios);
 	spin_unlock_irq(&ploop->deferred_lock);
 
+	prepare_embedded_pios(ploop, &embedded_pios, &deferred_pios);
+
 	process_resubmit_pios(ploop, &resubmit_pios);
 	process_deferred_pios(ploop, &deferred_pios);
 	process_discard_pios(ploop, &discard_pios);
@@ -1715,107 +1772,53 @@ static void init_prq(struct ploop_rq *prq, struct request *rq)
 	prq->bvec = NULL;
 }
 
-static void submit_pio(struct ploop *ploop, struct pio *pio)
+int ploop_clone_and_map(struct dm_target *ti, struct request *rq,
+		    union map_info *info, struct request **clone)
 {
-	struct list_head *queue_list;
+	struct ploop *ploop = ti->private;
 	struct work_struct *worker;
+	struct ploop_rq *prq;
 	unsigned long flags;
 	bool queue = true;
-	LIST_HEAD(list);
-	int ret;
+	struct pio *pio;
 
-	if (pio->bi_iter.bi_size) {
-		queue_list = &ploop->pios[PLOOP_LIST_DEFERRED];
-		worker = &ploop->worker;
+	prq = map_info_to_embedded_prq(info);
+	init_prq(prq, rq);
 
-		ret = split_pio_to_list(ploop, pio, &list);
-		if (ret) {
-			pio->bi_status = BLK_STS_RESOURCE;
-			goto endio;
-		}
+	pio = map_info_to_embedded_pio(info);
+	init_pio(ploop, req_op(rq), pio);
+	pio->endio_cb = prq_endio;
+	pio->endio_cb_data = prq;
+
+	if (blk_rq_bytes(rq)) {
+		if (ploop_prq_valid(ploop, prq) < 0)
+			return DM_MAPIO_KILL;
+
+		pio->queue_list_id = PLOOP_LIST_PREPARE;
+		worker = &ploop->worker;
 	} else {
-		queue_list = &ploop->pios[PLOOP_LIST_FLUSH];
+		pio->queue_list_id = PLOOP_LIST_FLUSH;
 		worker = &ploop->fsync_worker;
 
 		if (WARN_ON_ONCE(pio->bi_op != REQ_OP_FLUSH))
-			goto kill;
-		list_add_tail(&pio->list, &list);
+			return DM_MAPIO_KILL;
 	}
 
 	spin_lock_irqsave(&ploop->deferred_lock, flags);
 	if (unlikely(ploop->stop_submitting_pios)) {
-		list_splice_tail(&list, &ploop->suspended_pios);
+		list_add_tail(&pio->list, &ploop->suspended_pios);
 		queue = false;
 		goto unlock;
 	}
 
 	inc_nr_inflight(ploop, pio);
-	list_splice_tail(&list, queue_list);
+	list_add_tail(&pio->list, &ploop->pios[pio->queue_list_id]);
 unlock:
 	spin_unlock_irqrestore(&ploop->deferred_lock, flags);
 
 	if (queue)
 		queue_work(ploop->wq, worker);
-	return;
-kill:
-	pio->bi_status = BLK_STS_IOERR;
-endio:
-	pio_endio(pio);
-}
-
-void submit_pios(struct ploop *ploop, struct list_head *list)
-{
-        struct pio *pio;
-
-        while ((pio = pio_list_pop(list)) != NULL)
-                submit_pio(ploop, pio);
-}
-
-int ploop_clone_and_map(struct dm_target *ti, struct request *rq,
-		    union map_info *info, struct request **clone)
-{
-	struct ploop *ploop = ti->private;
-	struct bio_vec *bvec = NULL;
-	struct ploop_rq *prq;
-	struct pio *pio;
-
-	prq = map_info_to_embedded_prq(info);
-	init_prq(prq, rq);
-
-	if (ploop_prq_valid(ploop, prq) < 0)
-		return DM_MAPIO_KILL;
-
-	pio = map_info_to_embedded_pio(info); /* Embedded pio */
-	init_pio(ploop, req_op(rq), pio);
-
-	if (rq->bio != rq->biotail) {
-		if (req_op(rq) == REQ_OP_DISCARD)
-			goto skip_bvec;
-		/*
-		 * Transform a set of bvec arrays related to bios
-		 * into a single bvec array (which we can iterate).
-		 */
-		bvec = create_bvec_from_rq(rq);
-		if (!bvec)
-			return DM_MAPIO_KILL;
-		prq->bvec = bvec;
-skip_bvec:
-		pio->bi_iter.bi_sector = blk_rq_pos(rq);
-		pio->bi_iter.bi_size = blk_rq_bytes(rq);
-		pio->bi_iter.bi_idx = 0;
-		pio->bi_iter.bi_bvec_done = 0;
-        } else if (rq->bio) {
-                /* Single bio already provides bvec array */
-		bvec = rq->bio->bi_io_vec;
-
-		pio->bi_iter = rq->bio->bi_iter;
-        } /* else FLUSH */
-
-        pio->bi_io_vec = bvec;
-        pio->endio_cb = prq_endio;
-        pio->endio_cb_data = prq;
 
-	submit_pio(ploop, pio);
 	return DM_MAPIO_SUBMITTED;
 }
 
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 26eab969c389..d26f269e475c 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -123,7 +123,9 @@ struct md_page {
 };
 
 enum {
-	PLOOP_LIST_DEFERRED = 0,
+	PLOOP_LIST_PREPARE = 0, /* List for initial preparation and splitting
+				 * embedded pios related to prq */
+	PLOOP_LIST_DEFERRED,
 	PLOOP_LIST_FLUSH,
 	PLOOP_LIST_DISCARD,
 	PLOOP_LIST_COW,
@@ -530,7 +532,6 @@ extern bool try_update_bat_entry(struct ploop *ploop, u32 clu,
 extern int convert_bat_entries(u32 *bat_entries, u32 count);
 
 extern int ploop_add_delta(struct ploop *ploop, u32 level, struct file *file, bool is_raw);
-extern void submit_pios(struct ploop *ploop, struct list_head *list);
 extern void dispatch_pios(struct ploop *ploop, struct pio *pio, struct list_head *pio_list);
 extern void do_ploop_work(struct work_struct *ws);
 extern void do_ploop_fsync_work(struct work_struct *ws);




More information about the Devel mailing list