[Devel] [PATCH RHEL8 COMMIT] ploop: Split pios from kwork context
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Jul 2 22:48:01 MSK 2021
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.51
------>
commit 72792315a15cec6a69d0f45724295dbc4ba05efa
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date: Fri Jul 2 22:48:01 2021 +0300
ploop: Split pios from kwork context
...instead of doing this in ploop_clone_and_map().
Otherwise, since dm-rq is not marked as BLK_MQ_F_BLOCKING,
ploop_clone_and_map() may be called from atomic context, where
the sleeping allocation in create_bvec_from_rq() is not allowed,
as in the following stack:
[ 325.472261] CT: 7d0ce903-d493-464f-8786-d57bebc77b37: stopped
[ 329.991603] BUG: sleeping function called from invalid context at mm/slab.h:496
[ 329.995398] in_atomic(): 1, irqs_disabled(): 0, pid: 6647, name: jbd2/dm-50135-8
[ 329.999080] 1 lock held by jbd2/dm-50135-8/6647:
[ 330.002290] #0: ffffffffa6269e60 (rcu_read_lock){....}, at: hctx_lock+0x6d/0x180
[ 330.006298] CPU: 2 PID: 6647 Comm: jbd2/dm-50135-8 ve: / Kdump: loaded Tainted: G W --------- - - 4.18.0-240.1.1.vz8.5.50+debug #1 5.50
[ 330.013953] Hardware name: Virtuozzo OpenStack Compute, BIOS 1.11.0-2.vz7.1 04/01/2014
[ 330.018055] Call Trace:
[ 330.021059] dump_stack+0x9a/0xf0
[ 330.024068] ___might_sleep.cold.70+0x13d/0x178
[ 330.027531] slab_pre_alloc_hook+0x6a/0x90
[ 330.030763] __kmalloc+0x5c/0x320
[ 330.033687] ? create_bvec_from_rq+0x1ab/0x9f0 [ploop]
[ 330.037158] create_bvec_from_rq+0x1ab/0x9f0 [ploop]
[ 330.040452] ploop_clone_and_map+0x166/0x5c0 [ploop]
[ 330.044382] dm_mq_queue_rq+0x358/0x1030 [dm_mod]
[ 330.048139] ? dm_softirq_done+0x830/0x830 [dm_mod]
[ 330.051668] ? __sbitmap_queue_get+0xb7/0x230
[ 330.054961] ? sched_clock+0x5/0x10
[ 330.057947] ? __blk_mq_get_driver_tag+0x193/0x730
[ 330.061721] blk_mq_dispatch_rq_list+0x287/0x1f40
[ 330.065562] ? elv_rb_del+0x3b/0x80
[ 330.069061] ? blk_mq_dequeue_from_ctx+0x500/0x500
[ 330.072840] ? dd_dispatch_request+0x20f/0x930
[ 330.076442] blk_mq_do_dispatch_sched+0x2d8/0x4c0
[ 330.080288] ? blk_mq_sched_free_hctx_data+0x1b0/0x1b0
[ 330.084154] ? trace_hardirqs_on+0x10/0x10
[ 330.087570] __blk_mq_sched_dispatch_requests+0x2fd/0x4c0
[ 330.091476] ? blk_mq_sched_restart+0x50/0x50
[ 330.095008] ? sched_clock+0x5/0x10
[ 330.098065] ? sched_clock_cpu+0x18/0x1e0
[ 330.101411] blk_mq_sched_dispatch_requests+0xae/0x100
[ 330.105135] __blk_mq_run_hw_queue+0x169/0x250
[ 330.108330] ? __blk_mq_requeue_request+0x640/0x640
[ 330.111691] ? lock_downgrade+0x6f0/0x6f0
[ 330.114688] ? lock_acquire+0x14f/0x3b0
[ 330.117756] ? hctx_lock+0x6d/0x180
[ 330.120819] __blk_mq_delay_run_hw_queue+0x35c/0x690
[ 330.124265] blk_mq_run_hw_queue+0x140/0x280
[ 330.127789] ? blk_mq_delay_run_hw_queues+0x130/0x130
[ 330.131495] blk_mq_sched_insert_requests+0x1bd/0x4d0
[ 330.134584] ? kvm_sched_clock_read+0x14/0x30
[ 330.137682] blk_mq_flush_plug_list+0x6f8/0xb50
[ 330.141069] ? blk_mq_insert_requests+0x5d0/0x5d0
[ 330.144501] ? kvm_sched_clock_read+0x14/0x30
[ 330.147826] ? sched_clock+0x5/0x10
[ 330.150736] ? sched_clock_cpu+0x18/0x1e0
[ 330.153734] blk_flush_plug_list+0x27a/0x410
[ 330.156834] ? blk_rq_bio_prep+0x370/0x370
[ 330.160029] ? do_raw_read_unlock+0x40/0x70
[ 330.163177] blk_finish_plug+0x47/0x8a
[ 330.166206] jbd2_journal_commit_transaction+0x2fc7/0x67f0 [jbd2]
[ 330.169917] ? journal_submit_commit_record+0xa10/0xa10 [jbd2]
[ 330.174095] ? find_held_lock+0x3a/0x1c0
[ 330.177559] ? kvm_sched_clock_read+0x14/0x30
[ 330.180656] ? sched_clock+0x5/0x10
[ 330.183392] ? find_held_lock+0x3a/0x1c0
[ 330.186306] ? _raw_spin_unlock_irqrestore+0x46/0x60
[ 330.189533] ? trace_hardirqs_on_caller+0x39d/0x580
[ 330.192471] ? del_timer+0x100/0x100
[ 330.195073] kjournald2+0x1df/0x7a0 [jbd2]
[ 330.197545] ? __bpf_trace_jbd2_end_commit+0x10/0x10 [jbd2]
[ 330.200693] ? finish_wait+0x280/0x280
[ 330.203351] ? __kthread_parkme+0xb6/0x180
[ 330.206086] ? __bpf_trace_jbd2_end_commit+0x10/0x10 [jbd2]
[ 330.209207] kthread+0x30e/0x3d0
[ 330.211609] ? kthread_create_fn+0x70/0x70
[ 330.214183] ret_from_fork+0x3a/0x50
https://jira.sw.ru/browse/PSBM-131208
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
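
For reference, here is a minimal, hypothetical sketch (not the ploop code;
all demo_* names are made up) of the deferral pattern the patch applies:
because .clone_and_map_rq() may run under rcu_read_lock() when the queue is
not marked BLK_MQ_F_BLOCKING, the sleeping GFP_KERNEL allocation is moved
out of the map path into a workqueue worker, which is what the new
PLOOP_LIST_PREPARE list and prepare_embedded_pios() do in the patch below.

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/workqueue.h>

	struct demo_dev {
		spinlock_t lock;
		struct list_head prepare_list;	/* pios waiting for preparation */
		struct work_struct worker;	/* set up with INIT_WORK() at init */
		struct workqueue_struct *wq;	/* allocated with alloc_workqueue() */
	};

	struct demo_pio {
		struct list_head list;
		void *bvec;			/* filled in from kwork context */
	};

	/* Called from the atomic (non-blocking) map context: no sleeping here. */
	static void demo_map(struct demo_dev *d, struct demo_pio *pio)
	{
		unsigned long flags;

		spin_lock_irqsave(&d->lock, flags);
		list_add_tail(&pio->list, &d->prepare_list);
		spin_unlock_irqrestore(&d->lock, flags);

		queue_work(d->wq, &d->worker);
	}

	/* Runs in process context: a sleeping GFP_KERNEL allocation is fine. */
	static void demo_work(struct work_struct *ws)
	{
		struct demo_dev *d = container_of(ws, struct demo_dev, worker);
		struct demo_pio *pio, *tmp;
		LIST_HEAD(list);

		spin_lock_irq(&d->lock);
		list_splice_init(&d->prepare_list, &list);
		spin_unlock_irq(&d->lock);

		list_for_each_entry_safe(pio, tmp, &list, list) {
			list_del(&pio->list);
			pio->bvec = kmalloc(4096, GFP_KERNEL); /* may sleep */
			/* ... build the bvec array and submit the pio ... */
		}
	}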
==========================
ploop: Split pios in kwork
https://jira.sw.ru/browse/PSBM-131208
Kirill Tkhai (10):
ploop: Remove debug noinline in create_bvec_from_rq()
ploop: Manage flush pios in generic way
ploop: Teach dispatch_pios() work with flush pios
ploop: Make split_pios_to_list() to add initial pio to the list too
ploop: Introduce embedded_pio_to_prq()
ploop: Introduce ploop_prq_valid()
ploop: Move create_bvec_from_rq() up
ploop: Split pios from kwork context
ploop: Add sanity check of passed BAT from disk
ploop: Reread file size after index update
---
drivers/md/dm-ploop-cmd.c | 2 +-
drivers/md/dm-ploop-map.c | 157 +++++++++++++++++++++++-----------------------
drivers/md/dm-ploop.h | 5 +-
3 files changed, 84 insertions(+), 80 deletions(-)
diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index 5104bdff10f8..87517ed7ef1a 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -139,7 +139,7 @@ static void ploop_resume_submitting_pios(struct ploop *ploop)
list_splice_tail_init(&ploop->suspended_pios, &list);
spin_unlock_irq(&ploop->deferred_lock);
- submit_pios(ploop, &list);
+ dispatch_pios(ploop, NULL, &list);
}
/* Find existing BAT clu pointing to dst_clu */
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 7f467db8d6b1..ca7f841cb8b8 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -1559,6 +1559,59 @@ static struct bio_vec *create_bvec_from_rq(struct request *rq)
return bvec;
}
+static void prepare_one_embedded_pio(struct ploop *ploop, struct pio *pio,
+ struct list_head *deferred_pios)
+{
+ struct ploop_rq *prq = embedded_pio_to_prq(pio);
+ struct request *rq = prq->rq;
+ struct bio_vec *bvec = NULL;
+ LIST_HEAD(list);
+ int ret;
+
+ if (rq->bio != rq->biotail) {
+ if (req_op(rq) == REQ_OP_DISCARD)
+ goto skip_bvec;
+ /*
+ * Transform a set of bvec arrays related to bios
+ * into a single bvec array (which we can iterate).
+ */
+ bvec = create_bvec_from_rq(rq);
+ if (!bvec)
+ goto err_nomem;
+ prq->bvec = bvec;
+skip_bvec:
+ pio->bi_iter.bi_sector = blk_rq_pos(rq);
+ pio->bi_iter.bi_size = blk_rq_bytes(rq);
+ pio->bi_iter.bi_idx = 0;
+ pio->bi_iter.bi_bvec_done = 0;
+ } else {
+ /* Single bio already provides bvec array */
+ bvec = rq->bio->bi_io_vec;
+
+ pio->bi_iter = rq->bio->bi_iter;
+ }
+ pio->bi_io_vec = bvec;
+
+ pio->queue_list_id = PLOOP_LIST_DEFERRED;
+ ret = split_pio_to_list(ploop, pio, deferred_pios);
+ if (ret)
+ goto err_nomem;
+
+ return;
+err_nomem:
+ pio->bi_status = BLK_STS_IOERR;
+ pio_endio(pio);
+}
+
+static void prepare_embedded_pios(struct ploop *ploop, struct list_head *pios,
+ struct list_head *deferred_pios)
+{
+ struct pio *pio;
+
+ while ((pio = pio_list_pop(pios)) != NULL)
+ prepare_one_embedded_pio(ploop, pio, deferred_pios);
+}
+
static void process_deferred_pios(struct ploop *ploop, struct list_head *pios)
{
struct pio *pio;
@@ -1662,6 +1715,7 @@ static void submit_metadata_writeback(struct ploop *ploop)
void do_ploop_work(struct work_struct *ws)
{
struct ploop *ploop = container_of(ws, struct ploop, worker);
+ LIST_HEAD(embedded_pios);
LIST_HEAD(deferred_pios);
LIST_HEAD(discard_pios);
LIST_HEAD(cow_pios);
@@ -1671,12 +1725,15 @@ void do_ploop_work(struct work_struct *ws)
current->flags |= PF_IO_THREAD;
spin_lock_irq(&ploop->deferred_lock);
- list_splice_init(&ploop->resubmit_pios, &resubmit_pios);
+ list_splice_init(&ploop->pios[PLOOP_LIST_PREPARE], &embedded_pios);
list_splice_init(&ploop->pios[PLOOP_LIST_DEFERRED], &deferred_pios);
list_splice_init(&ploop->pios[PLOOP_LIST_DISCARD], &discard_pios);
list_splice_init(&ploop->pios[PLOOP_LIST_COW], &cow_pios);
+ list_splice_init(&ploop->resubmit_pios, &resubmit_pios);
spin_unlock_irq(&ploop->deferred_lock);
+ prepare_embedded_pios(ploop, &embedded_pios, &deferred_pios);
+
process_resubmit_pios(ploop, &resubmit_pios);
process_deferred_pios(ploop, &deferred_pios);
process_discard_pios(ploop, &discard_pios);
@@ -1715,107 +1772,53 @@ static void init_prq(struct ploop_rq *prq, struct request *rq)
prq->bvec = NULL;
}
-static void submit_pio(struct ploop *ploop, struct pio *pio)
+int ploop_clone_and_map(struct dm_target *ti, struct request *rq,
+ union map_info *info, struct request **clone)
{
- struct list_head *queue_list;
+ struct ploop *ploop = ti->private;
struct work_struct *worker;
+ struct ploop_rq *prq;
unsigned long flags;
bool queue = true;
- LIST_HEAD(list);
- int ret;
+ struct pio *pio;
- if (pio->bi_iter.bi_size) {
- queue_list = &ploop->pios[PLOOP_LIST_DEFERRED];
- worker = &ploop->worker;
+ prq = map_info_to_embedded_prq(info);
+ init_prq(prq, rq);
- ret = split_pio_to_list(ploop, pio, &list);
- if (ret) {
- pio->bi_status = BLK_STS_RESOURCE;
- goto endio;
- }
+ pio = map_info_to_embedded_pio(info);
+ init_pio(ploop, req_op(rq), pio);
+ pio->endio_cb = prq_endio;
+ pio->endio_cb_data = prq;
+
+ if (blk_rq_bytes(rq)) {
+ if (ploop_prq_valid(ploop, prq) < 0)
+ return DM_MAPIO_KILL;
+
+ pio->queue_list_id = PLOOP_LIST_PREPARE;
+ worker = &ploop->worker;
} else {
- queue_list = &ploop->pios[PLOOP_LIST_FLUSH];
+ pio->queue_list_id = PLOOP_LIST_FLUSH;
worker = &ploop->fsync_worker;
if (WARN_ON_ONCE(pio->bi_op != REQ_OP_FLUSH))
- goto kill;
- list_add_tail(&pio->list, &list);
+ return DM_MAPIO_KILL;
}
spin_lock_irqsave(&ploop->deferred_lock, flags);
if (unlikely(ploop->stop_submitting_pios)) {
- list_splice_tail(&list, &ploop->suspended_pios);
+ list_add_tail(&pio->list, &ploop->suspended_pios);
queue = false;
goto unlock;
}
inc_nr_inflight(ploop, pio);
- list_splice_tail(&list, queue_list);
+ list_add_tail(&pio->list, &ploop->pios[pio->queue_list_id]);
unlock:
spin_unlock_irqrestore(&ploop->deferred_lock, flags);
if (queue)
queue_work(ploop->wq, worker);
- return;
-kill:
- pio->bi_status = BLK_STS_IOERR;
-endio:
- pio_endio(pio);
-}
-
-void submit_pios(struct ploop *ploop, struct list_head *list)
-{
- struct pio *pio;
-
- while ((pio = pio_list_pop(list)) != NULL)
- submit_pio(ploop, pio);
-}
-
-int ploop_clone_and_map(struct dm_target *ti, struct request *rq,
- union map_info *info, struct request **clone)
-{
- struct ploop *ploop = ti->private;
- struct bio_vec *bvec = NULL;
- struct ploop_rq *prq;
- struct pio *pio;
-
- prq = map_info_to_embedded_prq(info);
- init_prq(prq, rq);
-
- if (ploop_prq_valid(ploop, prq) < 0)
- return DM_MAPIO_KILL;
-
- pio = map_info_to_embedded_pio(info); /* Embedded pio */
- init_pio(ploop, req_op(rq), pio);
-
- if (rq->bio != rq->biotail) {
- if (req_op(rq) == REQ_OP_DISCARD)
- goto skip_bvec;
- /*
- * Transform a set of bvec arrays related to bios
- * into a single bvec array (which we can iterate).
- */
- bvec = create_bvec_from_rq(rq);
- if (!bvec)
- return DM_MAPIO_KILL;
- prq->bvec = bvec;
-skip_bvec:
- pio->bi_iter.bi_sector = blk_rq_pos(rq);
- pio->bi_iter.bi_size = blk_rq_bytes(rq);
- pio->bi_iter.bi_idx = 0;
- pio->bi_iter.bi_bvec_done = 0;
- } else if (rq->bio) {
- /* Single bio already provides bvec array */
- bvec = rq->bio->bi_io_vec;
-
- pio->bi_iter = rq->bio->bi_iter;
- } /* else FLUSH */
-
- pio->bi_io_vec = bvec;
- pio->endio_cb = prq_endio;
- pio->endio_cb_data = prq;
- submit_pio(ploop, pio);
return DM_MAPIO_SUBMITTED;
}
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 26eab969c389..d26f269e475c 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -123,7 +123,9 @@ struct md_page {
};
enum {
- PLOOP_LIST_DEFERRED = 0,
+ PLOOP_LIST_PREPARE = 0, /* List for initial preparation and splitting
+ * embedded pios related to prq */
+ PLOOP_LIST_DEFERRED,
PLOOP_LIST_FLUSH,
PLOOP_LIST_DISCARD,
PLOOP_LIST_COW,
@@ -530,7 +532,6 @@ extern bool try_update_bat_entry(struct ploop *ploop, u32 clu,
extern int convert_bat_entries(u32 *bat_entries, u32 count);
extern int ploop_add_delta(struct ploop *ploop, u32 level, struct file *file, bool is_raw);
-extern void submit_pios(struct ploop *ploop, struct list_head *list);
extern void dispatch_pios(struct ploop *ploop, struct pio *pio, struct list_head *pio_list);
extern void do_ploop_work(struct work_struct *ws);
extern void do_ploop_fsync_work(struct work_struct *ws);