[Devel] [PATCH RHEL9 COMMIT] dm-ploop: move from wq to kthread
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Jan 27 16:12:35 MSK 2025
The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.6
------>
commit 8493a7bb51af6c43ab9f4e93e1268ff47abf8b02
Author: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
Date: Fri Jan 24 17:35:44 2025 +0200
dm-ploop: move from wq to kthread
Move to multithreaded model and remove work queue.
https://virtuozzo.atlassian.net/browse/VSTOR-91821
Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
======
Patchset description:
ploop: optimistations and scalling
Ploop processes requests in different threads in parallel
where possible, which results in a significant improvement in
performance and makes further optimizations possible.
Known bugs:
- delayed metadata writeback is not working and is missing error handling
  (a patch in this series disables it until it is fixed)
- fast path is not working (it causes RCU lockups); a patch in this series disables it
Further improvements:
- optimize md pages lookups
Alexander Atanasov (50):
dm-ploop: md_pages map all pages at creation time
dm-ploop: Use READ_ONCE/WRITE_ONCE to access md page data
dm-ploop: fsync after all pios are sent
dm-ploop: move md status to use proper bitops
dm-ploop: convert wait_list and wb_batch_llist to use lockless lists
dm-ploop: convert enospc handling to use lockless lists
dm-ploop: convert suspended_pios list to use lockless list
dm-ploop: convert the rest of the lists to use llist variant
dm-ploop: combine processing of pios thru prepare list and remove
fsync worker
dm-ploop: move from wq to kthread
dm-ploop: move preparations of pios into the caller from worker
dm-ploop: fast path execution for reads
dm-ploop: do not use a wrapper for set_bit to make a page writeback
dm-ploop: BAT use only one list for writeback
dm-ploop: make md writeback timeout to be per page
dm-ploop: add interface to disable bat writeback delay
dm-ploop: convert wb_batch_list to lockless variant
dm-ploop: convert high_prio to status
dm-ploop: split cow processing into two functions
dm-ploop: convert md page rw lock to spin lock
dm-ploop: convert bat_rwlock to bat_lock spinlock
dm-ploop: prepare bat updates under bat_lock
dm-ploop: make ploop_bat_write_complete ready for parallel pio
completion
dm-ploop: make ploop_submit_metadata_writeback return number of
requests sent
dm-ploop: introduce pio runner threads
dm-ploop: add pio list ids to be used when passing pios to runners
dm-ploop: process pios via runners
dm-ploop: disable metadata writeback delay
dm-ploop: disable fast path
dm-ploop: use lockless lists for chained cow updates list
dm-ploop: use lockless lists for data ready pios
dm-ploop: give runner threads better name
dm-ploop: resize operation - add holes bitmap locking
dm-ploop: remove unnecessary operations
dm-ploop: use filp per thread
dm-ploop: catch if we try to advance pio past bio end
dm-ploop: support REQ_FUA for data pios
dm-ploop: proplerly access nr_bat_entries
dm-ploop: fix locking and improve error handling when submitting pios
dm-ploop: fix how ENOTBLK is handled
dm-ploop: sync when suspended or stopping
dm-ploop: rework bat completion logic
dm-ploop: rework logic in pio processing
dm-ploop: end fsync pios in parallel
dm-ploop: make filespace preallocations async
dm-ploop: resubmit enospc pios from dispatcher thread
dm-ploop: dm-ploop: simplify discard completion
dm-ploop: use GFP_ATOMIC instead of GFP_NOIO
dm-ploop: fix locks used in mixed context
dm-ploop: fix how current flags are managed inside threads
Andrey Zhadchenko (13):
dm-ploop: do not flush after metadata writes
dm-ploop: set IOCB_DSYNC on all FUA requests
dm-ploop: remove extra ploop_cluster_is_in_top_delta()
dm-ploop: introduce per-md page locking
dm-ploop: reduce BAT accesses on discard completion
dm-ploop: simplify llseek
dm-ploop: speed up ploop_prepare_bat_update()
dm-ploop: make new allocations immediately visible in BAT
dm-ploop: drop ploop_cluster_is_in_top_delta()
dm-ploop: do not wait for BAT update for non-FUA requests
dm-ploop: add delay for metadata writeback
dm-ploop: submit all postponed metadata on REQ_OP_FLUSH
dm-ploop: handle REQ_PREFLUSH
Feature: dm-ploop: ploop target driver
---
drivers/md/dm-ploop-map.c | 54 +++++++++++++++++++++++++++++++++-----
drivers/md/dm-ploop-target.c | 62 +++++++++++++++++++++++++++++++++++---------
drivers/md/dm-ploop.h | 11 +++++++-
3 files changed, 107 insertions(+), 20 deletions(-)
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 93def46f15b4..76ef88d563a3 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -340,6 +340,11 @@ static int ploop_split_pio_to_list(struct ploop *ploop, struct pio *pio,
}
ALLOW_ERROR_INJECTION(ploop_split_pio_to_list, ERRNO);
+static void ploop_schedule_work(struct ploop *ploop)
+{
+ wake_up_process(ploop->kt_worker->task);
+}
+
static void ploop_dispatch_pio(struct ploop *ploop, struct pio *pio)
{
struct llist_head *list = (struct llist_head *)&ploop->pios[pio->queue_list_id];
@@ -362,7 +367,7 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio *pio,
ploop_dispatch_pio(ploop, pio);
}
- queue_work(ploop->wq, &ploop->worker);
+ ploop_schedule_work(ploop);
}
static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md,
@@ -694,7 +699,7 @@ static void ploop_complete_cow(struct ploop_cow *cow, blk_status_t bi_status)
ploop_queue_or_fail(ploop, blk_status_to_errno(bi_status), cow_pio);
- queue_work(ploop->wq, &ploop->worker);
+ ploop_schedule_work(ploop);
ploop_free_pio_with_pages(ploop, cow->aux_pio);
kmem_cache_free(cow_cache, cow);
}
@@ -1147,7 +1152,7 @@ static void ploop_queue_resubmit(struct pio *pio)
llist_add((struct llist_node *)(&pio->list), &ploop->llresubmit_pios);
- queue_work(ploop->wq, &ploop->worker);
+ ploop_schedule_work(ploop);
}
static void ploop_check_standby_mode(struct ploop *ploop, long res)
@@ -1822,9 +1827,8 @@ static void process_ploop_fsync_work(struct ploop *ploop)
}
}
-void do_ploop_work(struct work_struct *ws)
+void do_ploop_run_work(struct ploop *ploop)
{
- struct ploop *ploop = container_of(ws, struct ploop, worker);
LIST_HEAD(deferred_pios);
struct llist_node *llembedded_pios;
struct llist_node *lldeferred_pios;
@@ -1836,12 +1840,13 @@ void do_ploop_work(struct work_struct *ws)
current->flags |= PF_IO_THREAD|PF_LOCAL_THROTTLE|PF_MEMALLOC_NOIO;
llembedded_pios = llist_del_all(&ploop->pios[PLOOP_LIST_PREPARE]);
+
lldeferred_pios = llist_del_all(&ploop->pios[PLOOP_LIST_DEFERRED]);
lldiscard_pios = llist_del_all(&ploop->pios[PLOOP_LIST_DISCARD]);
llcow_pios = llist_del_all(&ploop->pios[PLOOP_LIST_COW]);
llresubmit = llist_del_all(&ploop->llresubmit_pios);
- /* add old deferred to the list */
+ /* add old deferred back to the list */
if (lldeferred_pios) {
struct llist_node *pos, *t;
struct pio *pio;
@@ -1873,6 +1878,41 @@ void do_ploop_work(struct work_struct *ws)
current->flags = old_flags;
}
+void do_ploop_work(struct work_struct *ws)
+{
+ struct ploop *ploop = container_of(ws, struct ploop, worker);
+
+ do_ploop_run_work(ploop);
+}
+
+int ploop_worker(void *data)
+{
+ struct ploop_worker *worker = data;
+ struct ploop *ploop = worker->ploop;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+ if (llist_empty(&ploop->pios[PLOOP_LIST_FLUSH]) &&
+ llist_empty(&ploop->pios[PLOOP_LIST_PREPARE]) &&
+ llist_empty(&ploop->pios[PLOOP_LIST_DEFERRED]) &&
+ llist_empty(&ploop->pios[PLOOP_LIST_DISCARD]) &&
+ llist_empty(&ploop->pios[PLOOP_LIST_COW]) &&
+ llist_empty(&ploop->llresubmit_pios)
+ )
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+ do_ploop_run_work(ploop);
+ cond_resched();
+ }
+ return 0;
+}
+
static void ploop_submit_embedded_pio(struct ploop *ploop, struct pio *pio)
{
struct ploop_rq *prq = pio->endio_cb_data;
@@ -1893,7 +1933,7 @@ static void ploop_submit_embedded_pio(struct ploop *ploop, struct pio *pio)
ploop_inc_nr_inflight(ploop, pio);
llist_add((struct llist_node *)(&pio->list), &ploop->pios[PLOOP_LIST_PREPARE]);
- queue_work(ploop->wq, &ploop->worker);
+ ploop_schedule_work(ploop);
}
void ploop_submit_embedded_pios(struct ploop *ploop, struct list_head *list)
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index ea9af6b6abe9..589082c8e110 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -161,11 +161,24 @@ static void ploop_destroy(struct ploop *ploop)
{
int i;
- if (ploop->wq) {
- flush_workqueue(ploop->wq);
- destroy_workqueue(ploop->wq);
- WARN_ON_ONCE(ploop_has_pending_activity(ploop));
+ if (ploop->kt_worker) {
+ wake_up_process(ploop->kt_worker->task);
+ /* try to send all pending - if we have partial io and enospc end bellow */
+ while (!llist_empty(&ploop->pios[PLOOP_LIST_FLUSH]) ||
+ !llist_empty(&ploop->pios[PLOOP_LIST_PREPARE]) ||
+ !llist_empty(&ploop->pios[PLOOP_LIST_DEFERRED]) ||
+ !llist_empty(&ploop->pios[PLOOP_LIST_DISCARD]) ||
+ !llist_empty(&ploop->pios[PLOOP_LIST_COW])
+ ) {
+ schedule();
+ }
+
+ kthread_stop(ploop->kt_worker->task); /* waits for the thread to stop */
+ WARN_ON(!llist_empty(&ploop->pios[PLOOP_LIST_PREPARE]));
+ WARN_ON(!llist_empty(&ploop->llresubmit_pios));
+ kfree(ploop->kt_worker);
}
+
for (i = 0; i < 2; i++)
percpu_ref_exit(&ploop->inflight_bios_ref[i]);
/* Nobody uses it after destroy_workqueue() */
@@ -173,6 +186,7 @@ static void ploop_destroy(struct ploop *ploop)
if (ploop->deltas[ploop->nr_deltas].file)
fput(ploop->deltas[ploop->nr_deltas].file);
}
+ WARN_ON(ploop_has_pending_activity(ploop));
WARN_ON(!ploop_empty_htable(ploop->exclusive_pios));
WARN_ON(!ploop_empty_htable(ploop->inflight_pios));
kfree(ploop->inflight_pios);
@@ -330,6 +344,33 @@ ALLOW_ERROR_INJECTION(ploop_add_deltas_stack, ERRNO);
argc--; \
argv++; \
} while (0);
+
+static struct ploop_worker *ploop_worker_create(struct ploop *ploop)
+{
+ struct ploop_worker *worker;
+ struct task_struct *task;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
+ if (!worker)
+ return NULL;
+
+ worker->ploop = ploop;
+ task = kthread_create(ploop_worker, worker, "ploop-%d-0",
+ current->pid);
+
+ if (IS_ERR(task))
+ goto out_err;
+ worker->task = task;
+
+ wake_up_process(task);
+
+ return worker;
+
+out_err:
+ kfree(worker);
+ return NULL;
+}
+
/*
* <data dev>
*/
@@ -337,7 +378,6 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
percpu_ref_func_t *release;
struct ploop *ploop;
- unsigned int flags;
int i, ret;
if (argc < 2)
@@ -397,13 +437,6 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
}
- flags = WQ_MEM_RECLAIM|WQ_HIGHPRI|WQ_UNBOUND;
- ploop->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, flags, 0);
- if (!ploop->wq) {
- ret = -ENOMEM;
- goto err;
- }
-
ti->private = ploop;
ploop->ti = ti;
@@ -474,6 +507,11 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (argc <= 0)
goto err;
+
+ ploop->kt_worker = ploop_worker_create(ploop);
+ if (!ploop->kt_worker)
+ goto err;
+
ret = ploop_add_deltas_stack(ploop, &argv[0], argc);
if (ret)
goto err;
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 1ba91cbc4f04..64d8eac4ef84 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -138,6 +138,12 @@ enum {
PLOOP_LIST_INVALID = PLOOP_LIST_COUNT,
};
+struct ploop_worker {
+ struct ploop *ploop;
+ struct task_struct *task;
+ u64 kcov_handle;
+};
+
struct ploop {
struct dm_target *ti;
#define PLOOP_PRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */
@@ -179,10 +185,10 @@ struct ploop {
*/
struct hlist_head *exclusive_pios;
- struct workqueue_struct *wq;
struct work_struct worker;
struct work_struct event_work;
+ struct ploop_worker *kt_worker;
struct completion inflight_bios_ref_comp;
struct percpu_ref inflight_bios_ref[2];
bool inflight_ref_comp_pending;
@@ -598,4 +604,7 @@ extern void ploop_call_rw_iter(struct file *file, loff_t pos, unsigned rw,
struct iov_iter *iter, struct pio *pio);
extern void ploop_enospc_timer(struct timer_list *timer);
extern loff_t ploop_llseek_hole(struct dm_target *ti, loff_t offset, int whence);
+
+int ploop_worker(void *data);
+
#endif /* __DM_PLOOP_H */
More information about the Devel
mailing list