[Devel] [PATCH vz9 v2 25/65] dm-ploop: add delay for metadata writeback

Konstantin Khorenko khorenko at virtuozzo.com
Wed Feb 12 12:33:18 MSK 2025


From: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>

Revert the llist conversion for metadata writeback.
Create a new list for priority metadata updates, which are triggered
by FUA requests.
Write metadata for all other requests in a batch after some delay.
Add a new parameter, "metadata_delay=", to specify the delay time in
milliseconds (default: 10000).
Always submit COW and discard I/O to the prio list.

https://virtuozzo.atlassian.net/browse/VSTOR-91817
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>

======
Patchset description:
ploop: optimisations and scaling

Ploop processes requests in different threads in parallel
where possible, which results in a significant improvement in
performance and makes further optimisations possible.

Known bugs:
  - delayed metadata writeback is not working and is missing error handling
     - patch to disable it until fixed
  - fast path is not working - causes rcu lockups - patch to disable it

Further improvements:
  - optimize md pages lookups

Alexander Atanasov (50):
  dm-ploop: md_pages map all pages at creation time
  dm-ploop: Use READ_ONCE/WRITE_ONCE to access md page data
  dm-ploop: fsync after all pios are sent
  dm-ploop: move md status to use proper bitops
  dm-ploop: convert wait_list and wb_batch_llist to use lockless lists
  dm-ploop: convert enospc handling to use lockless lists
  dm-ploop: convert suspended_pios list to use lockless list
  dm-ploop: convert the rest of the lists to use llist variant
  dm-ploop: combine processing of pios thru prepare list and remove
    fsync worker
  dm-ploop: move from wq to kthread
  dm-ploop: move preparations of pios into the caller from worker
  dm-ploop: fast path execution for reads
  dm-ploop: do not use a wrapper for set_bit to make a page writeback
  dm-ploop: BAT use only one list for writeback
  dm-ploop: make md writeback timeout to be per page
  dm-ploop: add interface to disable bat writeback delay
  dm-ploop: convert wb_batch_list to lockless variant
  dm-ploop: convert high_prio to status
  dm-ploop: split cow processing into two functions
  dm-ploop: convert md page rw lock to spin lock
  dm-ploop: convert bat_rwlock to bat_lock spinlock
  dm-ploop: prepare bat updates under bat_lock
  dm-ploop: make ploop_bat_write_complete ready for parallel pio
    completion
  dm-ploop: make ploop_submit_metadata_writeback return number of
    requests sent
  dm-ploop: introduce pio runner threads
  dm-ploop: add pio list ids to be used when passing pios to runners
  dm-ploop: process pios via runners
  dm-ploop: disable metadata writeback delay
  dm-ploop: disable fast path
  dm-ploop: use lockless lists for chained cow updates list
  dm-ploop: use lockless lists for data ready pios
  dm-ploop: give runner threads better name
  dm-ploop: resize operation - add holes bitmap locking
  dm-ploop: remove unnecessary operations
  dm-ploop: use filp per thread
  dm-ploop: catch if we try to advance pio past bio end
  dm-ploop: support REQ_FUA for data pios
  dm-ploop: proplerly access nr_bat_entries
  dm-ploop: fix locking and improve error handling when submitting pios
  dm-ploop: fix how ENOTBLK is handled
  dm-ploop: sync when suspended or stopping
  dm-ploop: rework bat completion logic
  dm-ploop: rework logic in pio processing
  dm-ploop: end fsync pios in parallel
  dm-ploop: make filespace preallocations async
  dm-ploop: resubmit enospc pios from dispatcher thread
  dm-ploop: dm-ploop: simplify discard completion
  dm-ploop: use GFP_ATOMIC instead of GFP_NOIO
  dm-ploop: fix locks used in mixed context
  dm-ploop: fix how current flags are managed inside threads

Andrey Zhadchenko (13):
  dm-ploop: do not flush after metadata writes
  dm-ploop: set IOCB_DSYNC on all FUA requests
  dm-ploop: remove extra ploop_cluster_is_in_top_delta()
  dm-ploop: introduce per-md page locking
  dm-ploop: reduce BAT accesses on discard completion
  dm-ploop: simplify llseek
  dm-ploop: speed up ploop_prepare_bat_update()
  dm-ploop: make new allocations immediately visible in BAT
  dm-ploop: drop ploop_cluster_is_in_top_delta()
  dm-ploop: do not wait for BAT update for non-FUA requests
  dm-ploop: add delay for metadata writeback
  dm-ploop: submit all postponed metadata on REQ_OP_FLUSH
  dm-ploop: handle REQ_PREFLUSH

Feature: dm-ploop: ploop target driver
---
 drivers/md/dm-ploop-bat.c    |  2 ++
 drivers/md/dm-ploop-map.c    | 66 ++++++++++++++++++++++++++++++------
 drivers/md/dm-ploop-target.c | 13 +++++++
 drivers/md/dm-ploop.h        | 10 +++++-
 4 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/drivers/md/dm-ploop-bat.c b/drivers/md/dm-ploop-bat.c
index 96ba099b1ca4..33069d580e87 100644
--- a/drivers/md/dm-ploop-bat.c
+++ b/drivers/md/dm-ploop-bat.c
@@ -88,7 +88,9 @@ static struct md_page *ploop_alloc_md_page(u32 id)
 	md->page = page;
 	md->kmpage = kmap(page);
 	md->id = id;
+	md->high_prio = false;
 	spin_lock_init(&md->md_lock);
+
 	return md;
 err_page:
 	kfree(levels);
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 46e1afbdad32..88e128bba479 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -553,17 +553,39 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio)
 
 static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
 {
+	unsigned long flags;
 	bool new = false;
 
+	write_lock_irqsave(&ploop->bat_rwlock, flags);
 	WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
 	if (!test_and_set_bit(MD_DIRTY, &md->status)) {
-		llist_add(&md->wb_llink, &ploop->wb_batch_llist);
+		list_add(&md->wb_link, &ploop->wb_batch_list);
 		new = true;
 	}
-
+	write_unlock_irqrestore(&ploop->bat_rwlock, flags);
 	return new;
 }
 
+static void ploop_md_up_prio(struct ploop *ploop, struct md_page *md)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&ploop->bat_rwlock, flags);
+	if (test_bit(MD_WRITEBACK, &md->status))
+		goto out;
+	if (md->high_prio)
+		goto out;
+
+	md->high_prio = true;
+	WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status));
+
+	list_del(&md->wb_link);
+	list_add(&md->wb_link, &ploop->wb_batch_list_prio);
+
+out:
+	write_unlock_irqrestore(&ploop->bat_rwlock, flags);
+}
+
 static bool ploop_pio_endio_if_all_zeros(struct pio *pio)
 {
 	struct bvec_iter bi = {
@@ -1421,6 +1443,8 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
 	WRITE_ONCE(md->bat_levels[clu], ploop_top_level(ploop));
 	spin_unlock_irqrestore(&md->md_lock, flags);
 
+	ploop_md_up_prio(ploop, md);
+
 	/* Prevent double clearing of holes_bitmap bit on complete_cow() */
 	cow->dst_clu = BAT_ENTRY_NONE;
 	spin_lock_irq(&ploop->deferred_lock);
@@ -1518,6 +1542,7 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
 	if (pio->bi_op & REQ_FUA) {
 		piwb->pio->bi_op |= REQ_FUA;
 		ploop_attach_end_action(pio, piwb);
+		ploop_md_up_prio(ploop, md);
 	}
 
 	attached = true;
@@ -1760,6 +1785,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 
 	if (bat_update_prepared)
 		ploop_md_make_dirty(ploop, md);
+	ploop_md_up_prio(ploop, md);
 out:
 	return;
 err:
@@ -1795,25 +1821,45 @@ static void ploop_process_resubmit_pios(struct ploop *ploop,
 	}
 }
 
-static void ploop_submit_metadata_writeback(struct ploop *ploop)
+static void ploop_submit_metadata_writeback_from_list(struct ploop *ploop,
+						      struct list_head *list)
 {
 	struct md_page *md;
-	struct md_page *t;
-	struct llist_node *wbl;
 
-	wbl = llist_del_all(&ploop->wb_batch_llist);
-	if (!wbl)
-		return;
-	wbl = llist_reverse_order(wbl);
-	llist_for_each_entry_safe(md, t, wbl, wb_llink) {
+	while (1) {
+		write_lock_irq(&ploop->bat_rwlock);
+		md = list_first_entry_or_null(list, struct md_page, wb_link);
+		if (!md) {
+			write_unlock_irq(&ploop->bat_rwlock);
+			break;
+		}
+		list_del_init(&md->wb_link);
+		/* L1L2 mustn't be redirtyed, when wb in-flight! */
+
 		WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status));
 		WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
 		set_bit(MD_WRITEBACK, &md->status);
 		clear_bit(MD_DIRTY, &md->status);
+		md->high_prio = false;
+		write_unlock_irq(&ploop->bat_rwlock);
+
 		ploop_index_wb_submit(ploop, md->piwb);
 	}
 }
 
+static void ploop_submit_metadata_writeback(struct ploop *ploop)
+{
+	ktime_t time;
+
+	ploop_submit_metadata_writeback_from_list(ploop, &ploop->wb_batch_list_prio);
+
+	time = ktime_get();
+	if (ktime_after(time, ktime_add_ms(ploop->last_md_submit, ploop->md_submit_delay_ms))) {
+		ploop->last_md_submit = time;
+		ploop_submit_metadata_writeback_from_list(ploop, &ploop->wb_batch_list);
+	}
+}
+
 static void process_ploop_fsync_work(struct ploop *ploop, struct llist_node *llflush_pios)
 {
 	struct file *file;
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index 00d160b3fd3c..f714c2cb540f 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -422,6 +422,11 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	INIT_LIST_HEAD(&ploop->cluster_lk_list);
 	init_llist_head(&ploop->wb_batch_llist);
+	INIT_LIST_HEAD(&ploop->wb_batch_list);
+	INIT_LIST_HEAD(&ploop->wb_batch_list_prio);
+	ploop->last_md_submit = 0;
+	ploop->md_submit_delay_ms = PLOOP_DEFAULT_METADATA_SUBMIT_DELAY;
+
 	ploop->bat_entries = RB_ROOT;
 	timer_setup(&ploop->enospc_timer, ploop_enospc_timer, 0);
 
@@ -503,6 +508,14 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			EAT_ARG(argc, argv);
 			continue;
 		}
+#define PLOOP_MD_DELAY_ARG "metadata_delay="
+		if (strncmp(argv[0], PLOOP_MD_DELAY_ARG, sizeof(PLOOP_MD_DELAY_ARG) - 1) == 0) {
+			if (kstrtou64(argv[0] + sizeof(PLOOP_MD_DELAY_ARG) - 1, 10,
+			    &ploop->md_submit_delay_ms) < 0)
+				goto err;
+			EAT_ARG(argc, argv);
+			continue;
+		}
 		break;
 	}
 
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 6bf8b85a85c0..0ea73b4289e2 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -122,10 +122,12 @@ struct md_page {
 	u8 *bat_levels;
 	struct llist_head wait_llist;
 
-	struct llist_node wb_llink;
+	struct list_head wb_link;
 	struct ploop_index_wb *piwb;
 
 	spinlock_t md_lock;
+
+	bool high_prio;
 };
 
 enum {
@@ -174,6 +176,12 @@ struct ploop {
 	rwlock_t bat_rwlock;
 
 	struct llist_head wb_batch_llist;
+	struct list_head wb_batch_list;
+	struct list_head wb_batch_list_prio;
+
+	ktime_t last_md_submit;
+#define PLOOP_DEFAULT_METADATA_SUBMIT_DELAY 10000
+	uint64_t md_submit_delay_ms;
 
 	/*
 	 * Hash table to link non-exclusive submitted bios.
-- 
2.43.5



More information about the Devel mailing list