[Devel] [PATCH RHEL9 COMMIT] dm-ploop: prepare bat updates under bat_lock

Konstantin Khorenko khorenko at virtuozzo.com
Mon Jan 27 16:12:45 MSK 2025


The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.6
------>
commit 9b7d1740a213859683c595b3d22bcdfaaad6af5c
Author: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
Date:   Fri Jan 24 17:36:09 2025 +0200

    dm-ploop: prepare bat updates under bat_lock
    
    Prepare for threads. When preparing BAT updates there are two important
    things to protect: the MD_DIRTY bit in md->status and the holes bitmap.
    Use bat_lock to protect both.
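    
    As a minimal sketch (mirroring the hunks below, not verbatim code),
    the pattern callers now follow is:
    
        spin_lock_irqsave(&ploop->bat_lock, flags);
        if (!test_bit(MD_DIRTY, &md->status)) {
                if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0) {
                        spin_unlock_irqrestore(&ploop->bat_lock, flags);
                        goto error;
                }
                ploop_md_make_dirty(ploop, md);
        }
        spin_unlock_irqrestore(&ploop->bat_lock, flags);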
    
    https://virtuozzo.atlassian.net/browse/VSTOR-91821
    Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
    
    ======
    Patchset description:
    ploop: optimisations and scaling
    
    Ploop processes requests in different threads in parallel
    where possible, which results in a significant performance
    improvement and makes further optimisations possible.
    
    Known bugs:
      - delayed metadata writeback is not working and is missing error handling
         - patch to disable it until fixed
      - fast path is not working - causes rcu lockups - patch to disable it
    
    Further improvements:
      - optimize md pages lookups
    
    Alexander Atanasov (50):
      dm-ploop: md_pages map all pages at creation time
      dm-ploop: Use READ_ONCE/WRITE_ONCE to access md page data
      dm-ploop: fsync after all pios are sent
      dm-ploop: move md status to use proper bitops
      dm-ploop: convert wait_list and wb_batch_llist to use lockless lists
      dm-ploop: convert enospc handling to use lockless lists
      dm-ploop: convert suspended_pios list to use lockless list
      dm-ploop: convert the rest of the lists to use llist variant
      dm-ploop: combine processing of pios thru prepare list and remove
        fsync worker
      dm-ploop: move from wq to kthread
      dm-ploop: move preparations of pios into the caller from worker
      dm-ploop: fast path execution for reads
      dm-ploop: do not use a wrapper for set_bit to make a page writeback
      dm-ploop: BAT use only one list for writeback
      dm-ploop: make md writeback timeout to be per page
      dm-ploop: add interface to disable bat writeback delay
      dm-ploop: convert wb_batch_list to lockless variant
      dm-ploop: convert high_prio to status
      dm-ploop: split cow processing into two functions
      dm-ploop: convert md page rw lock to spin lock
      dm-ploop: convert bat_rwlock to bat_lock spinlock
      dm-ploop: prepare bat updates under bat_lock
      dm-ploop: make ploop_bat_write_complete ready for parallel pio
        completion
      dm-ploop: make ploop_submit_metadata_writeback return number of
        requests sent
      dm-ploop: introduce pio runner threads
      dm-ploop: add pio list ids to be used when passing pios to runners
      dm-ploop: process pios via runners
      dm-ploop: disable metadata writeback delay
      dm-ploop: disable fast path
      dm-ploop: use lockless lists for chained cow updates list
      dm-ploop: use lockless lists for data ready pios
      dm-ploop: give runner threads better name
      dm-ploop: resize operation - add holes bitmap locking
      dm-ploop: remove unnecessary operations
      dm-ploop: use filp per thread
      dm-ploop: catch if we try to advance pio past bio end
      dm-ploop: support REQ_FUA for data pios
      dm-ploop: properly access nr_bat_entries
      dm-ploop: fix locking and improve error handling when submitting pios
      dm-ploop: fix how ENOTBLK is handled
      dm-ploop: sync when suspended or stopping
      dm-ploop: rework bat completion logic
      dm-ploop: rework logic in pio processing
      dm-ploop: end fsync pios in parallel
      dm-ploop: make filespace preallocations async
      dm-ploop: resubmit enospc pios from dispatcher thread
      dm-ploop: simplify discard completion
      dm-ploop: use GFP_ATOMIC instead of GFP_NOIO
      dm-ploop: fix locks used in mixed context
      dm-ploop: fix how current flags are managed inside threads
    
    Andrey Zhadchenko (13):
      dm-ploop: do not flush after metadata writes
      dm-ploop: set IOCB_DSYNC on all FUA requests
      dm-ploop: remove extra ploop_cluster_is_in_top_delta()
      dm-ploop: introduce per-md page locking
      dm-ploop: reduce BAT accesses on discard completion
      dm-ploop: simplify llseek
      dm-ploop: speed up ploop_prepare_bat_update()
      dm-ploop: make new allocations immediately visible in BAT
      dm-ploop: drop ploop_cluster_is_in_top_delta()
      dm-ploop: do not wait for BAT update for non-FUA requests
      dm-ploop: add delay for metadata writeback
      dm-ploop: submit all postponed metadata on REQ_OP_FLUSH
      dm-ploop: handle REQ_PREFLUSH
    
    Feature: dm-ploop: ploop target driver
---
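The resulting lock nesting, as a minimal sketch: bat_lock is the outer
lock and already disables interrupts, which is why the inner md_lock
drops its _irq/_irqsave variants in the hunks below.

	spin_lock_irqsave(&ploop->bat_lock, flags);	/* outer: holes bitmap, MD_DIRTY */
	spin_lock(&md->md_lock);			/* inner: md page contents, piwb */
	...
	spin_unlock(&md->md_lock);
	spin_unlock_irqrestore(&ploop->bat_lock, flags);
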
 drivers/md/dm-ploop-cmd.c |   2 +
 drivers/md/dm-ploop-map.c | 100 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 72 insertions(+), 30 deletions(-)

diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index dc0be65261f1..84e965855698 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -315,7 +315,9 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
 		goto out;
 	}
 
+	spin_lock_irq(&ploop->bat_lock);
 	ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst);
+	spin_unlock_irq(&ploop->bat_lock);
 	if (ret < 0) {
 		PL_ERR("reloc: can't prepare it: %d", ret);
 		goto out;
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 8b32f559fdb9..129447510033 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -554,16 +554,20 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio)
 
 static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
 {
-	unsigned long flags;
 	bool new = false;
 
-	spin_lock_irqsave(&ploop->bat_lock, flags);
+	/*
+	 * The usual pattern is: check for dirty, prepare the bat update,
+	 * then call make_dirty. To avoid races, callers must hold bat_lock.
+	 */
+
+	lockdep_assert_held(&ploop->bat_lock);
+
 	WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
 	if (!test_and_set_bit(MD_DIRTY, &md->status)) {
 		llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
 		new = true;
 	}
-	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 	md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000;
 
 	return new;
@@ -809,6 +813,9 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
 
 	dst_clu = kmap_local_page(piwb->bat_page);
 
+	/* The holes bitmap requires bat_lock */
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	spin_lock(&md->md_lock);
 	if (piwb->type == PIWB_TYPE_ALLOC)
 		goto skip_apply;
 
@@ -832,10 +839,10 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
 
 	WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status));
 	/* protect piwb */
-	spin_lock_irqsave(&md->md_lock, flags); /* write */
 	clear_bit(MD_WRITEBACK, &md->status);
 	md->piwb = NULL;
-	spin_unlock_irqrestore(&md->md_lock, flags);
+	spin_unlock(&md->md_lock);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 	kunmap_local(dst_clu);
 
 	wait_llist_pending = llist_del_all(&md->wait_llist);
@@ -946,6 +953,8 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
 	map_index_t *to;
 	u8 level;
 
+	lockdep_assert_held(&ploop->bat_lock);
+
 	piwb = kmalloc(sizeof(*piwb), GFP_NOIO);
 	if (!piwb)
 		return -ENOMEM;
@@ -959,7 +968,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
 
 	bat_entries = md->kmpage;
 
-	spin_lock_irq(&md->md_lock); /* write */
+	spin_lock(&md->md_lock); /* write */
 	WARN_ON_ONCE(md->piwb);
 	md->piwb = piwb;
 	piwb->md = md;
@@ -991,7 +1000,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
 		if (clu == BAT_ENTRY_NONE || level < ploop_top_level(ploop))
 			to[i] = 0;
 	}
-	spin_unlock_irq(&md->md_lock);
+	spin_unlock(&md->md_lock);
 
 	if (is_last_page) {
 	/* Fill tail of page with 0 */
@@ -1081,10 +1090,22 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu)
 	u32 clu_size = CLU_SIZE(ploop);
 	loff_t off, pos, end, old_size;
 	struct file *file = top->file;
+	unsigned long flags;
 	int ret;
 
-	if (ploop_find_dst_clu_bit(ploop, dst_clu) < 0)
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	if (ploop_find_dst_clu_bit(ploop, dst_clu) < 0) {
+		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		return -EIO;
+	}
+	/*
+	 * Mark clu as used. Find & clear bit must be done under the lock,
+	 * since this may be called from different threads.
+	 * Note that set_bit may happen from many places.
+	 * We only care to clear what find got; a parallel set is ok.
+	 */
+	ploop_hole_clear_bit(*dst_clu, ploop);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	pos = CLU_TO_POS(ploop, *dst_clu);
 	end = pos + clu_size;
@@ -1098,6 +1119,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu)
 		else
 			ret = ploop_zero_range(file, pos, off - pos);
 		if (ret) {
+			ploop_hole_set_bit(*dst_clu, ploop);
+
 			if (ret != -EOPNOTSUPP)
 				PL_ERR("punch/zero area: %d", ret);
 			else if (ploop->falloc_new_clu)
@@ -1112,18 +1135,15 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu)
 
 	if (end > old_size) {
 		ret = ploop_truncate_prealloc_safe(ploop, top, end, __func__);
-		if (ret)
+		if (ret) {
+			ploop_hole_set_bit(*dst_clu, ploop);
 			return ret;
+		}
 	}
 
 	if (end > top->file_preallocated_area_start)
 		top->file_preallocated_area_start = end;
-	/*
-	 * Mark clu as used. Find & clear bit is unlocked,
-	 * since currently this may be called only from deferred
-	 * kwork. Note, that set_bit may be made from many places.
-	 */
-	ploop_hole_clear_bit(*dst_clu, ploop);
+
 	return 0;
 }
 ALLOW_ERROR_INJECTION(ploop_allocate_cluster, ERRNO);
@@ -1433,15 +1453,21 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
 	page_id = ploop_bat_clu_to_page_nr(clu);
 	md = ploop_md_page_find(ploop, page_id);
 
-	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio))
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	/* MD_DIRTY is set and cleared concurrently, do it under bat_lock */
+	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio)) {
+		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		goto out;
+	}
 
 	if (!test_bit(MD_DIRTY, &md->status)) {
-		/* MD_DIRTY is set and cleared from this work */
-		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0)
+		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0) {
+			spin_unlock_irqrestore(&ploop->bat_lock, flags);
 			goto err_resource;
+		}
 		ploop_md_make_dirty(ploop, md);
 	}
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	piwb = md->piwb;
 
@@ -1532,32 +1558,38 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
 	bool attached = false;
 	u32 page_id;
 	int err;
+	unsigned long flags;
 
 	WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED);
-	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio))
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio)) {
+		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		goto out;
+	}
+
 
 	if (!test_bit(MD_DIRTY, &md->status)) {
-		 /* Unlocked since MD_DIRTY is set and cleared from this work */
+		 /* Locked since MD_DIRTY is set and cleared concurrently */
 		page_id = ploop_bat_clu_to_page_nr(clu);
 		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0) {
 			pio->bi_status = BLK_STS_RESOURCE;
+			spin_unlock_irqrestore(&ploop->bat_lock, flags);
 			goto error;
 		}
 		bat_update_prepared = true;
+		ploop_md_make_dirty(ploop, md);
 	}
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	piwb = md->piwb;
 
 	err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu);
 	if (err) {
 		pio->bi_status = errno_to_blk_status(err);
+		clear_bit(MD_DIRTY, &md->status);
 		goto error;
 	}
 
-	if (bat_update_prepared)
-		ploop_md_make_dirty(ploop, md);
-
 	if (pio->bi_op & REQ_FUA) {
 		piwb->pio->bi_op |= REQ_FUA;
 		ploop_attach_end_action(pio, piwb);
@@ -1771,23 +1803,28 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 	struct ploop_index_wb *piwb;
 	struct md_page *md;
 	map_index_t *to;
+	unsigned long flags;
 
 	WARN_ON(ploop->nr_deltas != 1 ||
 		pio->queue_list_id != PLOOP_LIST_DISCARD);
 
 	page_id = ploop_bat_clu_to_page_nr(clu);
 	md = ploop_md_page_find(ploop, page_id);
-	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio))
-		goto out;
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio)) {
+		spin_unlock_irqrestore(&ploop->bat_lock, flags);
+		return;
+	}
 
 	if (!test_bit(MD_DIRTY, &md->status)) {
-		 /* Unlocked since MD_DIRTY is set and cleared from this work */
 		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) {
 			pio->bi_status = BLK_STS_RESOURCE;
-			goto err;
+			goto err_unlck;
 		}
 		bat_update_prepared = true;
+		ploop_md_make_dirty(ploop, md);
 	}
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	piwb = md->piwb;
 
@@ -1797,18 +1834,21 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 	to = kmap_local_page(piwb->bat_page);
 	if (WARN_ON_ONCE(!to[clu])) {
 		pio->bi_status = BLK_STS_IOERR;
+		clear_bit(MD_DIRTY, &md->status);
 		goto err;
 	} else {
 		WRITE_ONCE(to[clu], 0);
+		spin_lock_irqsave(&piwb->lock, flags);
 		list_add_tail(&pio->list, &piwb->ready_data_pios);
+		spin_unlock_irqrestore(&piwb->lock, flags);
 	}
 	kunmap_local(to);
 
-	if (bat_update_prepared)
-		ploop_md_make_dirty(ploop, md);
 	ploop_md_up_prio(ploop, md);
-out:
+
 	return;
+err_unlck:
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 err:
 	if (bat_update_prepared)
 		ploop_break_bat_update(ploop, md);
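
The allocation path above now reserves a cluster from the holes bitmap
under bat_lock and gives it back on failure. Schematically (a sketch of
the flow in ploop_allocate_cluster, not verbatim code):

	spin_lock_irqsave(&ploop->bat_lock, flags);
	if (ploop_find_dst_clu_bit(ploop, dst_clu) < 0) {
		spin_unlock_irqrestore(&ploop->bat_lock, flags);
		return -EIO;				/* no free cluster */
	}
	ploop_hole_clear_bit(*dst_clu, ploop);		/* reserve under the lock */
	spin_unlock_irqrestore(&ploop->bat_lock, flags);

	/* punch/zero/truncate the file range, lock not held */

	if (ret) {
		ploop_hole_set_bit(*dst_clu, ploop);	/* error: undo the reservation */
		return ret;
	}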

