[Devel] [PATCH vz9 v2] dm-ploop: fix and rework md updates

Alexander Atanasov alexander.atanasov at virtuozzo.com
Wed Feb 12 16:30:11 MSK 2025


With the locking reduced, windows opened for races between running,
pending and new updates. The current logic to deal with them is not
correct.

The current flags are:
MD_DIRTY - the md page is dirty and is waiting for writeback
MD_WRITEBACK - writeback is in progress
But we drop the lock after we make the page dirty, so it races with
writeback.

To fix this, introduce a new flag:
MD_UPDATING - we are updating the pending writeback or creating a new one.
Remove the check whether a piwb is present on the page, and clear the
piwb early on submit. The check compared the piwb type (alloc vs discard),
but with a concurrent alloc and discard the piwb is overwritten by the
second operation. Using a flag solves this and makes it clearer what
state the page is in.
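
Roughly, the updater side then becomes (condensed from the
ploop_locate_new_cluster_and_attach_pio hunk below; error handling and
unrelated details omitted):

  spin_lock_irqsave(&ploop->bat_lock, flags);
  if (ploop_delay_if_md_busy(ploop, md, pio)) {
          /* MD_WRITEBACK or MD_UPDATING set: park the pio on md->wait_llist */
          spin_unlock_irqrestore(&ploop->bat_lock, flags);
          return;
  }
  set_bit(MD_UPDATING, &md->status);      /* writeback must not grab the page now */
  if (!test_bit(MD_DIRTY, &md->status)) {
          ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC);
          add_to_wblist = ploop_md_make_dirty(ploop, md);
  }
  piwb = md->piwb;
  spin_unlock_irqrestore(&ploop->bat_lock, flags);

  /* ... fill in the index update in piwb->bat_page ... */

  if (add_to_wblist)
          ploop_add_dirty_for_wb(ploop, md);      /* queue the page for writeback once */
  clear_bit(MD_UPDATING, &md->status);    /* writeback may pick it up now */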

A nice side effect of closing that race is a minimal delay for writeback
updates: when we try to submit a pending writeback and see the MD_UPDATING
flag, the update is not submitted but is retried on the next cycle. So if
there are many updates to the same page within a very short period, they
are accumulated into a single writeback.
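
On the writeback side the submission path then looks roughly like this
(condensed from the ploop_submit_metadata_writeback hunk below):

  spin_lock_irqsave(&ploop->bat_lock, flags);
  if (test_bit(MD_UPDATING, &md->status) ||
      test_bit(MD_WRITEBACK, &md->status)) {
          /* still being updated (or written back) - retry on the next cycle */
          spin_unlock_irqrestore(&ploop->bat_lock, flags);
          llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
          continue;
  }
  set_bit(MD_WRITEBACK, &md->status);
  clear_bit(MD_DIRTY, &md->status);
  piwb = md->piwb;
  md->piwb = NULL;        /* detach early: a new update allocates a fresh piwb */
  ploop_index_wb_submit(ploop, piwb);
  spin_unlock_irqrestore(&ploop->bat_lock, flags);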

On error, only clear MD_DIRTY if it was set by us before dropping the
lock, and when breaking the prepared bat update, only clear md->piwb if
it is the same piwb that was allocated in the function.
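
That is, the error path only undoes what the current call path itself set
up (condensed):

  if (bat_update_prepared)
          /* clears md->piwb only if it is still the piwb we allocated */
          ploop_break_bat_update(ploop, md, piwb);

  spin_lock_irqsave(&ploop->bat_lock, flags);
  if (add_to_wblist)
          /* MD_DIRTY was set by us above */
          clear_bit(MD_DIRTY, &md->status);
  clear_bit(MD_UPDATING, &md->status);
  spin_unlock_irqrestore(&ploop->bat_lock, flags);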

Add some clarifying error messages around the WARN_ONs.

While at it, fix:
- mapping bat_page twice in the same function; map it only once
- a missed unmapping on an error path
- updating the proper bat levels in the delayed update case
- missing locking around prepare_bat_update when called from
  ploop_prepare_reloc_index_wb

https://virtuozzo.atlassian.net/browse/VSTOR-98807
Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
---
 drivers/md/dm-ploop-cmd.c |   5 +-
 drivers/md/dm-ploop-map.c | 170 +++++++++++++++++++++++++++-----------
 drivers/md/dm-ploop.h     |   8 +-
 3 files changed, 128 insertions(+), 55 deletions(-)

v1->v2:
 - remove an obsolete comment
 - add a comment on the single md page user
 - add a missed unlock in an error path
 - use ploop_add_dirty_for_wb
 - fix unmapping the wrong mapping

diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index e032af52b64e..d2eb4797ab6e 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -315,10 +315,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
 		goto out;
 	}
 
-	spin_lock_irq(&ploop->bat_lock);
 	ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst,
 					   ploop_top_delta(ploop)->file);
-	spin_unlock_irq(&ploop->bat_lock);
 	if (ret < 0) {
 		PL_ERR("reloc: can't prepare it: %d", ret);
 		goto out;
@@ -329,7 +327,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
 	ret = ploop_write_cluster_sync(ploop, pio, new_dst);
 	if (ret) {
 		PL_ERR("reloc: failed write: %d", ret);
-		ploop_break_bat_update(ploop, md);
+	/* piwb cannot change here: we are the only user of the md page */
+		ploop_break_bat_update(ploop, md, piwb);
 		goto out;
 	}
 
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 7efbaedd69e0..9fc72d5ad376 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -404,18 +404,15 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio *pio,
 }
 
 static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md,
-				   enum piwb_type type, struct pio *pio)
+				   struct pio *pio)
 {
-	struct ploop_index_wb *piwb;
 	unsigned long flags;
 	bool busy = false;
 
 	WARN_ON_ONCE(!list_empty(&pio->list));
 
-	/* lock protects piwb */
 	spin_lock_irqsave(&md->md_lock, flags); /* read */
-	piwb = md->piwb;
-	if (piwb && (piwb->type != type || test_bit(MD_WRITEBACK, &md->status))) {
+	if (test_bit(MD_WRITEBACK, &md->status) || test_bit(MD_UPDATING, &md->status)) {
 		llist_add((struct llist_node *)(&pio->list), &md->wait_llist);
 		busy = true;
 	}
@@ -601,6 +598,11 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio)
 		ploop_dispatch_pios(ploop, NULL, &pio_list);
 }
 
+static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
+{
+	llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+}
+
 static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
 {
 	bool new = false;
@@ -613,10 +615,8 @@ static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
 	lockdep_assert_held(&ploop->bat_lock);
 
 	WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
-	if (!test_and_set_bit(MD_DIRTY, &md->status)) {
-		llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+	if (!test_and_set_bit(MD_DIRTY, &md->status))
 		new = true;
-	}
 	md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000;
 
 	return new;
@@ -874,9 +874,6 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
 #endif
 
 	WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status));
-	/* protect piwb */
-	clear_bit(MD_WRITEBACK, &md->status);
-	md->piwb = NULL;
 	spin_unlock(&md->md_lock);
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 	kunmap_local(dst_clu);
@@ -892,6 +889,11 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
 
 	if (!list_empty(&list))
 		ploop_dispatch_pios(ploop, NULL, &list);
+
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	/* Clear the flag at the end, when all related processing is done */
+	clear_bit(MD_WRITEBACK, &md->status);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 }
 
 static void ploop_free_piwb(struct ploop_index_wb *piwb)
@@ -1030,7 +1032,10 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
 	bat_entries = md->kmpage;
 
 	spin_lock(&md->md_lock); /* write */
-	WARN_ON_ONCE(md->piwb);
+	if (WARN_ON(md->piwb)) {
+		PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n",
+			md, md->piwb, md->piwb->type, type);
+	}
 	md->piwb = piwb;
 	piwb->md = md;
 
@@ -1077,15 +1082,18 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
 	return -ENOMEM;
 }
 
-void ploop_break_bat_update(struct ploop *ploop, struct md_page *md)
+void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+			    struct ploop_index_wb *piwb)
 {
-	struct ploop_index_wb *piwb;
 	unsigned long flags;
 
-	spin_lock_irqsave(&md->md_lock, flags); /* write */
-	piwb = md->piwb;
-	md->piwb = NULL;
-	spin_unlock_irqrestore(&md->md_lock, flags);
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	spin_lock(&md->md_lock); /* write */
+	/* Only break if piwb is the same one that failed to prepare */
+	if (md->piwb == piwb)
+		md->piwb = NULL;
+	spin_unlock(&md->md_lock);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	ploop_free_piwb(piwb);
 }
@@ -1335,7 +1343,10 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
 	map_index_t *to;
 	int ret = 0;
 	u32 tmp_clu;
+#ifdef PLOOP_DELAYWB
 	unsigned long flags;
+	map_index_t *tomd;
+#endif
 
 	/* Cluster index related to the page[page_id] start */
 	clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -1347,7 +1358,6 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
 		*dst_clu = tmp_clu;
 		already_alloced = true;
 	}
-	kunmap_local(to);
 
 	if (already_alloced)
 		goto out;
@@ -1356,17 +1366,16 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
 	if (unlikely(ret < 0))
 		goto out;
 
-	to = kmap_local_page(piwb->bat_page);
-	spin_lock_irqsave(&piwb->md->md_lock, flags);
 	WRITE_ONCE(to[clu], *dst_clu);
+#ifdef PLOOP_DELAYWB
+	spin_lock_irqsave(&piwb->md->md_lock, flags);
+	tomd = piwb->md->kmpage;
+	WRITE_ONCE(tomd[clu], *dst_clu);
 	WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop));
 	spin_unlock_irqrestore(&piwb->md->md_lock, flags);
-	kunmap_local(to);
-#ifdef PLOOP_DELAYWB
-	to = piwb->md->kmpage;
-	WRITE_ONCE(to[clu], *dst_clu);
 #endif
 out:
+	kunmap_local(to);
 	return ret;
 }
 
@@ -1635,6 +1644,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
 	struct md_page *md;
 	unsigned long flags;
 	map_index_t *to;
+	bool add_to_wblist = false;
 
 	WARN_ON_ONCE(cow->aux_pio->queue_list_id != PLOOP_LIST_COW);
 	page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1642,7 +1652,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
 
 	spin_lock_irqsave(&ploop->bat_lock, flags);
 	/* MD_DIRTY is set and cleared concurrently, do it under bat_lock */
-	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio)) {
+	if (ploop_delay_if_md_busy(ploop, md, cow->aux_pio)) {
 		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		goto out;
 	}
@@ -1652,11 +1662,14 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
 			spin_unlock_irqrestore(&ploop->bat_lock, flags);
 			goto err_resource;
 		}
-		ploop_md_make_dirty(ploop, md);
+		add_to_wblist = ploop_md_make_dirty(ploop, md);
 	}
-	spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
 	piwb = md->piwb;
+	if (WARN_ON(!piwb))
+		PL_ERR("cow no piwb:%d\n", add_to_wblist);
+
+	set_bit(MD_UPDATING, &md->status);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	clu -= page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
 
@@ -1678,6 +1691,10 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
 	cow->dst_clu = BAT_ENTRY_NONE;
 
 	llist_add((struct llist_node *)(&cow->aux_pio->list), &piwb->cow_llist);
+
+	if (add_to_wblist)
+		ploop_add_dirty_for_wb(ploop, md);
+	clear_bit(MD_UPDATING, &md->status);
 out:
 	return;
 err_resource:
@@ -1740,15 +1757,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
 	int err;
 	unsigned long flags;
 	struct file *file;
+	bool add_to_wblist = false;
 
 	WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED);
 	spin_lock_irqsave(&ploop->bat_lock, flags);
-	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio)) {
+	if (ploop_delay_if_md_busy(ploop, md, pio)) {
 		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		goto out;
 	}
 
 
+	/*
+	 * Flag the page as updating, so that an already pending update is
+	 * not submitted for writeback until the new update is ready
+	 */
+	set_bit(MD_UPDATING, &md->status);
+
 	if (!test_bit(MD_DIRTY, &md->status)) {
 		 /* Locked since MD_DIRTY is set and cleared concurrently  */
 		page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1757,19 +1781,18 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
 			spin_unlock_irqrestore(&ploop->bat_lock, flags);
 			goto error;
 		}
+		add_to_wblist = ploop_md_make_dirty(ploop, md);
 		bat_update_prepared = true;
-		ploop_md_make_dirty(ploop, md);
 	}
-	spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
 	piwb = md->piwb;
+	WARN_ON_ONCE(!piwb);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	file = ploop_top_delta(ploop)->mtfile[pio->runner_id];
 
 	err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu, file);
 	if (err) {
 		pio->bi_status = errno_to_blk_status(err);
-		clear_bit(MD_DIRTY, &md->status);
 		goto error;
 	}
 
@@ -1787,13 +1810,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
 	ploop_attach_end_action(pio, piwb);
 #endif
 	attached = true;
+	if (add_to_wblist)
+		ploop_add_dirty_for_wb(ploop, md);
+	clear_bit(MD_UPDATING, &md->status);
 out:
 	return attached;
 error:
 	/* Uninit piwb */
 	if (bat_update_prepared)
-		ploop_break_bat_update(ploop, md);
+		ploop_break_bat_update(ploop, md, piwb);
 	ploop_pio_endio(pio);
+
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	if (add_to_wblist)
+		clear_bit(MD_DIRTY, &md->status);
+	clear_bit(MD_UPDATING, &md->status);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 	return false;
 }
 
@@ -1992,6 +2024,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 	struct md_page *md;
 	map_index_t *to;
 	unsigned long flags;
+	bool add_to_wblist = false;
 
 	WARN_ON(ploop->nr_deltas != 1 ||
 		pio->queue_list_id != PLOOP_LIST_DISCARD);
@@ -1999,22 +2032,23 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 	page_id = ploop_bat_clu_to_page_nr(clu);
 	md = ploop_md_page_find(ploop, page_id);
 	spin_lock_irqsave(&ploop->bat_lock, flags);
-	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio)) {
+	if (ploop_delay_if_md_busy(ploop, md, pio)) {
 		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		return;
 	}
 
+	set_bit(MD_UPDATING, &md->status);
 	if (!test_bit(MD_DIRTY, &md->status)) {
 		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) {
 			pio->bi_status = BLK_STS_RESOURCE;
 			goto err_unlck;
 		}
+		add_to_wblist = ploop_md_make_dirty(ploop, md);
 		bat_update_prepared = true;
-		ploop_md_make_dirty(ploop, md);
 	}
-	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	piwb = md->piwb;
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	/* Cluster index related to the page[page_id] start */
 	clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -2022,7 +2056,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 	to = kmap_local_page(piwb->bat_page);
 	if (WARN_ON_ONCE(!to[clu])) {
 		pio->bi_status = BLK_STS_IOERR;
-		clear_bit(MD_DIRTY, &md->status);
+		kunmap_local(to);
 		goto err;
 	} else {
 		WRITE_ONCE(to[clu], 0);
@@ -2031,14 +2065,22 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
 	kunmap_local(to);
 
 	ploop_md_up_prio(ploop, md);
+	if (add_to_wblist)
+		ploop_add_dirty_for_wb(ploop, md);
+	clear_bit(MD_UPDATING, &md->status);
 
 	return;
 err_unlck:
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 err:
 	if (bat_update_prepared)
-		ploop_break_bat_update(ploop, md);
+		ploop_break_bat_update(ploop, md, piwb);
 	ploop_pio_endio(pio);
+	spin_lock_irqsave(&ploop->bat_lock, flags);
+	if (add_to_wblist)
+		clear_bit(MD_DIRTY, &md->status);
+	clear_bit(MD_UPDATING, &md->status);
+	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 }
 
 static inline int ploop_runners_have_pending(struct ploop *ploop)
@@ -2067,22 +2109,49 @@ static inline int ploop_submit_metadata_writeback(struct ploop *ploop, int force
 	 */
 	llist_for_each_safe(pos, t, ll_wb_batch) {
 		md = list_entry((struct list_head *)pos, typeof(*md), wb_link);
+		INIT_LIST_HEAD(&md->wb_link);
 		/* XXX: fixme delay results in a hang - TBD */
 		if (1 || !llist_empty(&md->wait_llist) || force ||
 		    test_bit(MD_HIGHPRIO, &md->status) ||
 		    time_before(md->dirty_timeout, timeout) ||
 		    ploop->force_md_writeback) {
+			spin_lock_irqsave(&ploop->bat_lock, flags);
+			if (test_bit(MD_UPDATING, &md->status) ||
+			    test_bit(MD_WRITEBACK, &md->status)) {
+				/* if updating or there is a writeback then delay */
+				spin_unlock_irqrestore(&ploop->bat_lock, flags);
+				llist_add((struct llist_node *)&md->wb_link,
+					  &ploop->wb_batch_llist);
+				continue;
+			}
+
 			/* L1L2 mustn't be redirtyed, when wb in-flight! */
-			WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status));
-			WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
-			set_bit(MD_WRITEBACK, &md->status);
-			clear_bit(MD_DIRTY, &md->status);
+			WARN_ONCE(!test_bit(MD_DIRTY, &md->status),
+			      PL_FMT("wb but dirty not set"),
+			      ploop_device_name(ploop));
+
+			WARN_ONCE(test_bit(MD_WRITEBACK, &md->status),
+			      PL_FMT("wb but writeback already set"),
+			      ploop_device_name(ploop));
+
 			clear_bit(MD_HIGHPRIO, &md->status);
-			ploop_index_wb_submit(ploop, md->piwb);
-			ret++;
+			if (md->piwb) {
+				struct ploop_index_wb *piwb;
+				/* New updates will go in wait list */
+				set_bit(MD_WRITEBACK, &md->status);
+				clear_bit(MD_DIRTY, &md->status);
+				/* at this point we can clear md->piwb */
+				piwb = md->piwb;
+				md->piwb = NULL;
+				/* hand off to threads */
+				ploop_index_wb_submit(ploop, piwb);
+				ret++;
+			} else {
+				PL_ERR("Unexpected md piwb is null");
+			}
+			spin_unlock_irqrestore(&ploop->bat_lock, flags);
 		} else {
-			INIT_LIST_HEAD(&md->wb_link);
-			llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+			ploop_add_dirty_for_wb(ploop, md);
 		}
 	}
 	return ret;
@@ -2607,6 +2676,7 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
 		type = PIWB_TYPE_RELOC;
 
 	err = -EIO;
+	spin_lock_irq(&ploop->bat_lock);
 	if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) {
 		PL_ERR("Unexpected md status: %lx", md->status);
 		goto out_error;
@@ -2629,13 +2699,15 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
 		if (err)
 			goto out_reset;
 	}
+	spin_unlock_irq(&ploop->bat_lock);
 
 	*ret_md = md;
 	return 0;
 
 out_reset:
-	ploop_break_bat_update(ploop, md);
+	ploop_break_bat_update(ploop, md, piwb);
 out_error:
+	spin_unlock_irq(&ploop->bat_lock);
 	return err;
 }
 ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO);
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 43e65e841e4a..90cf08472660 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -116,9 +116,10 @@ struct ploop_index_wb {
 struct md_page {
 	struct rb_node node;
 	u32 id; /* Number of this page starting from hdr */
-#define MD_DIRTY	(1U << 1) /* Page contains changes and wants writeback */
-#define MD_WRITEBACK	(1U << 2) /* Writeback was submitted */
+#define MD_DIRTY	(1U << 1) /* Page contains changes and awaits writeback */
+#define MD_WRITEBACK	(1U << 2) /* Writeback in progress */
 #define MD_HIGHPRIO	(1U << 3) /* High priority writeback  */
+#define MD_UPDATING	(1U << 4) /* Preparing an update */
 	unsigned long status;
 	struct page *page;
 	void *kmpage;
@@ -611,7 +612,8 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu,
 extern int ploop_prepare_reloc_index_wb(struct ploop *ploop,
 					struct md_page **ret_md, u32 clu, u32 *dst_clu,
 					struct file *file);
-extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md);
+extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+				   struct ploop_index_wb *piwb);
 extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *);
 extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv,
 			 char *result, unsigned int maxlen);
-- 
2.43.0


