[Devel] [PATCH vz9] dm-ploop: fix and rework md updates
Alexander Atanasov
alexander.atanasov at virtuozzo.com
Tue Feb 11 17:25:57 MSK 2025
With the locking reduced it opened windows for races between
running updates, pending and new updates. Current logic to
deal with them is not correct.
Current flags are:
MD_DIRTY - means md page is dirty and is waiting for writeback
MD_WRITEBACK - write back is in progress
But we drop the lock after we make page dirty so it races with
writeback.
To fix this introduce a new flag:
MD_UPDATING - which means we are updating the pending writeback
or we are creating a new one. Remove the check if piwb is present
on the page and clear piwb early on submit, check was using a type
alloc vs discard but in case of concurrent alloc and discard the piwb
is overwritten by the second operation. Using a flag solves this
and makes it more clear in what state page is.
Nice side effect of closing that race is that we have a minimalistic
delay for writeback updates. This happens when we try to submit a
pending writeback and we see the MD_UPDATING flags, update is not
submitted but it is retried on next cycle. So if we have a lot of
update to the same page in a very short periot they are accumulated
as one.
On error only clear MD_DIRTY if it is set by us before dropping
the lock and when breaking the prepared bat update only clear
md->piwb if it is the same as the piwb allocated in the function.
Add some clarification error messages around WARN_ONs.
While at it fix:
- mapping bat_bage two times in same function, do it only once
- missed unmapping on error path
- update proper bat levels in delayed update case
- locking around prepare_bat_update from ploop_prepare_reloc_index_wb
https://virtuozzo.atlassian.net/browse/VSTOR-98807
Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
---
drivers/md/dm-ploop-cmd.c | 4 +-
drivers/md/dm-ploop-map.c | 153 +++++++++++++++++++++++++++-----------
drivers/md/dm-ploop.h | 8 +-
3 files changed, 117 insertions(+), 48 deletions(-)
diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index e032af52b64e..daec9827b42c 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -315,10 +315,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
goto out;
}
- spin_lock_irq(&ploop->bat_lock);
ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst,
ploop_top_delta(ploop)->file);
- spin_unlock_irq(&ploop->bat_lock);
if (ret < 0) {
PL_ERR("reloc: can't prepare it: %d", ret);
goto out;
@@ -329,7 +327,7 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
ret = ploop_write_cluster_sync(ploop, pio, new_dst);
if (ret) {
PL_ERR("reloc: failed write: %d", ret);
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
goto out;
}
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 7efbaedd69e0..e72f6baf4aa7 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -406,7 +406,6 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio *pio,
static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md,
enum piwb_type type, struct pio *pio)
{
- struct ploop_index_wb *piwb;
unsigned long flags;
bool busy = false;
@@ -414,8 +413,7 @@ static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md,
/* lock protects piwb */
spin_lock_irqsave(&md->md_lock, flags); /* read */
- piwb = md->piwb;
- if (piwb && (piwb->type != type || test_bit(MD_WRITEBACK, &md->status))) {
+ if (test_bit(MD_WRITEBACK, &md->status) || test_bit(MD_UPDATING, &md->status)) {
llist_add((struct llist_node *)(&pio->list), &md->wait_llist);
busy = true;
}
@@ -601,6 +599,11 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio)
ploop_dispatch_pios(ploop, NULL, &pio_list);
}
+static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
+{
+ llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+}
+
static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
{
bool new = false;
@@ -613,10 +616,8 @@ static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
lockdep_assert_held(&ploop->bat_lock);
WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
- if (!test_and_set_bit(MD_DIRTY, &md->status)) {
- llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+ if (!test_and_set_bit(MD_DIRTY, &md->status))
new = true;
- }
md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000;
return new;
@@ -874,9 +875,6 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
#endif
WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status));
- /* protect piwb */
- clear_bit(MD_WRITEBACK, &md->status);
- md->piwb = NULL;
spin_unlock(&md->md_lock);
spin_unlock_irqrestore(&ploop->bat_lock, flags);
kunmap_local(dst_clu);
@@ -892,6 +890,11 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
if (!list_empty(&list))
ploop_dispatch_pios(ploop, NULL, &list);
+
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ /* Clear flag at the end when all processing related is done */
+ clear_bit(MD_WRITEBACK, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
}
static void ploop_free_piwb(struct ploop_index_wb *piwb)
@@ -1030,7 +1033,10 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
bat_entries = md->kmpage;
spin_lock(&md->md_lock); /* write */
- WARN_ON_ONCE(md->piwb);
+ if (WARN_ON(md->piwb)) {
+ PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n",
+ md, md->piwb, md->piwb->type, type);
+ }
md->piwb = piwb;
piwb->md = md;
@@ -1077,15 +1083,18 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
return -ENOMEM;
}
-void ploop_break_bat_update(struct ploop *ploop, struct md_page *md)
+void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+ struct ploop_index_wb *piwb)
{
- struct ploop_index_wb *piwb;
unsigned long flags;
- spin_lock_irqsave(&md->md_lock, flags); /* write */
- piwb = md->piwb;
- md->piwb = NULL;
- spin_unlock_irqrestore(&md->md_lock, flags);
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ spin_lock(&md->md_lock); /* write */
+ /* Only break if piwb is the same that failed to prepare */
+ if (md->piwb == piwb)
+ md->piwb = NULL;
+ spin_unlock(&md->md_lock);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
ploop_free_piwb(piwb);
}
@@ -1335,7 +1344,9 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
map_index_t *to;
int ret = 0;
u32 tmp_clu;
+#ifdef PLOOP_DELAYWB
unsigned long flags;
+#endif
/* Cluster index related to the page[page_id] start */
clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -1347,7 +1358,6 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
*dst_clu = tmp_clu;
already_alloced = true;
}
- kunmap_local(to);
if (already_alloced)
goto out;
@@ -1356,17 +1366,16 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
if (unlikely(ret < 0))
goto out;
- to = kmap_local_page(piwb->bat_page);
- spin_lock_irqsave(&piwb->md->md_lock, flags);
WRITE_ONCE(to[clu], *dst_clu);
- WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop));
- spin_unlock_irqrestore(&piwb->md->md_lock, flags);
- kunmap_local(to);
#ifdef PLOOP_DELAYWB
+ spin_lock_irqsave(&piwb->md->md_lock, flags);
to = piwb->md->kmpage;
WRITE_ONCE(to[clu], *dst_clu);
+ WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop));
+ spin_unlock_irqrestore(&piwb->md->md_lock, flags);
#endif
out:
+ kunmap_local(to);
return ret;
}
@@ -1635,6 +1644,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
struct md_page *md;
unsigned long flags;
map_index_t *to;
+ bool add_to_wblist = false;
WARN_ON_ONCE(cow->aux_pio->queue_list_id != PLOOP_LIST_COW);
page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1652,11 +1662,14 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
spin_unlock_irqrestore(&ploop->bat_lock, flags);
goto err_resource;
}
- ploop_md_make_dirty(ploop, md);
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
}
- spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
piwb = md->piwb;
+ if (WARN_ON(!piwb))
+ PL_ERR("cow no piwb:%d\n", add_to_wblist);
+
+ set_bit(MD_UPDATING, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
clu -= page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -1678,6 +1691,10 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
cow->dst_clu = BAT_ENTRY_NONE;
llist_add((struct llist_node *)(&cow->aux_pio->list), &piwb->cow_llist);
+
+ if (add_to_wblist)
+ ploop_add_dirty_for_wb(ploop, md);
+ clear_bit(MD_UPDATING, &md->status);
out:
return;
err_resource:
@@ -1740,6 +1757,7 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
int err;
unsigned long flags;
struct file *file;
+ bool add_to_wblist = false;
WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED);
spin_lock_irqsave(&ploop->bat_lock, flags);
@@ -1749,6 +1767,12 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
}
+ /*
+ * Flag page as updating, so if there is already a pending update it
+ * waits for the new update to be ready before processing it
+ */
+ set_bit(MD_UPDATING, &md->status);
+
if (!test_bit(MD_DIRTY, &md->status)) {
/* Locked since MD_DIRTY is set and cleared concurrently */
page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1757,19 +1781,18 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
spin_unlock_irqrestore(&ploop->bat_lock, flags);
goto error;
}
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
bat_update_prepared = true;
- ploop_md_make_dirty(ploop, md);
}
- spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
piwb = md->piwb;
+ WARN_ON_ONCE(!piwb);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
file = ploop_top_delta(ploop)->mtfile[pio->runner_id];
err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu, file);
if (err) {
pio->bi_status = errno_to_blk_status(err);
- clear_bit(MD_DIRTY, &md->status);
goto error;
}
@@ -1787,13 +1810,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
ploop_attach_end_action(pio, piwb);
#endif
attached = true;
+ if (add_to_wblist)
+ ploop_add_dirty_for_wb(ploop, md);
+ clear_bit(MD_UPDATING, &md->status);
out:
return attached;
error:
/* Uninit piwb */
if (bat_update_prepared)
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
ploop_pio_endio(pio);
+
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ if (add_to_wblist)
+ clear_bit(MD_DIRTY, &md->status);
+ clear_bit(MD_UPDATING, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
return false;
}
@@ -1992,6 +2024,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
struct md_page *md;
map_index_t *to;
unsigned long flags;
+ bool add_to_wblist = false;
WARN_ON(ploop->nr_deltas != 1 ||
pio->queue_list_id != PLOOP_LIST_DISCARD);
@@ -2004,17 +2037,18 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
return;
}
+ set_bit(MD_UPDATING, &md->status);
if (!test_bit(MD_DIRTY, &md->status)) {
if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) {
pio->bi_status = BLK_STS_RESOURCE;
goto err_unlck;
}
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
bat_update_prepared = true;
- ploop_md_make_dirty(ploop, md);
}
- spin_unlock_irqrestore(&ploop->bat_lock, flags);
piwb = md->piwb;
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
/* Cluster index related to the page[page_id] start */
clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -2022,7 +2056,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
to = kmap_local_page(piwb->bat_page);
if (WARN_ON_ONCE(!to[clu])) {
pio->bi_status = BLK_STS_IOERR;
- clear_bit(MD_DIRTY, &md->status);
+ kunmap_local(to);
goto err;
} else {
WRITE_ONCE(to[clu], 0);
@@ -2031,14 +2065,22 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
kunmap_local(to);
ploop_md_up_prio(ploop, md);
+ if (add_to_wblist)
+ ploop_add_dirty_for_wb(ploop, md);
+ clear_bit(MD_UPDATING, &md->status);
return;
err_unlck:
spin_unlock_irqrestore(&ploop->bat_lock, flags);
err:
if (bat_update_prepared)
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
ploop_pio_endio(pio);
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ if (add_to_wblist)
+ clear_bit(MD_DIRTY, &md->status);
+ clear_bit(MD_UPDATING, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
}
static inline int ploop_runners_have_pending(struct ploop *ploop)
@@ -2067,21 +2109,46 @@ static inline int ploop_submit_metadata_writeback(struct ploop *ploop, int force
*/
llist_for_each_safe(pos, t, ll_wb_batch) {
md = list_entry((struct list_head *)pos, typeof(*md), wb_link);
+ INIT_LIST_HEAD(&md->wb_link);
/* XXX: fixme delay results in a hang - TBD */
if (1 || !llist_empty(&md->wait_llist) || force ||
test_bit(MD_HIGHPRIO, &md->status) ||
time_before(md->dirty_timeout, timeout) ||
ploop->force_md_writeback) {
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ if (test_bit(MD_UPDATING, &md->status) ||
+ test_bit(MD_WRITEBACK, &md->status)) {
+ /* if updating or there is a writeback then delay */
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
+ llist_add((struct llist_node *)&md->wb_link,
+ &ploop->wb_batch_llist);
+ continue;
+ }
+
/* L1L2 mustn't be redirtyed, when wb in-flight! */
- WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status));
- WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
- set_bit(MD_WRITEBACK, &md->status);
- clear_bit(MD_DIRTY, &md->status);
+ if (WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status)))
+ PL_ERR("wb but dirty not set\n");
+
+ if (WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status)))
+ PL_ERR("wb but writeback already set\n");
+
clear_bit(MD_HIGHPRIO, &md->status);
- ploop_index_wb_submit(ploop, md->piwb);
- ret++;
+ if (md->piwb) {
+ struct ploop_index_wb *piwb;
+ /* New updates will go in wait list */
+ set_bit(MD_WRITEBACK, &md->status);
+ clear_bit(MD_DIRTY, &md->status);
+ /* at this point we can clear md->piwb */
+ piwb = md->piwb;
+ md->piwb = NULL;
+ /* hand off to threads */
+ ploop_index_wb_submit(ploop, piwb);
+ ret++;
+ } else {
+ PL_ERR("Unexpected md piwb is null");
+ }
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
} else {
- INIT_LIST_HEAD(&md->wb_link);
llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
}
}
@@ -2607,6 +2674,7 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
type = PIWB_TYPE_RELOC;
err = -EIO;
+ spin_lock_irq(&ploop->bat_lock);
if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) {
PL_ERR("Unexpected md status: %lx", md->status);
goto out_error;
@@ -2629,12 +2697,13 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
if (err)
goto out_reset;
}
+ spin_unlock_irq(&ploop->bat_lock);
*ret_md = md;
return 0;
out_reset:
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
out_error:
return err;
}
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 43e65e841e4a..90cf08472660 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -116,9 +116,10 @@ struct ploop_index_wb {
struct md_page {
struct rb_node node;
u32 id; /* Number of this page starting from hdr */
-#define MD_DIRTY (1U << 1) /* Page contains changes and wants writeback */
-#define MD_WRITEBACK (1U << 2) /* Writeback was submitted */
+#define MD_DIRTY (1U << 1) /* Page contains changes and awaits writeback */
+#define MD_WRITEBACK (1U << 2) /* Writeback in progress */
#define MD_HIGHPRIO (1U << 3) /* High priority writeback */
+#define MD_UPDATING (1U << 4) /* Preparing an update */
unsigned long status;
struct page *page;
void *kmpage;
@@ -611,7 +612,8 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu,
extern int ploop_prepare_reloc_index_wb(struct ploop *ploop,
struct md_page **ret_md, u32 clu, u32 *dst_clu,
struct file *file);
-extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md);
+extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+ struct ploop_index_wb *piwb);
extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *);
extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv,
char *result, unsigned int maxlen);
--
2.43.0
More information about the Devel
mailing list