[Devel] [PATCH RHEL9 COMMIT] dm-ploop: fix and rework md updates
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Feb 21 14:48:09 MSK 2025
The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.17
------>
commit 9caa1af11b0a57b2f99840bc09a3d539798cee78
Author: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
Date: Wed Feb 12 15:30:11 2025 +0200
dm-ploop: fix and rework md updates
With the locking reduced it opened windows for races between
running updates, pending and new updates. Current logic to
deal with them is not correct.
Current flags are:
MD_DIRTY - means md page is dirty and is waiting for writeback
MD_WRITEBACK - write back is in progress
But we drop the lock after we make page dirty so it races with
writeback.
To fix this introduce a new flag:
MD_UPDATING - which means we are updating the pending writeback
or we are creating a new one. Remove the check if piwb is present
on the page and clear piwb early on submit, check was using a type
alloc vs discard but in case of concurrent alloc and discard the piwb
is overwritten by the second operation. Using a flag solves this
and makes it more clear in what state page is.
Nice side effect of closing that race is that we have a minimalistic
delay for writeback updates. This happens when we try to submit a
pending writeback and we see the MD_UPDATING flag; the update is not
submitted but is retried on the next cycle. So if we have a lot of
updates to the same page in a very short period, they are accumulated
as one.
On error only clear MD_DIRTY if it is set by us before dropping
the lock and when breaking the prepared bat update only clear
md->piwb if it is the same as the piwb allocated in the function.
Add some clarification error messages around WARN_ONs.
While at it fix:
- mapping bat_page twice in the same function, do it only once
- missed unmapping on error path
- update proper bat levels in delayed update case
- locking around prepare_bat_update from ploop_prepare_reloc_index_wb
https://virtuozzo.atlassian.net/browse/VSTOR-98807
Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Reviewed-by: Konstantin Khorenko <khorenko at virtuozzo.com>
khorenko@: all the places where clear_bit() are called without a lock taken
require smp_mb__before_atomic() for ARM and possibly other arches (x86 arch is
fine even without it as LOCK prefix is used in clear_bit() operations and on x86
it guarantees atomicity and a "release" barrier, i.e. all stores before are not
reordered after the clear_bit(). The bit is checked always under a spin_lock(),
so read barrier is also implied there. On ARM clear_bit() does not imply any
memory barrier, so theoretically it's possible clear_bit() is reordered during
execution before some piwb dereference and piwb is freed after MD_UPDATING
is set and seen in parallel and we'll get the piwb use-after-free).
Feature: vStorage
---
drivers/md/dm-ploop-cmd.c | 5 +-
drivers/md/dm-ploop-map.c | 188 ++++++++++++++++++++++++++++++++++------------
drivers/md/dm-ploop.h | 8 +-
3 files changed, 146 insertions(+), 55 deletions(-)
diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index e032af52b64e..d2eb4797ab6e 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -315,10 +315,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
goto out;
}
- spin_lock_irq(&ploop->bat_lock);
ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst,
ploop_top_delta(ploop)->file);
- spin_unlock_irq(&ploop->bat_lock);
if (ret < 0) {
PL_ERR("reloc: can't prepare it: %d", ret);
goto out;
@@ -329,7 +327,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
ret = ploop_write_cluster_sync(ploop, pio, new_dst);
if (ret) {
PL_ERR("reloc: failed write: %d", ret);
- ploop_break_bat_update(ploop, md);
+ /* piwb can not change here we are the only user of md page */
+ ploop_break_bat_update(ploop, md, piwb);
goto out;
}
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index f6fc453f7464..f0706a03ce61 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -419,18 +419,15 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio *pio,
}
static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md,
- enum piwb_type type, struct pio *pio)
+ struct pio *pio)
{
- struct ploop_index_wb *piwb;
unsigned long flags;
bool busy = false;
WARN_ON_ONCE(!list_empty(&pio->list));
- /* lock protects piwb */
spin_lock_irqsave(&md->md_lock, flags); /* read */
- piwb = md->piwb;
- if (piwb && (piwb->type != type || test_bit(MD_WRITEBACK, &md->status))) {
+ if (test_bit(MD_WRITEBACK, &md->status) || test_bit(MD_UPDATING, &md->status)) {
llist_add((struct llist_node *)(&pio->list), &md->wait_llist);
busy = true;
}
@@ -616,6 +613,11 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio)
ploop_dispatch_pios(ploop, NULL, &pio_list);
}
+static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
+{
+ llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+}
+
static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
{
bool new = false;
@@ -628,10 +630,8 @@ static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
lockdep_assert_held(&ploop->bat_lock);
WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
- if (!test_and_set_bit(MD_DIRTY, &md->status)) {
- llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+ if (!test_and_set_bit(MD_DIRTY, &md->status))
new = true;
- }
md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000;
return new;
@@ -889,9 +889,6 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
#endif
WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status));
- /* protect piwb */
- clear_bit(MD_WRITEBACK, &md->status);
- md->piwb = NULL;
spin_unlock(&md->md_lock);
spin_unlock_irqrestore(&ploop->bat_lock, flags);
kunmap_local(dst_clu);
@@ -907,6 +904,11 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
if (!list_empty(&list))
ploop_dispatch_pios(ploop, NULL, &list);
+
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ /* Clear flag at the end when all processing related is done */
+ clear_bit(MD_WRITEBACK, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
}
static void ploop_free_piwb(struct ploop_index_wb *piwb)
@@ -1045,7 +1047,10 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
bat_entries = md->kmpage;
spin_lock(&md->md_lock); /* write */
- WARN_ON_ONCE(md->piwb);
+ if (WARN_ON(md->piwb)) {
+ PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n",
+ md, md->piwb, md->piwb->type, type);
+ }
md->piwb = piwb;
piwb->md = md;
@@ -1092,15 +1097,18 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
return -ENOMEM;
}
-void ploop_break_bat_update(struct ploop *ploop, struct md_page *md)
+void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+ struct ploop_index_wb *piwb)
{
- struct ploop_index_wb *piwb;
unsigned long flags;
- spin_lock_irqsave(&md->md_lock, flags); /* write */
- piwb = md->piwb;
- md->piwb = NULL;
- spin_unlock_irqrestore(&md->md_lock, flags);
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ spin_lock(&md->md_lock); /* write */
+ /* Only break if piwb is the same that failed to prepare */
+ if (md->piwb == piwb)
+ md->piwb = NULL;
+ spin_unlock(&md->md_lock);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
ploop_free_piwb(piwb);
}
@@ -1350,7 +1358,10 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
map_index_t *to;
int ret = 0;
u32 tmp_clu;
+#ifdef PLOOP_DELAYWB
unsigned long flags;
+ map_index_t *tomd;
+#endif
/* Cluster index related to the page[page_id] start */
clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -1362,7 +1373,6 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
*dst_clu = tmp_clu;
already_alloced = true;
}
- kunmap_local(to);
if (already_alloced)
goto out;
@@ -1371,17 +1381,16 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb,
if (unlikely(ret < 0))
goto out;
- to = kmap_local_page(piwb->bat_page);
- spin_lock_irqsave(&piwb->md->md_lock, flags);
WRITE_ONCE(to[clu], *dst_clu);
+#ifdef PLOOP_DELAYWB
+ spin_lock_irqsave(&piwb->md->md_lock, flags);
+ tomd = piwb->md->kmpage;
+ WRITE_ONCE(tomd[clu], *dst_clu);
WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop));
spin_unlock_irqrestore(&piwb->md->md_lock, flags);
- kunmap_local(to);
-#ifdef PLOOP_DELAYWB
- to = piwb->md->kmpage;
- WRITE_ONCE(to[clu], *dst_clu);
#endif
out:
+ kunmap_local(to);
return ret;
}
@@ -1650,6 +1659,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
struct md_page *md;
unsigned long flags;
map_index_t *to;
+ bool add_to_wblist = false;
WARN_ON_ONCE(cow->aux_pio->queue_list_id != PLOOP_LIST_COW);
page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1657,7 +1667,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
spin_lock_irqsave(&ploop->bat_lock, flags);
/* MD_DIRTY is set and cleared concurrently, do it under bat_lock */
- if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio)) {
+ if (ploop_delay_if_md_busy(ploop, md, cow->aux_pio)) {
spin_unlock_irqrestore(&ploop->bat_lock, flags);
goto out;
}
@@ -1667,11 +1677,14 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
spin_unlock_irqrestore(&ploop->bat_lock, flags);
goto err_resource;
}
- ploop_md_make_dirty(ploop, md);
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
}
- spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
piwb = md->piwb;
+ if (WARN_ON(!piwb))
+ PL_ERR("cow no piwb:%d\n", add_to_wblist);
+
+ set_bit(MD_UPDATING, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
clu -= page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -1693,6 +1706,16 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
cow->dst_clu = BAT_ENTRY_NONE;
llist_add((struct llist_node *)(&cow->aux_pio->list), &piwb->cow_llist);
+
+ if (add_to_wblist)
+ ploop_add_dirty_for_wb(ploop, md);
+
+ /*
+ * Avoid possible use-after-free of piwb if it's freed while we
+ * still use it in the code up to previous spin_unlock.
+ */
+ smp_mb__before_atomic();
+ clear_bit(MD_UPDATING, &md->status);
out:
return;
err_resource:
@@ -1755,15 +1778,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
int err;
unsigned long flags;
struct file *file;
+ bool add_to_wblist = false;
WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED);
spin_lock_irqsave(&ploop->bat_lock, flags);
- if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio)) {
+ if (ploop_delay_if_md_busy(ploop, md, pio)) {
spin_unlock_irqrestore(&ploop->bat_lock, flags);
goto out;
}
+ /*
+ * Flag page as updating, so if there is already a pending update it
+ * waits for the new update to be ready before processing it
+ */
+ set_bit(MD_UPDATING, &md->status);
+
if (!test_bit(MD_DIRTY, &md->status)) {
/* Locked since MD_DIRTY is set and cleared concurrently */
page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1772,19 +1802,18 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
spin_unlock_irqrestore(&ploop->bat_lock, flags);
goto error;
}
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
bat_update_prepared = true;
- ploop_md_make_dirty(ploop, md);
}
- spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
piwb = md->piwb;
+ WARN_ON_ONCE(!piwb);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
file = ploop_top_delta(ploop)->mtfile[pio->runner_id];
err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu, file);
if (err) {
pio->bi_status = errno_to_blk_status(err);
- clear_bit(MD_DIRTY, &md->status);
goto error;
}
@@ -1802,13 +1831,28 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
ploop_attach_end_action(pio, piwb);
#endif
attached = true;
+ if (add_to_wblist)
+ ploop_add_dirty_for_wb(ploop, md);
+
+ /*
+ * Avoid possible use-after-free of piwb if it's freed while we
+ * still use it in the code up to previous spin_unlock.
+ */
+ smp_mb__before_atomic();
+ clear_bit(MD_UPDATING, &md->status);
out:
return attached;
error:
/* Uninit piwb */
if (bat_update_prepared)
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
ploop_pio_endio(pio);
+
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ if (add_to_wblist)
+ clear_bit(MD_DIRTY, &md->status);
+ clear_bit(MD_UPDATING, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
return false;
}
@@ -2007,6 +2051,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
struct md_page *md;
map_index_t *to;
unsigned long flags;
+ bool add_to_wblist = false;
WARN_ON(ploop->nr_deltas != 1 ||
pio->queue_list_id != PLOOP_LIST_DISCARD);
@@ -2014,22 +2059,23 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
page_id = ploop_bat_clu_to_page_nr(clu);
md = ploop_md_page_find(ploop, page_id);
spin_lock_irqsave(&ploop->bat_lock, flags);
- if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio)) {
+ if (ploop_delay_if_md_busy(ploop, md, pio)) {
spin_unlock_irqrestore(&ploop->bat_lock, flags);
return;
}
+ set_bit(MD_UPDATING, &md->status);
if (!test_bit(MD_DIRTY, &md->status)) {
if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) {
pio->bi_status = BLK_STS_RESOURCE;
goto err_unlck;
}
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
bat_update_prepared = true;
- ploop_md_make_dirty(ploop, md);
}
- spin_unlock_irqrestore(&ploop->bat_lock, flags);
piwb = md->piwb;
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
/* Cluster index related to the page[page_id] start */
clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
@@ -2037,7 +2083,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
to = kmap_local_page(piwb->bat_page);
if (WARN_ON_ONCE(!to[clu])) {
pio->bi_status = BLK_STS_IOERR;
- clear_bit(MD_DIRTY, &md->status);
+ kunmap_local(to);
goto err;
} else {
WRITE_ONCE(to[clu], 0);
@@ -2046,14 +2092,28 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
kunmap_local(to);
ploop_md_up_prio(ploop, md);
+ if (add_to_wblist)
+ ploop_add_dirty_for_wb(ploop, md);
+
+ /*
+ * Avoid possible use-after-free of piwb if it's freed while we
+ * still use it in the code up to previous spin_unlock.
+ */
+ smp_mb__before_atomic();
+ clear_bit(MD_UPDATING, &md->status);
return;
err_unlck:
spin_unlock_irqrestore(&ploop->bat_lock, flags);
err:
if (bat_update_prepared)
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
ploop_pio_endio(pio);
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ if (add_to_wblist)
+ clear_bit(MD_DIRTY, &md->status);
+ clear_bit(MD_UPDATING, &md->status);
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
}
static inline int ploop_runners_have_pending(struct ploop *ploop)
@@ -2082,22 +2142,49 @@ static inline int ploop_submit_metadata_writeback(struct ploop *ploop, int force
*/
llist_for_each_safe(pos, t, ll_wb_batch) {
md = list_entry((struct list_head *)pos, typeof(*md), wb_link);
+ INIT_LIST_HEAD(&md->wb_link);
/* XXX: fixme delay results in a hang - TBD */
if (1 || !llist_empty(&md->wait_llist) || force ||
test_bit(MD_HIGHPRIO, &md->status) ||
time_before(md->dirty_timeout, timeout) ||
ploop->force_md_writeback) {
+ spin_lock_irqsave(&ploop->bat_lock, flags);
+ if (test_bit(MD_UPDATING, &md->status) ||
+ test_bit(MD_WRITEBACK, &md->status)) {
+ /* if updating or there is a writeback then delay */
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
+ llist_add((struct llist_node *)&md->wb_link,
+ &ploop->wb_batch_llist);
+ continue;
+ }
+
/* L1L2 mustn't be redirtyed, when wb in-flight! */
- WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status));
- WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
- set_bit(MD_WRITEBACK, &md->status);
- clear_bit(MD_DIRTY, &md->status);
+ WARN_ONCE(!test_bit(MD_DIRTY, &md->status),
+ PL_FMT("wb but dirty not set"),
+ ploop_device_name(ploop));
+
+ WARN_ONCE(test_bit(MD_WRITEBACK, &md->status),
+ PL_FMT("wb but writeback already set"),
+ ploop_device_name(ploop));
+
clear_bit(MD_HIGHPRIO, &md->status);
- ploop_index_wb_submit(ploop, md->piwb);
- ret++;
+ if (md->piwb) {
+ struct ploop_index_wb *piwb;
+ /* New updates will go in wait list */
+ set_bit(MD_WRITEBACK, &md->status);
+ clear_bit(MD_DIRTY, &md->status);
+ /* at this point we can clear md->piwb */
+ piwb = md->piwb;
+ md->piwb = NULL;
+ /* hand off to threads */
+ ploop_index_wb_submit(ploop, piwb);
+ ret++;
+ } else {
+ PL_ERR("Unexpected md piwb is null");
+ }
+ spin_unlock_irqrestore(&ploop->bat_lock, flags);
} else {
- INIT_LIST_HEAD(&md->wb_link);
- llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+ ploop_add_dirty_for_wb(ploop, md);
}
}
return ret;
@@ -2637,6 +2724,7 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
type = PIWB_TYPE_RELOC;
err = -EIO;
+ spin_lock_irq(&ploop->bat_lock);
if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) {
PL_ERR("Unexpected md status: %lx", md->status);
goto out_error;
@@ -2659,13 +2747,15 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
if (err)
goto out_reset;
}
+ spin_unlock_irq(&ploop->bat_lock);
*ret_md = md;
return 0;
out_reset:
- ploop_break_bat_update(ploop, md);
+ ploop_break_bat_update(ploop, md, piwb);
out_error:
+ spin_unlock_irq(&ploop->bat_lock);
return err;
}
ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO);
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 06d95c525c90..46450cac8c7d 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -116,9 +116,10 @@ struct ploop_index_wb {
struct md_page {
struct rb_node node;
u32 id; /* Number of this page starting from hdr */
-#define MD_DIRTY (1U << 1) /* Page contains changes and wants writeback */
-#define MD_WRITEBACK (1U << 2) /* Writeback was submitted */
+#define MD_DIRTY (1U << 1) /* Page contains changes and awaits writeback */
+#define MD_WRITEBACK (1U << 2) /* Writeback in progress */
#define MD_HIGHPRIO (1U << 3) /* High priority writeback */
+#define MD_UPDATING (1U << 4) /* Preparing an update */
unsigned long status;
struct page *page;
void *kmpage;
@@ -612,7 +613,8 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu,
extern int ploop_prepare_reloc_index_wb(struct ploop *ploop,
struct md_page **ret_md, u32 clu, u32 *dst_clu,
struct file *file);
-extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md);
+extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+ struct ploop_index_wb *piwb);
extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *);
extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv,
char *result, unsigned int maxlen);
More information about the Devel
mailing list