[Devel] [PATCH vz9 v2] dm-ploop: fix resize and grow to use the new way of updating md pages

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Tue Apr 1 12:43:32 MSK 2025


On 3/27/25 13:18, Alexander Atanasov wrote:
> We missed the fact that resize/grow touch md0 page to update ploop
> parameters, other pages are not linked so there is no issue.
> But in case there is a concurrent update to md0 piwb is not handled
> correctly, also using sync updates in parallel is not okay.
> To fix this update code to use the new mechanism with MD_UPDATING
> flag and instead of using sync operations pass the updates to
> runner threads.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-101871
> Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
> ---
>   drivers/md/dm-ploop-cmd.c | 39 ++++++++++++++++++++++++++++++++-------
>   drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++---------
>   drivers/md/dm-ploop.h     |  7 ++++++-
>   3 files changed, 66 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
> index d2eb4797ab6e..7d79b900eb1a 100644
> --- a/drivers/md/dm-ploop-cmd.c
> +++ b/drivers/md/dm-ploop-cmd.c
> @@ -286,6 +286,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
>   	struct md_page *md;
>   	bool is_locked;
>   	int ret = 0;
> +	int tries = 5;
> +	int add_for_wb = 0;
>   
>   	dst_clu = cmd->resize.dst_clu;
>   
> @@ -308,6 +310,7 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
>   	}
>   	spin_unlock_irq(&ploop->bat_lock);
>   
> +reread:
>   	/* Read full clu sync */
>   	ret = ploop_read_cluster_sync(ploop, pio, dst_clu);
>   	if (ret < 0) {
> @@ -316,8 +319,13 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
>   	}
>   
>   	ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst,
> -					   ploop_top_delta(ploop)->file);
> +					   ploop_top_delta(ploop)->file,
> +					   &add_for_wb);
>   	if (ret < 0) {
> +		if (ret == -EBUSY && tries-- > 0) {
> +			PL_ERR("md0 busy, retry:%d\n", tries);
> +			goto reread;
> +		}
>   		PL_ERR("reloc: can't prepare it: %d", ret);
>   		goto out;
>   	}
> @@ -332,13 +340,16 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
>   		goto out;
>   	}
>   
> -	set_bit(MD_WRITEBACK, &md->status);
>   	init_completion(&comp);
>   	piwb->comp = &comp;
>   	piwb->comp_bi_status = &bi_status;
>   	/* Write new index on disk */
> -	ploop_index_wb_submit(ploop, piwb);
> +	ploop_disable_writeback_delay(ploop);
> +	if (add_for_wb)
> +		ploop_add_dirty_for_wb(ploop, md);
> +	clear_bit(MD_UPDATING, &md->status);
>   	wait_for_completion(&comp);
> +	ploop_enable_writeback_delay(ploop);
>   
>   	ret = blk_status_to_errno(bi_status);
>   	if (ret) {
> @@ -378,12 +389,22 @@ static int ploop_grow_update_header(struct ploop *ploop,
>   	struct md_page *md;
>   	u64 sectors;
>   	int ret;
> +	int tries = 5;
> +	int add_for_wb = false;
>   
> +retry:
>   	/* hdr is in the same page as bat_entries[0] index */
>   	ret = ploop_prepare_reloc_index_wb(ploop, &md, 0, NULL,
> -					   ploop_top_delta(ploop)->file);
> -	if (ret)
> +					   ploop_top_delta(ploop)->file,
> +					   &add_for_wb);
> +	if (ret) {
> +		if (ret == -EBUSY && tries-- > 0) {
> +			PL_ERR("md0 busy, retry:%d\n", tries);
> +			schedule();
> +			goto retry;
> +		}
>   		return ret;
> +	}
>   	piwb = md->piwb;
>   
>   	size = (PLOOP_MAP_OFFSET + cmd->resize.nr_bat_entries);
> @@ -398,12 +419,16 @@ static int ploop_grow_update_header(struct ploop *ploop,
>   	offset = hdr->m_FirstBlockOffset = cpu_to_le32(first_block_off);
>   	kunmap_local(hdr);
>   
> -	set_bit(MD_WRITEBACK, &md->status);
>   	init_completion(&comp);
>   	piwb->comp = &comp;
>   	piwb->comp_bi_status = &bi_status;
> -	ploop_index_wb_submit(ploop, piwb);
> +
> +	ploop_disable_writeback_delay(ploop);
> +	if (add_for_wb)

I think this can be very dangerous. If add_for_wb is 0, then we have no 
control over metadata writeback.
In this function we rely on piwb->compl, which we set before firing the 
writeback. But if we do not fire the writeback ourselves, we have no 
guarantee that piwb is valid at the time we set compl to it, as someone 
may already added it to the list and it was already completed and freed.
Also I think it is not valid to work with piwb->bat_page for the same 
reason: it should be added to the writeback list ONLY after we did our 
manipulations, and we have no control over it if add_for_wb == 0

> +		ploop_add_dirty_for_wb(ploop, md);
> +	clear_bit(MD_UPDATING, &md->status);
>   	wait_for_completion(&comp);
> +	ploop_enable_writeback_delay(ploop);
>   
>   	ret = blk_status_to_errno(bi_status);
>   	if (!ret) {
> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
> index ef280a8b0f90..04e71c851b64 100644
> --- a/drivers/md/dm-ploop-map.c
> +++ b/drivers/md/dm-ploop-map.c
> @@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio)
>   		ploop_dispatch_pios(ploop, NULL, &pio_list);
>   }
>   
> -static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
> -{
> -	llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
> -}
> -
>   static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
>   {
>   	bool new = false;
> @@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
>   	struct pio *pio;
>   	map_index_t *to;
>   	u8 level;
> +	int ret = -ENOMEM;
>   
>   	lockdep_assert_held(&ploop->bat_lock);
>   
> @@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
>   	if (WARN_ON(md->piwb)) {
>   		PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n",
>   			md, md->piwb, md->piwb->type, type);
> +		spin_unlock(&md->md_lock);
> +		ret = -EBUSY;
> +		goto err;
>   	}
>   	md->piwb = piwb;
>   	piwb->md = md;
> @@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
>   	return 0;
>   err:
>   	ploop_free_piwb(piwb);
> -	return -ENOMEM;
> +	return ret;
>   }
>   
>   void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
> @@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop *ploop, struct pio *pio)
>    * another index instead of existing. If so, management of
>    * old bat_entries[@clu] and of related holes_bitmap bit
>    * is caller duty.
> + * Caller must clear MD_UPDATING and comply to add_for_wb
>    */
>   int ploop_prepare_reloc_index_wb(struct ploop *ploop,
>   				 struct md_page **ret_md,
>   				 u32 clu, u32 *dst_clu,
> -				 struct file *file)
> +				 struct file *file,
> +				 int *add_for_wb)
>   {
>   	enum piwb_type type = PIWB_TYPE_ALLOC;
>   	u32 page_id = ploop_bat_clu_to_page_nr(clu);
>   	struct md_page *md = ploop_md_page_find(ploop, page_id);
>   	struct ploop_index_wb *piwb;
>   	int err;
> +	bool add_to_wblist;
>   
>   	if (dst_clu)
>   		type = PIWB_TYPE_RELOC;
>   
>   	err = -EIO;
> +
> +

extra space

>   	spin_lock_irq(&ploop->bat_lock);
> -	if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) {
> -		PL_ERR("Unexpected md status: %lx", md->status);
> +	spin_lock(&md->md_lock);
> +	if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)
> +	    || test_bit(MD_UPDATING, &md->status)) {
> +		err = -EBUSY;
> +		spin_unlock(&md->md_lock);
>   		goto out_error;
> +	} else {
> +		set_bit(MD_UPDATING, &md->status);
>   	}
> +	spin_unlock(&md->md_lock);
> +
>   	err = ploop_prepare_bat_update(ploop, md, type);
>   	if (err)
>   		goto out_error;
>   
> +	add_to_wblist = ploop_md_make_dirty(ploop, md);

I think add_to_wblist will always be true here. We take bat_lock before 
test_bit(MD_DIRTY) and do not release it until ploop_md_make_dirty().
And the only way another thread can set MD_DIRTY is also via 
ploop_md_make_dirty(), which requires bat_lock to be taken, which we are 
still holding

> +
>   	piwb = md->piwb;
>   
>   	if (dst_clu) {
> @@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
>   	spin_unlock_irq(&ploop->bat_lock);
>   
>   	*ret_md = md;
> +	*add_for_wb = add_to_wblist ? 1 : 0;
> +
>   	return 0;
>   
>   out_reset:
>   	ploop_break_bat_update(ploop, md, piwb);
>   out_error:
> +	if (add_to_wblist)
> +		clear_bit(MD_DIRTY, &md->status);
> +	clear_bit(MD_UPDATING, &md->status);
>   	spin_unlock_irq(&ploop->bat_lock);
> +
>   	return err;
>   }
>   ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO);
> diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
> index 46450cac8c7d..db4d92c9679a 100644
> --- a/drivers/md/dm-ploop.h
> +++ b/drivers/md/dm-ploop.h
> @@ -578,6 +578,11 @@ static inline const char *ploop_device_name(struct ploop *ploop)
>   	return ploop->ti->table->md->disk->disk_name;
>   }
>   
> +static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
> +{
> +	llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
> +}
> +
>   #define PL_FMT(fmt) "ploop: %s: " fmt "\n"
>   #define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__)
>   #define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__)
> @@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu,
>   				    struct pio *pio, u8 level);
>   extern int ploop_prepare_reloc_index_wb(struct ploop *ploop,
>   					struct md_page **ret_md, u32 clu, u32 *dst_clu,
> -					struct file *file);
> +					struct file *file, int *add_for_wb);
>   extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
>   				   struct ploop_index_wb *piwb);
>   extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *);



More information about the Devel mailing list