[Devel] [PATCH vz9/vz10] dm-ploop: fix concurrent COW operations on same cluster

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Thu Sep 18 09:34:35 MSK 2025



On 9/16/25 22:02, Vasileios Almpanis wrote:
> When multiple threads attempt operations on the same cluster simultaneously,
> the current code logs warnings but continues with potentially conflicting
> operations, leading to WARN_ON triggers.
> 
> Fix by implementing proper deferral when cluster locks fail:
> 
> 1. Change ploop_link_pio() to add identical pios to the lock holder's
>     endio list instead of issuing warnings and continuing
> 2. Rename ploop_add_cluster_lk() to ploop_try_add_cluster_lk() to reflect
>     the new behavior
> 3. Update COW and discard paths to abort operations when cluster locking
>     fails, allowing proper deferral.
> 
> This ensures that conflicting operations are serialized rather than
> corrupting shared data structures.
> 
> Observed during snapshot creation where filesystem metadata updates
> trigger concurrent operations on the same clusters.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-115446
> Signed-off-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>
> 
> Feature: dm-ploop: ploop target driver
> ---
>   drivers/md/dm-ploop-map.c | 33 +++++++++++++++++++--------------
>   1 file changed, 19 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
> index 56481d0f8323..db89a668e766 100644
> --- a/drivers/md/dm-ploop-map.c
> +++ b/drivers/md/dm-ploop-map.c
> @@ -535,10 +535,10 @@ static int ploop_link_pio(struct hlist_head head[], struct pio *pio,
>   		if (pe == pio)
>   			return 1;
>   
> -		WARN_ON_ONCE(pe != NULL);
> -
> -		if (pe)
> -			PL_ERR("clu:%u already exclusively locked\n", clu);
> +		if (pe) {
> +			ploop_add_endio_pio(pe, pio);
> +			return 0;
> +		}
>   	}
>   
>   	if (!hlist_unhashed_lockless(&pio->hlist_node)) {
> @@ -569,7 +569,7 @@ static int ploop_unlink_pio(struct ploop *ploop, struct pio *pio,
>   	return 1;
>   }
>   
> -static int ploop_add_cluster_lk(struct ploop *ploop, struct pio *pio, u32 clu)
> +static int ploop_try_add_cluster_lk(struct ploop *ploop, struct pio *pio, u32 clu)
>   {
>   	unsigned long flags;
>   	int ret;
> @@ -741,6 +741,10 @@ static int ploop_handle_discard_pio(struct ploop *ploop, struct pio *pio,
>   	if (ploop->nr_deltas != 1)
>   		goto punch_hole;
>   
> +	if (!ploop_try_add_cluster_lk(ploop, pio, clu)) {
> +		return 1;
> +	}

Please remove excess {} brackets.

 > Do not unnecessarily use braces where a single statement will do.

https://www.kernel.org/doc/html/v4.10/process/coding-style.html#placing-braces-and-spaces

> +
>   	spin_lock_irqsave(&ploop->inflight_lock, flags);
>   	inflight_h = ploop_find_inflight_bio(ploop, clu);
>   	if (inflight_h)
> @@ -754,8 +758,6 @@ static int ploop_handle_discard_pio(struct ploop *ploop, struct pio *pio,
>   		return 0;
>   	}
>   
> -	if (!ploop_add_cluster_lk(ploop, pio, clu))
> -		PL_ERR("dis clu %d already locked\n", clu);

Why do we move it above inflight-delay code? Is there a specific reason? 
Except that code looks good.

>   	pio->wants_discard_index_cleanup = true;
>   
>   punch_hole:
> @@ -1585,12 +1587,18 @@ static int ploop_submit_cluster_cow(struct ploop *ploop, unsigned int level,
>   {
>   	struct ploop_cow *cow = NULL;
>   	struct pio *aux_pio = NULL;
> +	int ret = 0;
>   
>   	/* Prepare new delta read */
>   	aux_pio = ploop_alloc_pio_with_pages(ploop);
>   	cow = kmem_cache_alloc(cow_cache, GFP_ATOMIC);
> -	if (!aux_pio || !cow)
> -		goto err;
> +	if (!aux_pio || !cow) {
> +		ret = -ENOMEM;
> +		goto cleanup;
> +	}

Please add newline here to separate allocation checks from cluster 
locking to improve code readability.

> +	if (!ploop_try_add_cluster_lk(ploop, cow_pio, clu))
> +		goto cleanup;
> +
>   	ploop_init_pio(ploop, REQ_OP_READ, aux_pio);
>   	ploop_pio_prepare_offsets(ploop, aux_pio, clu);
>   	aux_pio->css = cow_pio->css;
> @@ -1602,17 +1610,14 @@ static int ploop_submit_cluster_cow(struct ploop *ploop, unsigned int level,
>   	cow->aux_pio = aux_pio;
>   	cow->cow_pio = cow_pio;
>   
> -	if (!ploop_add_cluster_lk(ploop, cow_pio, clu))
> -		PL_ERR("cowclu %d already locked\n", clu);
> -
>   	/* Stage #0: read secondary delta full clu */
>   	ploop_map_and_submit_rw(ploop, dst_clu, aux_pio, level);
>   	return 0;
> -err:
> +cleanup:
>   	if (aux_pio)
>   		ploop_free_pio_with_pages(ploop, aux_pio);
>   	kfree(cow);
> -	return -ENOMEM;
> +	return ret;
>   }
>   
>   static int ploop_initiate_cluster_cow(struct ploop *ploop, unsigned int level,

-- 
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.



More information about the Devel mailing list