[Devel] [RFC PATCH vz9 v6 39/62] dm-ploop: prepare bat updates under bat_lock

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Fri Dec 20 16:24:57 MSK 2024



On 12/5/24 22:56, Alexander Atanasov wrote:
> Prepare for threads. When preparing bat updates there are two important
> things to protect - md->status MD_DIRTY bit  and holes bitmap.
> Use bat_lock to protect them.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-91821
> Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
> ---
>   drivers/md/dm-ploop-map.c | 83 ++++++++++++++++++++++++++++-----------
>   1 file changed, 61 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
> index bb15fa1f854b..0eaab61a0c31 100644
> --- a/drivers/md/dm-ploop-map.c
> +++ b/drivers/md/dm-ploop-map.c
> @@ -555,13 +555,18 @@ static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
>   	unsigned long flags;
>   	bool new = false;
>   
> -	spin_lock_irqsave(&ploop->bat_lock, flags);
> +	/*
> +	 * Usual pattern is check for dirty, prepare bat update
> +	 * and then call make_dirty - to avoid races callers must lock
> +	 */
> +
> +	lockdep_assert_held(&ploop->bat_lock);
> +
>   	WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
>   	if (!test_and_set_bit(MD_DIRTY, &md->status)) {
>   		llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
>   		new = true;
>   	}
> -	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   	md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000;
>   
>   	return new;
> @@ -807,6 +812,8 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
>   
>   	dst_clu = piwb->kmpage;
>   
> +	/* holes bit map requires bat_lock */
> +	spin_lock_irqsave(&ploop->bat_lock, flags);
>   	spin_lock(&md->md_lock);
>   	if (piwb->type == PIWB_TYPE_ALLOC)
>   		goto skip_apply;
> @@ -833,6 +840,7 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop,
>   	clear_bit(MD_WRITEBACK, &md->status);
>   	md->piwb = NULL;
>   	spin_unlock(&md->md_lock);
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	wait_llist_pending = llist_del_all(&md->wait_llist);
>   	if (wait_llist_pending) {
> @@ -1077,10 +1085,22 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu)
>   	u32 clu_size = CLU_SIZE(ploop);
>   	loff_t off, pos, end, old_size;
>   	struct file *file = top->file;
> +	unsigned long flags;
>   	int ret;
>   
> -	if (ploop_find_dst_clu_bit(ploop, dst_clu) < 0)
> +	spin_lock_irqsave(&ploop->bat_lock, flags);
> +	if (ploop_find_dst_clu_bit(ploop, dst_clu) < 0) {
> +		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   		return -EIO;
> +	}
> +	/*
> +	 * Mark clu as used. Find & clear bit must be locked,
> +	 * since  this may be called from different threads.
> +	 * Note, that set_bit may be made from many places.
> +	 * We only care to clear what find got. parallel set is ok.
> +	 */
> +	ploop_hole_clear_bit(*dst_clu, ploop);
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	pos = CLU_TO_POS(ploop, *dst_clu);
>   	end = pos + clu_size;
> @@ -1094,6 +1114,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu)
>   		else
>   			ret = ploop_zero_range(file, pos, off - pos);
>   		if (ret) {
> +			ploop_hole_set_bit(*dst_clu, ploop);
> +
>   			if (ret != -EOPNOTSUPP)
>   				PL_ERR("punch/zero area: %d", ret);
>   			else if (ploop->falloc_new_clu)
> @@ -1108,18 +1130,15 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu)
>   
>   	if (end > old_size) {
>   		ret = ploop_truncate_prealloc_safe(ploop, top, end, __func__);
> -		if (ret)
> +		if (ret) {
> +			ploop_hole_set_bit(*dst_clu, ploop);
>   			return ret;
> +		}
>   	}
>   
>   	if (end > top->file_preallocated_area_start)
>   		top->file_preallocated_area_start = end;

I think the whole ploop_md_make_dirty() path is also racy. If we have parallel
allocations, we also need to protect the ploop fields. In particular, if there
is too much allocation going on at the same time (more than the prealloc size),
we may end up with racing truncates using different file lengths.


> -	/*
> -	 * Mark clu as used. Find & clear bit is unlocked,
> -	 * since currently this may be called only from deferred
> -	 * kwork. Note, that set_bit may be made from many places.
> -	 */
> -	ploop_hole_clear_bit(*dst_clu, ploop);
> +
>   	return 0;
>   }
>   ALLOW_ERROR_INJECTION(ploop_allocate_cluster, ERRNO);
> @@ -1421,15 +1440,21 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow)
>   	page_id = ploop_bat_clu_to_page_nr(clu);
>   	md = ploop_md_page_find(ploop, page_id);
>   
> -	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio))
> +	spin_lock_irqsave(&ploop->bat_lock, flags);
> +	/* MD_DIRTY is set and cleared concurrently, do it under bat_lock */
> +	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio)) {
> +		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   		goto out;
> +	}
>   
>   	if (!test_bit(MD_DIRTY, &md->status)) {
> -		/* MD_DIRTY is set and cleared from this work */
> -		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0)
> +		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0) {
> +			spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   			goto err_resource;
> +		}
>   		ploop_md_make_dirty(ploop, md);
>   	}
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	piwb = md->piwb;
>   
> @@ -1519,32 +1544,38 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
>   	bool attached = false;
>   	u32 page_id;
>   	int err;
> +	unsigned long flags;
>   
>   	WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED);
> -	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio))
> +	spin_lock_irqsave(&ploop->bat_lock, flags);
> +	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio)) {
> +		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   		goto out;
> +	}
> +
>   
>   	if (!test_bit(MD_DIRTY, &md->status)) {
> -		 /* Unlocked since MD_DIRTY is set and cleared from this work */
> +		 /* Locked since MD_DIRTY is set and cleared concurrently  */
>   		page_id = ploop_bat_clu_to_page_nr(clu);
>   		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_ALLOC) < 0) {
>   			pio->bi_status = BLK_STS_RESOURCE;
> +			spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   			goto error;
>   		}
>   		bat_update_prepared = true;
> +		ploop_md_make_dirty(ploop, md);
>   	}
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	piwb = md->piwb;
>   
>   	err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu);
>   	if (err) {
>   		pio->bi_status = errno_to_blk_status(err);
> +		clear_bit(MD_DIRTY, &md->status);
>   		goto error;
>   	}
>   
> -	if (bat_update_prepared)
> -		ploop_md_make_dirty(ploop, md);
> -
>   	if (pio->bi_op & REQ_FUA) {
>   		piwb->pio->bi_op |= REQ_FUA;
>   		ploop_attach_end_action(pio, piwb);
> @@ -1758,23 +1789,29 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
>   	struct ploop_index_wb *piwb;
>   	struct md_page *md;
>   	map_index_t *to;
> +	unsigned long flags;
>   
>   	WARN_ON(ploop->nr_deltas != 1 ||
>   		pio->queue_list_id != PLOOP_LIST_DISCARD);
>   
>   	page_id = ploop_bat_clu_to_page_nr(clu);
>   	md = ploop_md_page_find(ploop, page_id);
> -	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio))
> +	spin_lock_irqsave(&ploop->bat_lock, flags);
> +	if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio)) {
> +		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   		goto out;
> +	}
> +
>   
>   	if (!test_bit(MD_DIRTY, &md->status)) {
> -		 /* Unlocked since MD_DIRTY is set and cleared from this work */
>   		if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) {
>   			pio->bi_status = BLK_STS_RESOURCE;
>   			goto err;
>   		}
>   		bat_update_prepared = true;
> +		ploop_md_make_dirty(ploop, md);
>   	}
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	piwb = md->piwb;
>   
> @@ -1784,18 +1821,20 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio)
>   	to = piwb->kmpage;
>   	if (WARN_ON_ONCE(!to[clu])) {
>   		pio->bi_status = BLK_STS_IOERR;
> +		clear_bit(MD_DIRTY, &md->status);
>   		goto err;
>   	} else {
>   		WRITE_ONCE(to[clu], 0);
> +		spin_lock_irqsave(&piwb->lock, flags);
>   		list_add_tail(&pio->list, &piwb->ready_data_pios);
> +		spin_unlock_irqrestore(&piwb->lock, flags);
>   	}
>   
> -	if (bat_update_prepared)
> -		ploop_md_make_dirty(ploop, md);
>   	ploop_md_up_prio(ploop, md);
>   out:
>   	return;
>   err:
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   	if (bat_update_prepared)
>   		ploop_break_bat_update(ploop, md);
>   	ploop_pio_endio(pio);


More information about the Devel mailing list