[Devel] [PATCH v4 VZ9] drivers/md/ploop: rework preallocation algorithm

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Mon Jul 7 11:59:21 MSK 2025


LGTM

Reviewed-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
(or signed-by?)

On 7/7/25 10:50, Pavel Tikhomirov wrote:
> Recently we noticed that fstrim does not work properly on ploop images.
> The problem is actually worse: current code preallocates excessive space
> on the image and loses it later.
> 
> Example:
>    [root at vz9-demens-1 fstrim]# du -h .
>    4.0K  .
>    [root at vz9-demens-1 fstrim]# ploop init -s 10G image
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    513M  .
>    [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
>    1024+0 records in
>    1024+0 records out
>    1073741824 bytes (1.1 GB, 1.0 GiB) copied, 4.93694 s, 217 MB/s
>    [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    8.7G  .
>    [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
>    [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
>    [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    7.7G  .
> 
> The image just after the creation was already 512M. After writing 1GB
> data it was almost 9GB in size. The fstrim discarded only known blocks, so
> final size is 7.7GB.
> 
> Mainly two factors contribute to that:
>   - file_preallocated_area_start is never updated
>   - parallel preallocation requests scale preallocation size by 128M
>     each time
> 
> To fix this, the patch changes preallocation logic a bit:
> 
> - Make preallocations absolute instead of relative in
>    ploop_req_prealloc(). Thus only request what is really needed.
> - Rename s/ploop_pending_prealloc/ploop_no_pending_prealloc/ as it
>    returns true in case of no pending preallocation.
> - Make ploop_preallocate_cluster() return error on unexpected file size
>    change. Also simplify the logic a little bit (e.g. more is excess).
> - Preallocation in ploop_allocate_cluster() should not depend on (pos <
>    prealloc_start), so always try to preallocate. Also simplify
>    ploop_allocate_cluster() a little bit (e.g. old_size is excess).
> - Update file_preallocated_area_start after allocation starts using
>    preallocated space.
> - Truncate preallocated space on ploop destruction.
> 
> Same example after the patch:
> 
> [root at ptikh-hci fstrim]# du -h .
> 4.0K	.
> [root at ptikh-hci fstrim]# ploop init -s 10G image
> [root at ptikh-hci fstrim]# du -h .
> 3.2M	.
> [root at ptikh-hci fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
> ...
> [root at ptikh-hci fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
> ...
> [root at ptikh-hci fstrim]# ploop umount DiskDescriptor.xml
> ...
> [root at ptikh-hci fstrim]# du -h .
> 1.1G	.
> [root at ptikh-hci fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
> ...
> [root at ptikh-hci fstrim]# rm -rf /mnt/fstrim/file
> [root at ptikh-hci fstrim]# fstrim /mnt/fstrim
> [root at ptikh-hci fstrim]# ploop umount DiskDescriptor.xml
> ...
> [root at ptikh-hci fstrim]# du -h .
> 4.2M	.
> 
> Now it is much better. Empty image size is quite small, write does
> not cause excessive preallocation and fstrim therefore reduces the
> size.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-108868
> Co-developed-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
> v3: This is rework of v2 which makes it possible to apply it in
> ready-kernel (no ploop structure changes). Also it covers some new
> cases, like resetting the state on unexpected file size change. In v2
> preemptive_prealloc was not reset and can probably leed to problems.
> v4: fix compilation and incorporate v2 description
> ---
>   drivers/md/dm-ploop-map.c    | 75 ++++++++++++++++++------------------
>   drivers/md/dm-ploop-target.c |  6 +++
>   2 files changed, 43 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
> index 11767dd91950..0c30f2cdd091 100644
> --- a/drivers/md/dm-ploop-map.c
> +++ b/drivers/md/dm-ploop-map.c
> @@ -385,11 +385,16 @@ static void ploop_schedule_work(struct ploop *ploop)
>   	wake_up_process(ploop->kt_worker->task);
>   }
>   
> -void ploop_req_prealloc(struct ploop *ploop, loff_t newlen)
> +static void ploop_req_prealloc(struct ploop *ploop, loff_t size)
>   {
> +	struct ploop_delta *top = ploop_top_delta(ploop);
> +
>   	lockdep_assert_held(&ploop->bat_lock);
>   
> -	ploop->prealloc_size += newlen;
> +	size = ALIGN(size + PREALLOC_SIZE / 2, PREALLOC_SIZE);
> +	if (top->file_size + ploop->prealloc_in_progress + ploop->prealloc_size >= size)
> +		return;
> +	ploop->prealloc_size = size - top->file_size - ploop->prealloc_in_progress;
>   	wake_up_process(ploop->kt_allocator->task);
>   }
>   
> @@ -1170,46 +1175,43 @@ ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
>   static int ploop_preallocate_cluster(struct ploop *ploop, struct file *file)
>   {
>   	struct ploop_delta *top = ploop_top_delta(ploop);
> -	loff_t end;
> +	loff_t new_len;
>   	unsigned long flags;
> -	int ret, more = 0;
> +	int ret;
>   
> -prealloc_more:
>   	spin_lock_irqsave(&ploop->bat_lock, flags);
> +prealloc_more:
>   	ploop->prealloc_in_progress = ploop->prealloc_size;
> -	end = top->file_size + ploop->prealloc_in_progress;
> -	loff_t new_len = ALIGN(end, ploop->prealloc_in_progress);
>   	ploop->prealloc_size = 0;
> -	if (!ploop->prealloc_in_progress)
> -		new_len = 0;
> -	spin_unlock_irqrestore(&ploop->bat_lock, flags);
> -	if (!new_len)
> +	if (!ploop->prealloc_in_progress) {
> +		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   		return 0;
> +	}
> +	new_len = top->file_size + ploop->prealloc_in_progress;
> +	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	ret = ploop_truncate_prealloc_safe(ploop, top, top->file_size,
>   					   new_len, file, __func__);
>   	if (ret) {
>   		PL_ERR("Failed to preallocate space: %d\n", ret);
> +		spin_lock_irqsave(&ploop->bat_lock, flags);
>   		goto out;
>   	}
>   
> -	/* here must be the only place to change file_size */
> +	/*
> +	 * Here must be the only place to change file_size.
> +	 */
>   	spin_lock_irqsave(&ploop->bat_lock, flags);
>   	if (top->file_size < new_len) {
>   		top->file_size = new_len;
>   	} else {
>   		PL_ERR("unexpected file size change\n");
> +		ret = -EIO;
> +		goto out;
>   	}
>   	if (ploop->prealloc_size)
> -		more = 1;
> -	spin_unlock_irqrestore(&ploop->bat_lock, flags);
> -	if (more) {
> -		more = 0;
>   		goto prealloc_more;
> -	}
> -
>   out:
> -	spin_lock_irqsave(&ploop->bat_lock, flags);
>   	ploop->prealloc_in_progress = 0;
>   	ploop->prealloc_size = 0;
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
> @@ -1223,7 +1225,6 @@ ALLOW_ERROR_INJECTION(ploop_preallocate_cluster, ERRNO);
>   
>   void ploop_should_prealloc(struct ploop *ploop, struct file *file)
>   {
> -	struct ploop_delta *top = ploop_top_delta(ploop);
>   	u32 dst_clu;
>   	u32 clu_size = CLU_SIZE(ploop);
>   	loff_t pos, end;
> @@ -1237,13 +1238,12 @@ void ploop_should_prealloc(struct ploop *ploop, struct file *file)
>   
>   	pos = CLU_TO_POS(ploop, dst_clu);
>   	end = pos + clu_size;
> -	if (end > top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
> -		ploop_req_prealloc(ploop, PREALLOC_SIZE);
> -	}
> +
> +	ploop_req_prealloc(ploop, end);
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   }
>   
> -static int ploop_pending_prealloc(struct ploop *ploop)
> +static int ploop_no_pending_prealloc(struct ploop *ploop)
>   {
>   	int ret;
>   	unsigned long flags;
> @@ -1259,7 +1259,7 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   {
>   	struct ploop_delta *top = ploop_top_delta(ploop);
>   	u32 clu_size = CLU_SIZE(ploop);
> -	loff_t off, pos, end, old_size;
> +	loff_t off, pos, end;
>   	unsigned long flags;
>   	int ret;
>   	int retry_cnt = 0;
> @@ -1278,21 +1278,15 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   	 */
>   	ploop_hole_clear_bit(*dst_clu, ploop);
>   
> -	old_size = top->file_size;
>   	prealloc_start = top->file_preallocated_area_start;
>   	pos = CLU_TO_POS(ploop, *dst_clu);
>   	end = pos + clu_size;
> -	off = min_t(loff_t, old_size, end);
> +	off = min_t(loff_t, top->file_size, end);
> +	ploop_req_prealloc(ploop, end);
> +
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	if (pos < prealloc_start) {
> -		if (end + clu_size >
> -		    top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
> -			spin_lock_irqsave(&ploop->bat_lock, flags);
> -			ploop_req_prealloc(ploop, PREALLOC_SIZE);
> -			spin_unlock_irqrestore(&ploop->bat_lock, flags);
> -		}
> -
>   		/* Clu at @pos may contain dirty data */
>   		if (!ploop->falloc_new_clu)
>   			ret = ploop_punch_hole(file, pos, off - pos);
> @@ -1316,13 +1310,16 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   retry_alloc:
>   	spin_lock_irqsave(&ploop->bat_lock, flags);
>   	/* size can change from parallel alloc */
> -	old_size = top->file_size;
> -	if (end > old_size) {
> -		ploop_req_prealloc(ploop, PREALLOC_SIZE);
> +	if (end > top->file_size) {
> +		/*
> +		 * Reset preallocation, in case ploop_preallocate_cluster
> +		 * dropped it on error path.
> +		 */
> +		ploop_req_prealloc(ploop, end);
>   		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   		wait_event_interruptible(ploop->dispatcher_wq_prealloc,
> -					 ploop_pending_prealloc(ploop));
> +					 ploop_no_pending_prealloc(ploop));
>   
>   		spin_lock_irqsave(&ploop->bat_lock, flags);
>   		if (end > top->file_size) {
> @@ -1339,6 +1336,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   		}
>   	}
>   
> +	if (end > top->file_preallocated_area_start)
> +		top->file_preallocated_area_start = end;
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	return 0;
> diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
> index 8fa0043a7142..ea5f3a8dab6b 100644
> --- a/drivers/md/dm-ploop-target.c
> +++ b/drivers/md/dm-ploop-target.c
> @@ -162,6 +162,7 @@ static bool ploop_empty_htable(struct hlist_head head[])
>   
>   static void ploop_destroy(struct ploop *ploop)
>   {
> +	struct ploop_delta *top = ploop_top_delta(ploop);
>   	int i;
>   
>   	if (ploop->kt_worker) {
> @@ -201,6 +202,11 @@ static void ploop_destroy(struct ploop *ploop)
>   
>   	for (i = 0; i < 2; i++)
>   		percpu_ref_exit(&ploop->inflight_bios_ref[i]);
> +
> +	if (top->file_preallocated_area_start < top->file_size &&
> +	    vfs_truncate2(&top->file->f_path, top->file_preallocated_area_start, top->file))
> +		PL_ERR("Unable to truncate preallocated area on destroy");
> +
>   	/* Nobody uses it after destroy_workqueue() */
>   	while (ploop->nr_deltas-- > 0) {
>   		if (ploop->deltas[ploop->nr_deltas].file) {



More information about the Devel mailing list