[Devel] [PATCH VZ9 v2 1/2] drivers/md/ploop: rework preallocation algorithm

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Fri Jul 4 13:31:58 MSK 2025


Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

(for both patches)

On 7/4/25 18:24, Andrey Zhadchenko wrote:
> Recently we noticed that fstrim does not work properly on ploop
> images.
> The problem is actually worse: current code preallocates excessive
> space on the image and loses it later.
> 
> Example:
>    [root at vz9-demens-1 fstrim]# du -h .
>    4.0K	.
>    [root at vz9-demens-1 fstrim]# ploop init -s 10G image
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    513M	.
>    [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
>    1024+0 records in
>    1024+0 records out
>    1073741824 bytes (1.1 GB, 1.0 GiB) copied, 4.93694 s, 217 MB/s
>    [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    8.7G	.
>    [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
>    [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
>    [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    7.7G	.
> 
> The image just after the creation was already 512M. After
> writing 1GB data it was almost 9GB in size. Fstrim discarded
> only known blocks, so final size is 7.7GB.
> 
> Mainly two factors contribute to that:
>   - file_preallocated_area_start is never updated
>   - parallel preallocation requests scale preallocation size by
> 128M each.
> 
> To fix this, the patch changes preallocation logic a bit:
>   - preallocation is requested with cluster size granularity
>   - preallocation is done with PREALLOC_SIZE granularity
>   - parallel preemptive preallocation does not increase preallocation
> size
>   - each cluster allocation from preallocated space advances
> file_preallocated_area_start
>   - preallocated area is truncated on umount
> 
> Same example after the patch:
>    [root at vz9-demens-1 fstrim]# du -h .
>    4.0K	.
>    [root at vz9-demens-1 fstrim]# ploop init -s 10G image
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    9.1M	.
>    [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
>    1024+0 records in
>    1024+0 records out
>    1073741824 bytes (1.1 GB, 1.0 GiB) copied, 5.33758 s, 201 MB/s
>    [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    1.1G	.
>    [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
>    [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
>    [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
>    [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
>    ...
>    [root at vz9-demens-1 fstrim]# du -h .
>    9.1M	.
> 
> Now it is much better. Empty image size is quite small, write does
> not cause excessive preallocation and fstrim therefore reduces the
> size.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-108868
> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
> ---
> v2:
>   - separate a patch deleting unused arguments
>   - change size calculation when requesting prealloc to avoid
> substraction so we are a bit safer with unsigned values
>   - rename ploop_pending_prealloc() to ploop_no_pending_prealloc()
> 
>   drivers/md/dm-ploop-map.c    | 112 +++++++++++++++++------------------
>   drivers/md/dm-ploop-target.c |  10 +++-
>   drivers/md/dm-ploop.h        |   4 +-
>   3 files changed, 63 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
> index 988bf00ed4be..089731bc7e3a 100644
> --- a/drivers/md/dm-ploop-map.c
> +++ b/drivers/md/dm-ploop-map.c
> @@ -385,14 +385,6 @@ static void ploop_schedule_work(struct ploop *ploop)
>   	wake_up_process(ploop->kt_worker->task);
>   }
>   
> -void ploop_req_prealloc(struct ploop *ploop, loff_t newlen)
> -{
> -	lockdep_assert_held(&ploop->bat_lock);
> -
> -	ploop->prealloc_size += newlen;
> -	wake_up_process(ploop->kt_allocator->task);
> -}
> -
>   static void ploop_dispatch_pio(struct ploop *ploop, struct pio *pio)
>   {
>   	struct llist_head *list = &ploop->pios[pio->queue_list_id];
> @@ -1161,8 +1153,22 @@ static int ploop_truncate_prealloc_safe(struct ploop *ploop,
>   }
>   ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
>   
> +static void ploop_req_prealloc(struct ploop *ploop, bool preemptive)
> +{
> +	lockdep_assert_held(&ploop->bat_lock);
> +
> +	if (preemptive) {
> +		if (ploop->preemptive_prealloc)
> +			return;
> +		ploop->preemptive_prealloc = true;
> +	}
> +
> +	ploop->prealloc_requested += CLU_SIZE(ploop);
> +	wake_up_process(ploop->kt_allocator->task);
> +}
> +
>   /*
> - * Always update prealloc_size and prealloc_in_progress under lock
> + * Always update prealloc_requested and preemptive_prealloc under lock
>    *
>    * every allocation checks if next will need space and requests
>    * preallocation
> @@ -1170,53 +1176,45 @@ ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
>   static int ploop_preallocate_cluster(struct ploop *ploop, struct file *file)
>   {
>   	struct ploop_delta *top = ploop_top_delta(ploop);
> -	loff_t end;
> +	loff_t new_len, prealloc_size;
>   	unsigned long flags;
> -	int ret, more = 0;
> +	int ret;
>   
> +	spin_lock_irqsave(&ploop->bat_lock, flags);
>   prealloc_more:
> -	spin_lock_irqsave(&ploop->bat_lock, flags);
> -	ploop->prealloc_in_progress = ploop->prealloc_size;
> -	end = top->file_size + ploop->prealloc_in_progress;
> -	loff_t new_len = ALIGN(end, ploop->prealloc_in_progress);
> -	ploop->prealloc_size = 0;
> -	if (!ploop->prealloc_in_progress)
> -		new_len = 0;
> +	prealloc_size = ALIGN(ploop->prealloc_requested, PREALLOC_SIZE);
> +	new_len = top->file_size + prealloc_size;
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
> -	if (!new_len)
> +	if (!prealloc_size)
>   		return 0;
>   
>   	ret = ploop_truncate_prealloc_safe(ploop, top, top->file_size,
>   					   new_len, file, __func__);
> -	if (ret) {
> +	if (ret)
>   		PL_ERR("Failed to preallocate space: %d\n", ret);
> -		goto out;
> -	}
>   
> -	/* here must be the only place to change file_size */
>   	spin_lock_irqsave(&ploop->bat_lock, flags);
> -	if (top->file_size < new_len) {
> +	/* sanity check*/
> +	if (new_len < top->file_size) {
> +		WARN_ONCE(1, "unexpected file size change\n");
> +		spin_unlock_irqrestore(&ploop->bat_lock, flags);
> +		return -EIO;
> +	}
> +
> +	/* Update file size in case of success */
> +	if (!ret)
>   		top->file_size = new_len;
> -	} else {
> -		PL_ERR("unexpected file size change\n");
> -	}
> -	if (ploop->prealloc_size)
> -		more = 1;
> -	spin_unlock_irqrestore(&ploop->bat_lock, flags);
> -	if (more) {
> -		more = 0;
> +
> +	/* If we have no errors and still haven't allocated enough */
> +	if (!ret && ploop->prealloc_requested > prealloc_size) {
> +		ploop->prealloc_requested -= prealloc_size;
>   		goto prealloc_more;
>   	}
>   
> -out:
> -	spin_lock_irqsave(&ploop->bat_lock, flags);
> -	ploop->prealloc_in_progress = 0;
> -	ploop->prealloc_size = 0;
> +	ploop->prealloc_requested = 0;
> +	ploop->preemptive_prealloc = false;
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
> -	/* notify if someone is waiting */
> -	wake_up_interruptible(&ploop->dispatcher_wq_prealloc);
> -
>   	return ret;
>   }
>   ALLOW_ERROR_INJECTION(ploop_preallocate_cluster, ERRNO);
> @@ -1237,19 +1235,18 @@ void ploop_should_prealloc(struct ploop *ploop, struct file *file)
>   
>   	pos = CLU_TO_POS(ploop, dst_clu);
>   	end = pos + clu_size;
> -	if (end > top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
> -		ploop_req_prealloc(ploop, PREALLOC_SIZE);
> -	}
> +	if (end + PREALLOC_SIZE / 2 > top->file_size)
> +		ploop_req_prealloc(ploop, true);
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   }
>   
> -static int ploop_pending_prealloc(struct ploop *ploop)
> +static int ploop_no_pending_prealloc(struct ploop *ploop)
>   {
>   	int ret;
>   	unsigned long flags;
>   
>   	spin_lock_irqsave(&ploop->bat_lock, flags);
> -	ret = !ploop->prealloc_size && !ploop->prealloc_in_progress;
> +	ret = !ploop->prealloc_requested;
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	return ret;
> @@ -1259,7 +1256,7 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   {
>   	struct ploop_delta *top = ploop_top_delta(ploop);
>   	u32 clu_size = CLU_SIZE(ploop);
> -	loff_t off, pos, end, old_size;
> +	loff_t off, pos, end;
>   	unsigned long flags;
>   	int ret;
>   	int retry_cnt = 0;
> @@ -1278,21 +1275,17 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   	 */
>   	ploop_hole_clear_bit(*dst_clu, ploop);
>   
> -	old_size = top->file_size;
>   	prealloc_start = top->file_preallocated_area_start;
>   	pos = CLU_TO_POS(ploop, *dst_clu);
>   	end = pos + clu_size;
> -	off = min_t(loff_t, old_size, end);
> +	off = min_t(loff_t, top->file_size, end);
> +
> +	if (prealloc_start + PREALLOC_SIZE / 2 > top->file_size)
> +		ploop_req_prealloc(ploop, true);
> +
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	if (pos < prealloc_start) {
> -		if (end + clu_size >
> -		    top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
> -			spin_lock_irqsave(&ploop->bat_lock, flags);
> -			ploop_req_prealloc(ploop, PREALLOC_SIZE);
> -			spin_unlock_irqrestore(&ploop->bat_lock, flags);
> -		}
> -
>   		/* Clu at @pos may contain dirty data */
>   		if (!ploop->falloc_new_clu)
>   			ret = ploop_punch_hole(file, pos, off - pos);
> @@ -1316,13 +1309,12 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   retry_alloc:
>   	spin_lock_irqsave(&ploop->bat_lock, flags);
>   	/* size can change from parallel alloc */
> -	old_size = top->file_size;
> -	if (end > old_size) {
> -		ploop_req_prealloc(ploop, PREALLOC_SIZE);
> +	if (end > top->file_size) {
> +		ploop_req_prealloc(ploop, false);
>   		spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   		wait_event_interruptible(ploop->dispatcher_wq_prealloc,
> -					 ploop_pending_prealloc(ploop));
> +					 ploop_no_pending_prealloc(ploop));
>   
>   		spin_lock_irqsave(&ploop->bat_lock, flags);
>   		if (end > top->file_size) {
> @@ -1339,6 +1331,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
>   		}
>   	}
>   
> +	if (end > top->file_preallocated_area_start)
> +		top->file_preallocated_area_start = end;
>   	spin_unlock_irqrestore(&ploop->bat_lock, flags);
>   
>   	return 0;
> @@ -2363,7 +2357,7 @@ int ploop_allocator(void *data)
>   
>   	for (;;) {
>   		/* we do not care about exact number here - risk one extra schedule wrt taking the lock */
> -		if (ploop->prealloc_size) {
> +		if (ploop->prealloc_requested) {
>   			__set_current_state(TASK_RUNNING);
>   			file = ploop_top_delta(ploop)->file;
>   			current->flags |= PLOOP_PFLAGS;
> diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
> index de549f7633af..aa1379088a80 100644
> --- a/drivers/md/dm-ploop-target.c
> +++ b/drivers/md/dm-ploop-target.c
> @@ -162,6 +162,7 @@ static bool ploop_empty_htable(struct hlist_head head[])
>   
>   static void ploop_destroy(struct ploop *ploop)
>   {
> +	struct ploop_delta *top = ploop_top_delta(ploop);
>   	int i;
>   
>   	if (ploop->kt_worker) {
> @@ -201,6 +202,11 @@ static void ploop_destroy(struct ploop *ploop)
>   
>   	for (i = 0; i < 2; i++)
>   		percpu_ref_exit(&ploop->inflight_bios_ref[i]);
> +
> +	if (top->file_preallocated_area_start < top->file_size &&
> +	    vfs_truncate2(&top->file->f_path, top->file_preallocated_area_start, top->file))
> +		PL_ERR("Unable to truncate preallocated area on destroy");
> +
>   	/* Nobody uses it after destroy_workqueue() */
>   	while (ploop->nr_deltas-- > 0) {
>   		vfs_fsync(ploop->deltas[ploop->nr_deltas].file, 1);
> @@ -563,8 +569,8 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>   	init_waitqueue_head(&ploop->dispatcher_wq_data);
>   	init_waitqueue_head(&ploop->dispatcher_wq_fsync);
>   	init_waitqueue_head(&ploop->dispatcher_wq_prealloc);
> -	ploop->prealloc_size = 0;
> -	ploop->prealloc_in_progress = 0;
> +	ploop->preemptive_prealloc = false;
> +	ploop->prealloc_requested = 0;
>   
>   	ploop->kt_worker = ploop_worker_create(ploop, ploop_worker, "d", 0);
>   	if (!ploop->kt_worker)
> diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
> index db4d92c9679a..ec3f2ffdf2b1 100644
> --- a/drivers/md/dm-ploop.h
> +++ b/drivers/md/dm-ploop.h
> @@ -264,8 +264,8 @@ struct ploop {
>   	bool submit_enospc; /* timer expired run pios from dispatcher */
>   	bool event_enospc;
>   
> -	loff_t prealloc_size;
> -	loff_t prealloc_in_progress;
> +	bool preemptive_prealloc;
> +	loff_t prealloc_requested;
>   	struct wait_queue_head dispatcher_wq_prealloc;
>   };
>   #define ploop_blk_queue(p) ((p)->ti->table->md->queue)

-- 
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.



More information about the Devel mailing list