[Devel] [PATCH VZ9 v2 1/2] drivers/md/ploop: rework preallocation algorithm

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Fri Jul 4 13:24:20 MSK 2025


Recently we noticed that fstrim does not work properly on ploop
images.
The problem is actually worse: current code preallocates excessive
space on the image and loses it later.

Example:
  [root at vz9-demens-1 fstrim]# du -h .
  4.0K	.
  [root at vz9-demens-1 fstrim]# ploop init -s 10G image
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  513M	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
  1024+0 records in
  1024+0 records out
  1073741824 bytes (1.1 GB, 1.0 GiB) copied, 4.93694 s, 217 MB/s
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  8.7G	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
  [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  7.7G	.

The image just after the creation was already 512M. After
writing 1GB data it was almost 9GB in size. Fstrim discarded
only known blocks, so final size is 7.7GB.

Mainly two factors contribute to that:
 - file_preallocated_area_start is never updated
 - parallel preallocation requests scale preallocation size by
128M each.

To fix this, the patch changes preallocation logic a bit:
 - preallocation is requested with cluster size granularity
 - preallocation is done with PREALLOC_SIZE granularity
 - parallel preemptive preallocation does not increase preallocation
size
 - each cluster allocation from preallocated space advances
file_preallocated_area_start
 - preallocated area is truncated on umount

Same example after the patch:
  [root at vz9-demens-1 fstrim]# du -h .
  4.0K	.
  [root at vz9-demens-1 fstrim]# ploop init -s 10G image
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  9.1M	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
  1024+0 records in
  1024+0 records out
  1073741824 bytes (1.1 GB, 1.0 GiB) copied, 5.33758 s, 201 MB/s
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  1.1G	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
  [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  9.1M	.

Now it is much better. Empty image size is quite small, write does
not cause excessive preallocation and fstrim therefore reduces the
size.

https://virtuozzo.atlassian.net/browse/VSTOR-108868
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
v2:
 - separate a patch deleting unused arguments
 - change size calculation when requesting prealloc to avoid 
substraction so we are a bit safer with unsigned values
 - rename ploop_pending_prealloc() to ploop_no_pending_prealloc()

 drivers/md/dm-ploop-map.c    | 112 +++++++++++++++++------------------
 drivers/md/dm-ploop-target.c |  10 +++-
 drivers/md/dm-ploop.h        |   4 +-
 3 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 988bf00ed4be..089731bc7e3a 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -385,14 +385,6 @@ static void ploop_schedule_work(struct ploop *ploop)
 	wake_up_process(ploop->kt_worker->task);
 }
 
-void ploop_req_prealloc(struct ploop *ploop, loff_t newlen)
-{
-	lockdep_assert_held(&ploop->bat_lock);
-
-	ploop->prealloc_size += newlen;
-	wake_up_process(ploop->kt_allocator->task);
-}
-
 static void ploop_dispatch_pio(struct ploop *ploop, struct pio *pio)
 {
 	struct llist_head *list = &ploop->pios[pio->queue_list_id];
@@ -1161,8 +1153,22 @@ static int ploop_truncate_prealloc_safe(struct ploop *ploop,
 }
 ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
 
+static void ploop_req_prealloc(struct ploop *ploop, bool preemptive)
+{
+	lockdep_assert_held(&ploop->bat_lock);
+
+	if (preemptive) {
+		if (ploop->preemptive_prealloc)
+			return;
+		ploop->preemptive_prealloc = true;
+	}
+
+	ploop->prealloc_requested += CLU_SIZE(ploop);
+	wake_up_process(ploop->kt_allocator->task);
+}
+
 /*
- * Always update prealloc_size and prealloc_in_progress under lock
+ * Always update prealloc_requested and preemptive_prealloc under lock
  *
  * every allocation checks if next will need space and requests
  * preallocation
@@ -1170,53 +1176,45 @@ ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
 static int ploop_preallocate_cluster(struct ploop *ploop, struct file *file)
 {
 	struct ploop_delta *top = ploop_top_delta(ploop);
-	loff_t end;
+	loff_t new_len, prealloc_size;
 	unsigned long flags;
-	int ret, more = 0;
+	int ret;
 
+	spin_lock_irqsave(&ploop->bat_lock, flags);
 prealloc_more:
-	spin_lock_irqsave(&ploop->bat_lock, flags);
-	ploop->prealloc_in_progress = ploop->prealloc_size;
-	end = top->file_size + ploop->prealloc_in_progress;
-	loff_t new_len = ALIGN(end, ploop->prealloc_in_progress);
-	ploop->prealloc_size = 0;
-	if (!ploop->prealloc_in_progress)
-		new_len = 0;
+	prealloc_size = ALIGN(ploop->prealloc_requested, PREALLOC_SIZE);
+	new_len = top->file_size + prealloc_size;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
-	if (!new_len)
+	if (!prealloc_size)
 		return 0;
 
 	ret = ploop_truncate_prealloc_safe(ploop, top, top->file_size,
 					   new_len, file, __func__);
-	if (ret) {
+	if (ret)
 		PL_ERR("Failed to preallocate space: %d\n", ret);
-		goto out;
-	}
 
-	/* here must be the only place to change file_size */
 	spin_lock_irqsave(&ploop->bat_lock, flags);
-	if (top->file_size < new_len) {
+	/* sanity check*/
+	if (new_len < top->file_size) {
+		WARN_ONCE(1, "unexpected file size change\n");
+		spin_unlock_irqrestore(&ploop->bat_lock, flags);
+		return -EIO;
+	}
+
+	/* Update file size in case of success */
+	if (!ret)
 		top->file_size = new_len;
-	} else {
-		PL_ERR("unexpected file size change\n");
-	}
-	if (ploop->prealloc_size)
-		more = 1;
-	spin_unlock_irqrestore(&ploop->bat_lock, flags);
-	if (more) {
-		more = 0;
+
+	/* If we have no errors and still haven't allocated enough */
+	if (!ret && ploop->prealloc_requested > prealloc_size) {
+		ploop->prealloc_requested -= prealloc_size;
 		goto prealloc_more;
 	}
 
-out:
-	spin_lock_irqsave(&ploop->bat_lock, flags);
-	ploop->prealloc_in_progress = 0;
-	ploop->prealloc_size = 0;
+	ploop->prealloc_requested = 0;
+	ploop->preemptive_prealloc = false;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
-	/* notify if someone is waiting */
-	wake_up_interruptible(&ploop->dispatcher_wq_prealloc);
-
 	return ret;
 }
 ALLOW_ERROR_INJECTION(ploop_preallocate_cluster, ERRNO);
@@ -1237,19 +1235,18 @@ void ploop_should_prealloc(struct ploop *ploop, struct file *file)
 
 	pos = CLU_TO_POS(ploop, dst_clu);
 	end = pos + clu_size;
-	if (end > top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
-		ploop_req_prealloc(ploop, PREALLOC_SIZE);
-	}
+	if (end + PREALLOC_SIZE / 2 > top->file_size)
+		ploop_req_prealloc(ploop, true);
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 }
 
-static int ploop_pending_prealloc(struct ploop *ploop)
+static int ploop_no_pending_prealloc(struct ploop *ploop)
 {
 	int ret;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ploop->bat_lock, flags);
-	ret = !ploop->prealloc_size && !ploop->prealloc_in_progress;
+	ret = !ploop->prealloc_requested;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	return ret;
@@ -1259,7 +1256,7 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 {
 	struct ploop_delta *top = ploop_top_delta(ploop);
 	u32 clu_size = CLU_SIZE(ploop);
-	loff_t off, pos, end, old_size;
+	loff_t off, pos, end;
 	unsigned long flags;
 	int ret;
 	int retry_cnt = 0;
@@ -1278,21 +1275,17 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 	 */
 	ploop_hole_clear_bit(*dst_clu, ploop);
 
-	old_size = top->file_size;
 	prealloc_start = top->file_preallocated_area_start;
 	pos = CLU_TO_POS(ploop, *dst_clu);
 	end = pos + clu_size;
-	off = min_t(loff_t, old_size, end);
+	off = min_t(loff_t, top->file_size, end);
+
+	if (prealloc_start + PREALLOC_SIZE / 2 > top->file_size)
+		ploop_req_prealloc(ploop, true);
+
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	if (pos < prealloc_start) {
-		if (end + clu_size >
-		    top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
-			spin_lock_irqsave(&ploop->bat_lock, flags);
-			ploop_req_prealloc(ploop, PREALLOC_SIZE);
-			spin_unlock_irqrestore(&ploop->bat_lock, flags);
-		}
-
 		/* Clu at @pos may contain dirty data */
 		if (!ploop->falloc_new_clu)
 			ret = ploop_punch_hole(file, pos, off - pos);
@@ -1316,13 +1309,12 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 retry_alloc:
 	spin_lock_irqsave(&ploop->bat_lock, flags);
 	/* size can change from parallel alloc */
-	old_size = top->file_size;
-	if (end > old_size) {
-		ploop_req_prealloc(ploop, PREALLOC_SIZE);
+	if (end > top->file_size) {
+		ploop_req_prealloc(ploop, false);
 		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 		wait_event_interruptible(ploop->dispatcher_wq_prealloc,
-					 ploop_pending_prealloc(ploop));
+					 ploop_no_pending_prealloc(ploop));
 
 		spin_lock_irqsave(&ploop->bat_lock, flags);
 		if (end > top->file_size) {
@@ -1339,6 +1331,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 		}
 	}
 
+	if (end > top->file_preallocated_area_start)
+		top->file_preallocated_area_start = end;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	return 0;
@@ -2363,7 +2357,7 @@ int ploop_allocator(void *data)
 
 	for (;;) {
 		/* we do not care about exact number here - risk one extra schedule wrt taking the lock */
-		if (ploop->prealloc_size) {
+		if (ploop->prealloc_requested) {
 			__set_current_state(TASK_RUNNING);
 			file = ploop_top_delta(ploop)->file;
 			current->flags |= PLOOP_PFLAGS;
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index de549f7633af..aa1379088a80 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -162,6 +162,7 @@ static bool ploop_empty_htable(struct hlist_head head[])
 
 static void ploop_destroy(struct ploop *ploop)
 {
+	struct ploop_delta *top = ploop_top_delta(ploop);
 	int i;
 
 	if (ploop->kt_worker) {
@@ -201,6 +202,11 @@ static void ploop_destroy(struct ploop *ploop)
 
 	for (i = 0; i < 2; i++)
 		percpu_ref_exit(&ploop->inflight_bios_ref[i]);
+
+	if (top->file_preallocated_area_start < top->file_size &&
+	    vfs_truncate2(&top->file->f_path, top->file_preallocated_area_start, top->file))
+		PL_ERR("Unable to truncate preallocated area on destroy");
+
 	/* Nobody uses it after destroy_workqueue() */
 	while (ploop->nr_deltas-- > 0) {
 		vfs_fsync(ploop->deltas[ploop->nr_deltas].file, 1);
@@ -563,8 +569,8 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	init_waitqueue_head(&ploop->dispatcher_wq_data);
 	init_waitqueue_head(&ploop->dispatcher_wq_fsync);
 	init_waitqueue_head(&ploop->dispatcher_wq_prealloc);
-	ploop->prealloc_size = 0;
-	ploop->prealloc_in_progress = 0;
+	ploop->preemptive_prealloc = false;
+	ploop->prealloc_requested = 0;
 
 	ploop->kt_worker = ploop_worker_create(ploop, ploop_worker, "d", 0);
 	if (!ploop->kt_worker)
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index db4d92c9679a..ec3f2ffdf2b1 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -264,8 +264,8 @@ struct ploop {
 	bool submit_enospc; /* timer expired run pios from dispatcher */
 	bool event_enospc;
 
-	loff_t prealloc_size;
-	loff_t prealloc_in_progress;
+	bool preemptive_prealloc;
+	loff_t prealloc_requested;
 	struct wait_queue_head dispatcher_wq_prealloc;
 };
 #define ploop_blk_queue(p) ((p)->ti->table->md->queue)
-- 
2.43.5



More information about the Devel mailing list