[Devel] [PATCH VZ9] drivers/md/ploop: rework preallocation algorithm

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Wed Jul 2 17:43:36 MSK 2025


Recently we notices that fstrim does not work properly on ploop
images.
The problem is actually worse: current code preallocates excessive
space on the image and loses it later.

Example:
  [root at vz9-demens-1 fstrim]# du -h .
  4.0K	.
  [root at vz9-demens-1 fstrim]# ploop init -s 10G image
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  513M	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
  1024+0 records in
  1024+0 records out
  1073741824 bytes (1.1 GB, 1.0 GiB) copied, 4.93694 s, 217 MB/s
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  8.7G	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
  [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  7.7G	.

The image just after the creation was already 512M. After
writing 1GB data it was almost 9GB in size. fstrim discarded
only known blocks, so final size is 7.7GB.

Mainly two factors contribute to that:
 - file_preallocated_area_start is never updated
 - parallel preallocation requests scale preallocation size by
128M each.

To fix this, the patch changes preallocation logic a bit:
 - preallocation is requested with cluster size granularity
 - preallocation is done with PREALLOC_SIZE granularity
 - parallel preemptive preallocation does not increase preallocation
size
 - each cluster allocation from preallocated space advances
file_preallocated_area_start
 - preallocated area is truncated on umount

Same example after the patch:
  [root at vz9-demens-1 fstrim]# du -h .
  4.0K	.
  [root at vz9-demens-1 fstrim]# ploop init -s 10G image
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  9.1M	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# dd if=/dev/urandom of=/mnt/fstrim/file bs=1M count=1024 oflag=direct
  1024+0 records in
  1024+0 records out
  1073741824 bytes (1.1 GB, 1.0 GiB) copied, 5.33758 s, 201 MB/s
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  1.1G	.
  [root at vz9-demens-1 fstrim]# ploop mount -m /mnt/fstrim/ DiskDescriptor.xml
  [root at vz9-demens-1 fstrim]# rm -rf /mnt/fstrim/file
  [root at vz9-demens-1 fstrim]# fstrim /mnt/fstrim
  [root at vz9-demens-1 fstrim]# ploop umount DiskDescriptor.xml
  ...
  [root at vz9-demens-1 fstrim]# du -h .
  9.1M	.

Now it is much better. Empty image size is quite small, write does
not cause excessive preallocation and fstrim therefore reduces the
size.

Also dropped unnused arguments for ploop_should_preallocate() and
ploop_truncate_prealloc_safe()

https://virtuozzo.atlassian.net/browse/VSTOR-108868
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
 drivers/md/dm-ploop-map.c    | 114 +++++++++++++++++------------------
 drivers/md/dm-ploop-target.c |  12 +++-
 drivers/md/dm-ploop.h        |   6 +-
 3 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 988bf00ed4be..b62fc4151808 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -385,14 +385,6 @@ static void ploop_schedule_work(struct ploop *ploop)
 	wake_up_process(ploop->kt_worker->task);
 }
 
-void ploop_req_prealloc(struct ploop *ploop, loff_t newlen)
-{
-	lockdep_assert_held(&ploop->bat_lock);
-
-	ploop->prealloc_size += newlen;
-	wake_up_process(ploop->kt_allocator->task);
-}
-
 static void ploop_dispatch_pio(struct ploop *ploop, struct pio *pio)
 {
 	struct llist_head *list = &ploop->pios[pio->queue_list_id];
@@ -1139,7 +1131,6 @@ static int ploop_find_dst_clu_bit(struct ploop *ploop, u32 *ret_dst_clu)
 ALLOW_ERROR_INJECTION(ploop_find_dst_clu_bit, ERRNO);
 
 static int ploop_truncate_prealloc_safe(struct ploop *ploop,
-					struct ploop_delta *delta,
 					loff_t old_len, loff_t new_len, struct file *file,
 					const char *func)
 {
@@ -1161,8 +1152,22 @@ static int ploop_truncate_prealloc_safe(struct ploop *ploop,
 }
 ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
 
+static void ploop_req_prealloc(struct ploop *ploop, bool preemptive)
+{
+	lockdep_assert_held(&ploop->bat_lock);
+
+	if (preemptive) {
+		if (ploop->preemptive_prealloc)
+			return;
+		ploop->preemptive_prealloc = true;
+	}
+
+	ploop->prealloc_requested += CLU_SIZE(ploop);
+	wake_up_process(ploop->kt_allocator->task);
+}
+
 /*
- * Always update prealloc_size and prealloc_in_progress under lock
+ * Always update prealloc_requested and preemptive_prealloc under lock
  *
  * every allocation checks if next will need space and requests
  * preallocation
@@ -1170,58 +1175,50 @@ ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO);
 static int ploop_preallocate_cluster(struct ploop *ploop, struct file *file)
 {
 	struct ploop_delta *top = ploop_top_delta(ploop);
-	loff_t end;
+	loff_t new_len, prealloc_size;
 	unsigned long flags;
-	int ret, more = 0;
+	int ret;
 
+	spin_lock_irqsave(&ploop->bat_lock, flags);
 prealloc_more:
-	spin_lock_irqsave(&ploop->bat_lock, flags);
-	ploop->prealloc_in_progress = ploop->prealloc_size;
-	end = top->file_size + ploop->prealloc_in_progress;
-	loff_t new_len = ALIGN(end, ploop->prealloc_in_progress);
-	ploop->prealloc_size = 0;
-	if (!ploop->prealloc_in_progress)
-		new_len = 0;
+	prealloc_size = ALIGN(ploop->prealloc_requested, PREALLOC_SIZE);
+	new_len = top->file_size + prealloc_size;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
-	if (!new_len)
+	if (!prealloc_size)
 		return 0;
 
-	ret = ploop_truncate_prealloc_safe(ploop, top, top->file_size,
+	ret = ploop_truncate_prealloc_safe(ploop, top->file_size,
 					   new_len, file, __func__);
-	if (ret) {
+	if (ret)
 		PL_ERR("Failed to preallocate space: %d\n", ret);
-		goto out;
-	}
 
-	/* here must be the only place to change file_size */
 	spin_lock_irqsave(&ploop->bat_lock, flags);
-	if (top->file_size < new_len) {
+	/* sanity check*/
+	if (new_len < top->file_size) {
+		WARN_ONCE(1, "unexpected file size change\n");
+		spin_unlock_irqrestore(&ploop->bat_lock, flags);
+		return -EIO;
+	}
+
+	/* Update file size in case of success */
+	if (!ret)
 		top->file_size = new_len;
-	} else {
-		PL_ERR("unexpected file size change\n");
-	}
-	if (ploop->prealloc_size)
-		more = 1;
-	spin_unlock_irqrestore(&ploop->bat_lock, flags);
-	if (more) {
-		more = 0;
+
+	/* If we have no errors and still didn't allocate enough */
+	if (!ret && ploop->prealloc_requested > prealloc_size) {
+		ploop->prealloc_requested -= prealloc_size;
 		goto prealloc_more;
 	}
 
-out:
-	spin_lock_irqsave(&ploop->bat_lock, flags);
-	ploop->prealloc_in_progress = 0;
-	ploop->prealloc_size = 0;
+	ploop->prealloc_requested = 0;
+	ploop->preemptive_prealloc = false;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
-	/* notify if someone is waiting */
-	wake_up_interruptible(&ploop->dispatcher_wq_prealloc);
-
 	return ret;
 }
 ALLOW_ERROR_INJECTION(ploop_preallocate_cluster, ERRNO);
 
-void ploop_should_prealloc(struct ploop *ploop, struct file *file)
+void ploop_should_prealloc(struct ploop *ploop)
 {
 	struct ploop_delta *top = ploop_top_delta(ploop);
 	u32 dst_clu;
@@ -1237,9 +1234,9 @@ void ploop_should_prealloc(struct ploop *ploop, struct file *file)
 
 	pos = CLU_TO_POS(ploop, dst_clu);
 	end = pos + clu_size;
-	if (end > top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
-		ploop_req_prealloc(ploop, PREALLOC_SIZE);
-	}
+	if (top->file_size < PREALLOC_SIZE ||
+	    end > top->file_size - PREALLOC_SIZE / 2)
+		ploop_req_prealloc(ploop, true);
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 }
 
@@ -1249,7 +1246,7 @@ static int ploop_pending_prealloc(struct ploop *ploop)
 	unsigned long flags;
 
 	spin_lock_irqsave(&ploop->bat_lock, flags);
-	ret = !ploop->prealloc_size && !ploop->prealloc_in_progress;
+	ret = !ploop->prealloc_requested;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	return ret;
@@ -1259,7 +1256,7 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 {
 	struct ploop_delta *top = ploop_top_delta(ploop);
 	u32 clu_size = CLU_SIZE(ploop);
-	loff_t off, pos, end, old_size;
+	loff_t off, pos, end;
 	unsigned long flags;
 	int ret;
 	int retry_cnt = 0;
@@ -1278,21 +1275,17 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 	 */
 	ploop_hole_clear_bit(*dst_clu, ploop);
 
-	old_size = top->file_size;
 	prealloc_start = top->file_preallocated_area_start;
 	pos = CLU_TO_POS(ploop, *dst_clu);
 	end = pos + clu_size;
-	off = min_t(loff_t, old_size, end);
+	off = min_t(loff_t, top->file_size, end);
+
+	if (top->file_size - prealloc_start < PREALLOC_SIZE / 2)
+		ploop_req_prealloc(ploop, true);
+
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	if (pos < prealloc_start) {
-		if (end + clu_size >
-		    top->file_preallocated_area_start - (PREALLOC_SIZE/2)) {
-			spin_lock_irqsave(&ploop->bat_lock, flags);
-			ploop_req_prealloc(ploop, PREALLOC_SIZE);
-			spin_unlock_irqrestore(&ploop->bat_lock, flags);
-		}
-
 		/* Clu at @pos may contain dirty data */
 		if (!ploop->falloc_new_clu)
 			ret = ploop_punch_hole(file, pos, off - pos);
@@ -1316,9 +1309,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 retry_alloc:
 	spin_lock_irqsave(&ploop->bat_lock, flags);
 	/* size can change from parallel alloc */
-	old_size = top->file_size;
-	if (end > old_size) {
-		ploop_req_prealloc(ploop, PREALLOC_SIZE);
+	if (end > top->file_size) {
+		ploop_req_prealloc(ploop, false);
 		spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 		wait_event_interruptible(ploop->dispatcher_wq_prealloc,
@@ -1339,6 +1331,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file
 		}
 	}
 
+	if (end > top->file_preallocated_area_start)
+		top->file_preallocated_area_start = end;
 	spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
 	return 0;
@@ -2363,7 +2357,7 @@ int ploop_allocator(void *data)
 
 	for (;;) {
 		/* we do not care about exact number here - risk one extra schedule wrt taking the lock */
-		if (ploop->prealloc_size) {
+		if (ploop->prealloc_requested) {
 			__set_current_state(TASK_RUNNING);
 			file = ploop_top_delta(ploop)->file;
 			current->flags |= PLOOP_PFLAGS;
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index de549f7633af..525e8d7ceb1f 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -162,6 +162,7 @@ static bool ploop_empty_htable(struct hlist_head head[])
 
 static void ploop_destroy(struct ploop *ploop)
 {
+	struct ploop_delta *top = ploop_top_delta(ploop);
 	int i;
 
 	if (ploop->kt_worker) {
@@ -201,6 +202,11 @@ static void ploop_destroy(struct ploop *ploop)
 
 	for (i = 0; i < 2; i++)
 		percpu_ref_exit(&ploop->inflight_bios_ref[i]);
+
+	if (top->file_preallocated_area_start < top->file_size &&
+	    vfs_truncate2(&top->file->f_path, top->file_preallocated_area_start, top->file))
+		PL_ERR("Unable to truncate preallocated area on destroy");
+
 	/* Nobody uses it after destroy_workqueue() */
 	while (ploop->nr_deltas-- > 0) {
 		vfs_fsync(ploop->deltas[ploop->nr_deltas].file, 1);
@@ -563,8 +569,8 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	init_waitqueue_head(&ploop->dispatcher_wq_data);
 	init_waitqueue_head(&ploop->dispatcher_wq_fsync);
 	init_waitqueue_head(&ploop->dispatcher_wq_prealloc);
-	ploop->prealloc_size = 0;
-	ploop->prealloc_in_progress = 0;
+	ploop->preemptive_prealloc = false;
+	ploop->prealloc_requested = 0;
 
 	ploop->kt_worker = ploop_worker_create(ploop, ploop_worker, "d", 0);
 	if (!ploop->kt_worker)
@@ -602,7 +608,7 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->discards_supported = true;
 
 	if (ploop->nr_deltas > 0)
-		ploop_should_prealloc(ploop, ploop_top_delta(ploop)->file);
+		ploop_should_prealloc(ploop);
 
 	return 0;
 
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index db4d92c9679a..72cc38bbe931 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -264,8 +264,8 @@ struct ploop {
 	bool submit_enospc; /* timer expired run pios from dispatcher */
 	bool event_enospc;
 
-	loff_t prealloc_size;
-	loff_t prealloc_in_progress;
+	bool preemptive_prealloc;
+	loff_t prealloc_requested;
 	struct wait_queue_head dispatcher_wq_prealloc;
 };
 #define ploop_blk_queue(p) ((p)->ti->table->md->queue)
@@ -645,7 +645,7 @@ extern int ploop_allocator(void *data);
 extern void ploop_disable_writeback_delay(struct ploop *ploop);
 extern void ploop_enable_writeback_delay(struct ploop *ploop);
 
-extern void ploop_should_prealloc(struct ploop *ploop, struct file *file);
+extern void ploop_should_prealloc(struct ploop *ploop);
 extern void ploop_resubmit_enospc_pios(struct ploop *ploop);
 
 #endif /* __DM_PLOOP_H */
-- 
2.43.5



More information about the Devel mailing list