[Devel] [PATCH VZ9 3/10] fs/ext4: swapext ioctl

Alexey Kuznetsov kuznet at virtuozzo.com
Fri Jan 17 21:08:52 MSK 2025


This is not for mainstream; it is our local kludge, used to enable huge
optimizations, particularly in csd, but not only there.

What it does: it allows swapping a range of blocks between two files.
The code is borrowed from the existing move_extent ioctl and from fallocate.
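
For illustration, a minimal userspace sketch of the intended call (this is an
assumption about usage, not part of the patch; it presumes that struct
ext4_swap_extent, EXT4_IOC_SWAP_EXT and the EXT4_SWAP_EXTENT_* flags added to
fs/ext4/ext4.h below are copied into a header the application can include;
offsets and lengths are in filesystem blocks, which must be 4096 bytes):

    #include <sys/ioctl.h>
    #include <linux/types.h>

    /* Hypothetical helper: swap nr_blk blocks between an acceptor file
     * (opened O_RDWR) and a donor file (opened with write access). */
    static int swap_blocks(int orig_fd, int donor_fd,
                           __u64 orig_blk, __u64 donor_blk, __u64 nr_blk)
    {
            struct ext4_swap_extent se = {
                    /* only this "forced" combination is implemented so far */
                    .instructions = EXT4_SWAP_EXTENT_ASSERT_DONOR |
                                    EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR,
                    .donor_fd     = donor_fd,
                    .orig_start   = orig_blk,
                    .donor_start  = donor_blk,
                    .len          = nr_blk,
            };

            if (ioctl(orig_fd, EXT4_IOC_SWAP_EXT, &se) < 0)
                    return -1;
            /* se.moved_len reports how many blocks were actually swapped;
             * se.s_orig_ext/s_donor_ext count the extents touched. */
            return (int)se.moved_len;
    }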

Obviously it is not a good idea to do this without mainstream review, but
from that point of view it is hopeless: a mainstream-ready version that
preserves safety and transparency with respect to user read/write would be
extremely suboptimal and hairy. Even if it were done in mainstream, we would
have to keep a separate optimal branch. And the advantage is too big to just
give up on this possibility.

Also, a new FALLOC_FL_PREALLOCATE flag for fallocate is added. It behaves
just like a plain fallocate(0), but with one important property: from the
viewpoint of preallocation it is treated exactly like write(0), i.e. it uses
preallocation. Normally preallocation is disabled for fallocate; the argument
is that an application calling fallocate wants to keep full control over
space allocation. But when fallocate is used as an optimization to write
zeros without memory copies and block zeroing, there is no reason to suppress
normal preallocation.
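
For clarity, a hedged sketch of the intended userspace use of the new flag
(the 0x80 value comes from the include/uapi/linux/falloc.h hunk below; the
fallback define is only needed until the installed header is updated):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>

    #ifndef FALLOC_FL_PREALLOCATE
    #define FALLOC_FL_PREALLOCATE 0x80
    #endif

    /* Preallocate [offset, offset + len) like plain fallocate (the range
     * still reads as zeros), but let the block allocator apply its normal
     * write-style preallocation/normalization. */
    static int prealloc_like_write(int fd, off_t offset, off_t len)
    {
            return fallocate(fd, FALLOC_FL_PREALLOCATE, offset, len);
    }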

Signed-off-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
---
 fs/ext4/ext4.h              |  30 ++++-
 fs/ext4/extents.c           | 298 +++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/ioctl.c             |  57 ++++++++
 fs/ext4/move_extent.c       | 308 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/falloc.h      |   3 +-
 include/uapi/linux/falloc.h |   2 +
 6 files changed, 690 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8dcee43..dbef4114 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -729,6 +729,7 @@ enum {
 #define EXT4_IOC_OPEN_BALLOON		_IO('f', 42)
 #define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
 #define EXT4_IOC_MFSYNC			_IO('f', 43)
+#define EXT4_IOC_SWAP_EXT		_IOWR('f', 43, struct ext4_swap_extent)
 #define EXT4_IOC_GETFSUUID		_IOR('f', 44, struct fsuuid)
 #define EXT4_IOC_SETFSUUID		_IOW('f', 44, struct fsuuid)
 
@@ -868,6 +869,23 @@ struct move_extent {
 	__u64 moved_len;	/* moved block length */
 };
 
+#define EXT4_SWAP_EXTENT_ASSERT_DONOR	1
+#define EXT4_SWAP_EXTENT_UNWRITE_DONOR	2
+#define EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR	4
+
+struct ext4_swap_extent {
+	__u32 instructions;	/* what to do? */
+	__u32 donor_fd;		/* donor file descriptor */
+	__u64 orig_start;	/* logical start offset in block for orig */
+	__u64 donor_start;	/* logical start offset in block for donor */
+	__u64 len;		/* block length to be moved */
+	__u64 moved_len;	/* moved block length */
+	__u32 s_dirty;
+	__u32 s_donor_ext;
+	__u32 s_orig_ext;
+	__u32 s_pad;
+};
+
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
@@ -3754,9 +3772,19 @@ extern int ext4_get_es_cache(struct inode *inode,
 			     __u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
-				struct inode *inode2, ext4_lblk_t lblk1,
+			     struct inode *inode2, ext4_lblk_t lblk1,
 			     ext4_lblk_t lblk2,  ext4_lblk_t count,
 			     int mark_unwritten,int *err);
+struct ext4_swap_extent;
+extern int ext4_swap_extents_2(handle_t *handle, struct inode *inode1,
+		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se);
+extern int ext4_expose_extents_2(handle_t *handle, struct inode *inode,
+		      ext4_lblk_t lblk,
+		      ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se);
+extern int ext4_ioc_swap_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+				 __u64 donor_blk, __u64 len, __u32 insn,
+				 struct ext4_swap_extent *se);
 extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
 extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 				       int check_cred, int restart_cred,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74251ee..09c6555 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -35,6 +35,10 @@
 
 #include <trace/events/ext4.h>
 
+static unsigned int falloc_prealloc;
+module_param(falloc_prealloc, uint, 0644);
+MODULE_PARM_DESC(falloc_prealloc, "Use normal (write-style) preallocation for fallocate requests up to this many blocks");
+
 /*
  * used by extent splitting.
  */
@@ -4423,7 +4427,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
 
 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 				  ext4_lblk_t len, loff_t new_size,
-				  int flags)
+				  int flags, int mode)
 {
 	struct inode *inode = file_inode(file);
 	handle_t *handle;
@@ -4442,7 +4446,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	 * that it doesn't get unnecessarily split into multiple
 	 * extents.
 	 */
-	if (len <= EXT_UNWRITTEN_MAX_LEN)
+	if (!(mode & FALLOC_FL_PREALLOCATE) && len <= EXT_UNWRITTEN_MAX_LEN &&
+	   len > falloc_prealloc)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
 	/*
@@ -4588,7 +4593,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 				round_down(offset, 1 << blkbits) >> blkbits,
 				(round_up((offset + len), 1 << blkbits) -
 				 round_down(offset, 1 << blkbits)) >> blkbits,
-				new_size, flags);
+				new_size, flags, mode);
 		if (ret)
 			goto out_mutex;
 
@@ -4621,7 +4626,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
-					     flags);
+					     flags, mode);
 		filemap_invalidate_unlock(mapping);
 		if (ret)
 			goto out_mutex;
@@ -4694,7 +4699,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
-		     FALLOC_FL_INSERT_RANGE))
+		     FALLOC_FL_INSERT_RANGE | FALLOC_FL_PREALLOCATE))
 		return -EOPNOTSUPP;
 
 	inode_lock(inode);
@@ -4754,7 +4759,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (ret)
 		goto out;
 
-	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
+	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode);
 	if (ret)
 		goto out;
 
@@ -5777,6 +5782,287 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
 	return replaced_count;
 }
 
+int
+ext4_swap_extents_2(handle_t *handle, struct inode *inode1,
+		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+
+	*erp = ext4_es_remove_extent(inode1, lblk1, count);
+	if (unlikely(*erp))
+		return 0;
+	*erp = ext4_es_remove_extent(inode2, lblk2, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path1)) {
+			*erp = PTR_ERR(path1);
+			path1 = NULL;
+finish:
+			count = 0;
+			goto repeat;
+		}
+		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path2)) {
+			*erp = PTR_ERR(path2);
+			path2 = NULL;
+			goto finish;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have something to swap ? */
+		if (unlikely(!ex2 || !ex1)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		if (!in_range(lblk2, e2_blk, e2_len) ||
+		    ext4_ext_is_unwritten(ex2)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		/* Hole at inode1: we would have to move extent 2
+		 * into the hole and leave a hole behind instead.
+		 * For now fail, as we force fallocate of the range.
+		 */
+		if (!in_range(lblk1, e1_blk, e1_len)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_orig_ext++;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2,  lblk2, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_donor_ext++;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1 + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_orig_ext++;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2, lblk2 + len, 0);
+			if (*erp)
+				goto finish;
+			se->s_donor_ext++;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		BUG_ON(e2_len != e1_len);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		/* Both extents are fully inside the boundaries. Swap them now. */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = ex2->ee_len;
+		ex2->ee_len = tmp_ex.ee_len;
+		if (insn & EXT4_SWAP_EXTENT_UNWRITE_DONOR)
+			ext4_ext_mark_unwritten(ex2);
+
+		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scary, eh? The second inode already points to the new
+		 * blocks and was successfully dirtied. But luckily an error can
+		 * happen only due to a journal error, so the whole transaction
+		 * will be aborted anyway.
+		 */
+		if (unlikely(*erp))
+			goto finish;
+
+		se->s_orig_ext++;
+		se->s_donor_ext++;
+
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+repeat:
+		ext4_free_ext_path(path1);
+		ext4_free_ext_path(path2);
+		path1 = path2 = NULL;
+	}
+	return replaced_count;
+}
+
+int
+ext4_expose_extents_2(handle_t *handle, struct inode *inode,
+		      ext4_lblk_t lblk,
+		      ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se)
+{
+	struct ext4_ext_path *path = NULL;
+	int replaced_count = 0;
+
+	*erp = ext4_es_remove_extent(inode, lblk, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex;
+		ext4_lblk_t e_blk;
+		int e_len, len;
+		int split = 0;
+
+		path = ext4_find_extent(inode, lblk, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path)) {
+			*erp = PTR_ERR(path);
+			path = NULL;
+			count = 0;
+			break;
+		}
+
+		ex = path[path->p_depth].p_ext;
+		if (unlikely(!ex)) {
+			*erp = -ENODATA;
+finish:
+			count = 0;
+			goto repeat;
+		}
+
+		e_blk = le32_to_cpu(ex->ee_block);
+		e_len = ext4_ext_get_actual_len(ex);
+
+		if (!in_range(lblk, e_blk, e_len)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		if (!ext4_ext_is_unwritten(ex)) {
+			ext4_lblk_t skip = e_blk + e_len;
+
+			if (skip > lblk + count)
+				skip = lblk + count;
+			count -= skip - lblk;
+			replaced_count += skip - lblk;
+			lblk += skip - lblk;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e_blk < lblk) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode,
+						&path, lblk, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_donor_ext++;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e_blk + e_len - lblk)
+			len = e_blk + e_len - lblk;
+
+		if (len != e_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode,
+						&path, lblk + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_donor_ext++;
+		}
+
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		*erp = ext4_ext_get_access(handle, inode, path + path->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		ex->ee_len = cpu_to_le16(e_len);
+		se->s_donor_ext++;
+
+		ext4_ext_try_to_merge(handle, inode, path, ex);
+		*erp = ext4_ext_dirty(handle, inode, path +
+				      path->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		lblk += len;
+		replaced_count += len;
+		count -= len;
+
+repeat:
+		ext4_free_ext_path(path);
+		path = NULL;
+	}
+	return replaced_count;
+}
+
 /*
  * ext4_clu_mapped - determine whether any block in a logical cluster has
  *                   been mapped to a physical cluster
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index af70c29..22983db 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1426,6 +1426,63 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return err;
 	}
 
+	case EXT4_IOC_SWAP_EXT: {
+		struct ext4_swap_extent se;
+		struct fd donor;
+		int err;
+
+		if (!(filp->f_mode & FMODE_READ) ||
+		    !(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&se,
+			(struct ext4_swap_extent __user *)arg, sizeof(se)))
+			return -EFAULT;
+		se.moved_len = 0;
+		se.s_dirty = 0;
+		se.s_donor_ext = 0;
+		se.s_orig_ext = 0;
+		se.s_pad = 0;
+
+		if (se.instructions & ~(EXT4_SWAP_EXTENT_ASSERT_DONOR|
+					EXT4_SWAP_EXTENT_UNWRITE_DONOR|
+					EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR))
+			return -EOPNOTSUPP;
+		/* For now we implement only the forced instruction combination */
+		if ((se.instructions & ~EXT4_SWAP_EXTENT_UNWRITE_DONOR) !=
+		    (EXT4_SWAP_EXTENT_ASSERT_DONOR|EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR))
+			return -EOPNOTSUPP;
+
+		donor = fdget(se.donor_fd);
+		if (!donor.file)
+			return -EBADF;
+
+		if (!(donor.file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			goto sext_out;
+		}
+
+		if (ext4_has_feature_bigalloc(sb) || IS_DAX(inode)) {
+			err = -EOPNOTSUPP;
+			goto sext_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto sext_out;
+
+		err = ext4_ioc_swap_extents(filp, donor.file, se.orig_start,
+					    se.donor_start, se.len, se.instructions, &se);
+		mnt_drop_write_file(filp);
+
+		if (copy_to_user((struct ext4_swap_extent __user *)arg,
+				 &se, sizeof(se)))
+			err = -EFAULT;
+sext_out:
+		fdput(donor);
+		return err;
+	}
+
 	case EXT4_IOC_MOVE_EXT: {
 		struct move_extent me;
 		struct fd donor;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 546204e..aa9b632 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -701,3 +701,311 @@
 
 	return ret;
 }
+
+static int
+swap_one_extent(struct file *o_filp, struct inode *donor_inode,
+		ext4_lblk_t orig_blk, ext4_lblk_t donor_blk,
+		ext4_lblk_t len, __u32 insn, int *err, struct ext4_swap_extent *se,
+		unsigned int nexts)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	handle_t *handle;
+	int jblocks, retries = 0;
+	int replaced_count = 0;
+	struct super_block *sb = orig_inode->i_sb;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because the
+	 * original and donor inodes may each modify different metadata blocks.
+	 */
+again:
+	*err = 0;
+	/* XXX: check. One block would take ext4_writepage_trans_blocks(), but an
+	 * extent can span one more bitmap block. The acceptor can also have a
+	 * heavily fragmented area, up to len separate extents, hence the nexts term.
+	 */
+	jblocks = ext4_chunk_trans_blocks(donor_inode, len) + nexts;
+	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	replaced_count = ext4_swap_extents_2(handle, orig_inode, donor_inode,
+					   orig_blk, donor_blk, len,
+					   insn, err, se);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
+
+	se->s_dirty += handle->h_requested_credits - handle->h_total_credits;
+
+	ext4_journal_stop(handle);
+	if (*err == -ENOSPC &&
+	    ext4_should_retry_alloc(sb, &retries))
+		goto again;
+	/* The buffer was busy, probably because it is pinned to a journal
+	 * transaction; forcing a transaction commit may help to free it.
+	 */
+	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
+		goto again;
+	return replaced_count;
+}
+
+static int
+expose_one_extent(struct inode *donor_inode,
+		  ext4_lblk_t donor_blk,
+		  ext4_lblk_t len, __u32 insn, int *err, struct ext4_swap_extent *se)
+{
+	handle_t *handle;
+	int jblocks, retries = 0;
+	int replaced_count = 0;
+	struct super_block *sb = donor_inode->i_sb;
+
+again:
+	*err = 0;
+	jblocks = ext4_chunk_trans_blocks(donor_inode, len);
+	handle = ext4_journal_start(donor_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	down_write(&EXT4_I(donor_inode)->i_data_sem);
+	replaced_count = ext4_expose_extents_2(handle, donor_inode,
+					       donor_blk, len,
+					       insn, err, se);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+	se->s_dirty += handle->h_requested_credits - handle->h_total_credits;
+	ext4_journal_stop(handle);
+	if (*err == -ENOSPC &&
+	    ext4_should_retry_alloc(sb, &retries))
+		goto again;
+	/* The buffer was busy, probably because it is pinned to a journal
+	 * transaction; forcing a transaction commit may help to free it.
+	 */
+	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
+		goto again;
+	return replaced_count;
+}
+
+static unsigned int count_credits(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len,
+				  int *erp)
+{
+	unsigned int nexts = 0;
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	ext4_lblk_t end = start + len;
+	ext4_lblk_t e_blk, e_len;
+
+	*erp = 0;
+
+	while (start < end) {
+		path = ext4_find_extent(inode, start, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path)) {
+			*erp = PTR_ERR(path);
+			return nexts;
+		}
+		ex = path[path->p_depth].p_ext;
+		if (unlikely(!ex)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+		e_blk = le32_to_cpu(ex->ee_block);
+		e_len = ext4_ext_get_actual_len(ex);
+		if (!in_range(start, e_blk, e_len)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+		if (e_blk + e_len <= end)
+			nexts += ext4_chunk_trans_blocks(inode, e_blk + e_len - start);
+		else
+			nexts += ext4_chunk_trans_blocks(inode, end - start);
+		start = e_blk + e_len;
+		ext4_free_ext_path(path);
+	}
+	return nexts;
+
+finish:
+	ext4_free_ext_path(path);
+	return nexts;
+}
+
+int
+ext4_ioc_swap_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		      __u64 donor_blk, __u64 len, __u32 insn, struct ext4_swap_extent *se)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	struct inode *donor_inode = file_inode(d_filp);
+	struct ext4_ext_path *path = NULL;
+	ext4_lblk_t d_end, d_start = donor_blk;
+	ext4_lblk_t o_start = orig_blk;
+	int bits = orig_inode->i_blkbits;
+	int ret;
+
+	if (orig_inode->i_sb != donor_inode->i_sb)
+		return -EINVAL;
+
+	if (donor_inode->i_blkbits != bits)
+		return -EINVAL;
+
+	/* Too lazy to audit the case of block size != 4096. Some day. */
+	if (bits != 12)
+		return -EINVAL;
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode))
+		return -EINVAL;
+
+	/* TODO: it's not obvious how to swap blocks for inodes with full
+	 * journaling enabled
+	 */
+	if (ext4_should_journal_data(orig_inode) ||
+	    ext4_should_journal_data(donor_inode))
+		return -EOPNOTSUPP;
+
+	if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode))
+		return -EOPNOTSUPP;
+
+	/* Actually we should check for dirty/writeback pages in the region and
+	 * immediately bail out if we see any. But I did not find any public
+	 * interface to do this.
+	 */
+	if (orig_inode != donor_inode) {
+		ret = filemap_write_and_wait_range(orig_inode->i_mapping,
+						   (loff_t)orig_blk << bits,
+						   ((loff_t)(orig_blk + len) << bits) - 1);
+		if (ret)
+			return ret;
+	}
+
+	ret = filemap_write_and_wait_range(donor_inode->i_mapping,
+					   (donor_blk << bits),
+					   ((loff_t)(donor_blk + len) << bits) - 1);
+	if (ret)
+		return ret;
+
+	if (orig_inode != donor_inode && (insn & EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR)) {
+		/* Fill holes in the acceptor.
+		 * NOTE: holes are not implemented yet, so this instruction is mandatory.
+		 */
+		ret = ext4_fallocate(o_filp, 0, (loff_t)orig_blk << bits,
+				     (loff_t)len << bits);
+		if (ret)
+			return ret;
+	}
+
+	/* Protect orig and donor inodes against a truncate */
+	if (orig_inode != donor_inode)
+		lock_two_nondirectories(orig_inode, donor_inode);
+	else
+		inode_lock(donor_inode);
+
+	/* Protect extent tree against block allocations via delalloc */
+	if (orig_inode != donor_inode)
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	else
+		down_write(&EXT4_I(donor_inode)->i_data_sem);
+
+	/* Check the filesystem environment whether move_extent can be done */
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
+	if (ret)
+		goto out;
+
+	d_end = d_start + len;
+
+	while (d_start < d_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk;
+		int cur_len;
+		int rc;
+
+		ret = get_ext_path(donor_inode, d_start, &path);
+		if (ret)
+			goto out;
+		ex = path[path->p_depth].p_ext;
+		if (!ex) {
+			ret = -ENODATA;
+			goto out;
+		}
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* We did not implement !EXT4_SWAP_EXTENT_ASSERT_DONOR, so the donor
+		 * must not have holes or unwritten extents in the donor area.
+		 */
+		if (d_start > cur_blk + cur_len - 1 || d_start < cur_blk) {
+			ret = -ENODATA;
+			goto out;
+		}
+		if (cur_blk < d_start) {
+			cur_len -= cur_blk - d_start;
+			cur_blk = d_start;
+		}
+		if (cur_blk + cur_len > d_end)
+			cur_len = d_end - cur_blk;
+		/* Why one extent? I dunno. I have no idea how transaction credits work */
+		if (ext4_ext_is_unwritten(ex)) {
+			if (orig_inode != donor_inode) {
+				ret = -ENODATA;
+				goto out;
+			}
+			up_write(&EXT4_I(donor_inode)->i_data_sem);
+			rc = expose_one_extent(donor_inode, d_start, cur_len, insn, &ret, se);
+			down_write(&EXT4_I(donor_inode)->i_data_sem);
+		} else if (orig_inode != donor_inode) {
+			unsigned int nexts = count_credits(orig_inode, o_start, cur_len, &ret);
+
+			if (ret < 0)
+				goto out;
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
+			rc = swap_one_extent(o_filp, donor_inode,
+					     o_start, d_start,
+					     cur_len, insn, &ret, se, nexts);
+			ext4_double_down_write_data_sem(orig_inode, donor_inode);
+		} else {
+			ret = 0;
+			rc = cur_len;
+		}
+		if (ret < 0)
+			break;
+		o_start += rc;
+		d_start += rc;
+	}
+	se->moved_len = d_start - donor_blk;
+
+out:
+	if (se->moved_len) {
+		if (orig_inode != donor_inode)
+			ext4_discard_preallocations(orig_inode, 0);
+		ext4_discard_preallocations(donor_inode, 0);
+	}
+
+	ext4_free_ext_path(path);
+	if (orig_inode != donor_inode)
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
+	else
+		up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+	if (se->moved_len) {
+		/* Drop the invalidated page cache. Until now concurrent reads could
+		 * return data mixed at random from donor and acceptor. That is not our
+		 * problem - the user must take care of blocking access to the swapped
+		 * range. But upon return from the syscall everything must be coherent.
+		 */
+		if (orig_inode != donor_inode)
+			truncate_pagecache_range(orig_inode, (loff_t)orig_blk << bits,
+						 (((loff_t)orig_blk + se->moved_len) << bits) - 1);
+		truncate_pagecache_range(donor_inode, (loff_t)donor_blk << bits,
+					 (((loff_t)donor_blk + se->moved_len) << bits) - 1);
+	}
+
+	if (orig_inode != donor_inode)
+		unlock_two_nondirectories(orig_inode, donor_inode);
+	else
+		inode_unlock(donor_inode);
+
+	return ret;
+}
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index f3f0b97..5e68961 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -30,7 +30,8 @@ struct space_resv {
 					 FALLOC_FL_COLLAPSE_RANGE |	\
 					 FALLOC_FL_ZERO_RANGE |		\
 					 FALLOC_FL_INSERT_RANGE |	\
-					 FALLOC_FL_UNSHARE_RANGE)
+					 FALLOC_FL_UNSHARE_RANGE |	\
+					 FALLOC_FL_PREALLOCATE)
 
 /* on ia32 l_start is on a 32-bit boundary */
 #if defined(CONFIG_X86_64)
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 51398fa..6243183 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -77,4 +77,6 @@
  */
 #define FALLOC_FL_UNSHARE_RANGE		0x40
 
+#define FALLOC_FL_PREALLOCATE		0x80
+
 #endif /* _UAPI_FALLOC_H_ */
-- 
1.8.3.1


