[Devel] [PATCH RHEL9 COMMIT] fs/ext4: swapext ioctl

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jan 23 21:53:09 MSK 2025


The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.4
------>
commit 6380f9df0f6756aaecb70427da5d87ea1752dcc0
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Sat Jan 18 02:08:52 2025 +0800

    fs/ext4: swapext ioctl
    
    It is not for mainstream, this is our local kludge,
    which is used to make huge optimizations particularly in csd,
    but not only in it.
    
    What does it do: it allows to swap range of blocks between two files.
    The code is borrowed from existing move_extent ioctl and fallocate.
    
    Obviously it is not a good idea to do this without mainstream review,
    but it is hopeless from this viewpoint as its mainstream-ready version
    preserving safety and transparency with user read/write would be extremely
    suboptimal and hairy. Even if this were done in mainstream we would have
    to keep a separate optimal branch. And the advantage is too huge just to
    waive this possibility.
    
    Signed-off-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
    Feature: ext4: swapext ioctl: swap range of blocks between two files
---
 fs/ext4/ext4.h        |  30 ++++-
 fs/ext4/extents.c     | 281 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ioctl.c       |  57 ++++++++++
 fs/ext4/move_extent.c | 308 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 675 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8dcee4361714..dbef41145ffc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -729,6 +729,7 @@ enum {
 #define EXT4_IOC_OPEN_BALLOON		_IO('f', 42)
 #define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
 #define EXT4_IOC_MFSYNC			_IO('f', 43)
+#define EXT4_IOC_SWAP_EXT		_IOWR('f', 43, struct ext4_swap_extent)
 #define EXT4_IOC_GETFSUUID		_IOR('f', 44, struct fsuuid)
 #define EXT4_IOC_SETFSUUID		_IOW('f', 44, struct fsuuid)
 
@@ -868,6 +869,23 @@ struct move_extent {
 	__u64 moved_len;	/* moved block length */
 };
 
+#define EXT4_SWAP_EXTENT_ASSERT_DONOR	1
+#define EXT4_SWAP_EXTENT_UNWRITE_DONOR	2
+#define EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR	4
+
+struct ext4_swap_extent {
+	__u32 instructions;	/* what to do? */
+	__u32 donor_fd;		/* donor file descriptor */
+	__u64 orig_start;	/* logical start offset in block for orig */
+	__u64 donor_start;	/* logical start offset in block for donor */
+	__u64 len;		/* block length to be moved */
+	__u64 moved_len;	/* moved block length */
+	__u32 s_dirty;		/* out: journal credits consumed */
+	__u32 s_donor_ext;	/* out: extent splits/updates on the donor side */
+	__u32 s_orig_ext;	/* out: extent splits/updates on the orig side */
+	__u32 s_pad;		/* reserved; zeroed by the kernel */
+};
+
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
@@ -3754,9 +3772,19 @@ extern int ext4_get_es_cache(struct inode *inode,
 			     __u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
-				struct inode *inode2, ext4_lblk_t lblk1,
+			     struct inode *inode2, ext4_lblk_t lblk1,
 			     ext4_lblk_t lblk2,  ext4_lblk_t count,
 			     int mark_unwritten,int *err);
+struct ext4_swap_extent;
+extern int ext4_swap_extents_2(handle_t *handle, struct inode *inode1,
+		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se);
+extern int ext4_expose_extents_2(handle_t *handle, struct inode *inode,
+		      ext4_lblk_t lblk,
+		      ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se);
+extern int ext4_ioc_swap_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+				 __u64 donor_blk, __u64 len, __u32 insn,
+				 struct ext4_swap_extent *se);
 extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
 extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 				       int check_cred, int restart_cred,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e541d4a07fe7..09c6555b20a1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5782,6 +5782,287 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 	return replaced_count;
 }
 
+/*
+ * ext4_swap_extents_2 - swap the block mappings of two ranges, extent by
+ * extent.
+ *
+ * Swaps the mapping of @count blocks at @lblk1 of @inode1 with the blocks
+ * at @lblk2 of @inode2, splitting extents as needed so each swapped piece
+ * lies exactly on extent boundaries.  Unlike ext4_swap_extents(), a hole
+ * on either side or an unwritten donor extent fails with -ENODATA, and
+ * split/swap statistics are accumulated in @se.  When @insn contains
+ * EXT4_SWAP_EXTENT_UNWRITE_DONOR the donor extent is marked unwritten
+ * after the swap.
+ *
+ * The caller must hold i_data_sem of both inodes.  On error @*erp is set;
+ * the number of blocks swapped so far is returned either way.
+ */
+int
+ext4_swap_extents_2(handle_t *handle, struct inode *inode1,
+		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+
+	/* Drop the extent-status cache for both ranges up front */
+	*erp = ext4_es_remove_extent(inode1, lblk1, count);
+	if (unlikely(*erp))
+		return 0;
+	*erp = ext4_es_remove_extent(inode2, lblk2, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path1)) {
+			*erp = PTR_ERR(path1);
+			path1 = NULL;
+			/* NOTE: "finish" lives inside this branch; error paths
+			 * below jump back here to terminate the loop through
+			 * the common cleanup at "repeat".
+			 */
+finish:
+			count = 0;
+			goto repeat;
+		}
+		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path2)) {
+			*erp = PTR_ERR(path2);
+			path2 = NULL;
+			goto finish;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have something to swap ? */
+		if (unlikely(!ex2 || !ex1)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		/* Donor must be fully mapped and written at lblk2 */
+		if (!in_range(lblk2, e2_blk, e2_len) ||
+		    ext4_ext_is_unwritten(ex2)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		/* Hole at inode1, must move extent 2
+		 * in place of hole and make hole at inode1.
+		 * For now fail as we force fallocate the range
+		 */
+		if (!in_range(lblk1, e1_blk, e1_len)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_orig_ext++;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2,  lblk2, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_donor_ext++;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary: swap at most up to the shorter
+		 * remaining extent on either side.
+		 */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1 + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_orig_ext++;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2, lblk2 + len, 0);
+			if (*erp)
+				goto finish;
+			se->s_donor_ext++;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		/* After splitting, both extents span exactly "len" blocks */
+		BUG_ON(e2_len != e1_len);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		/* Both extents are fully inside boundaries. Swap it now */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = ex2->ee_len;
+		ex2->ee_len = tmp_ex.ee_len;
+		if (insn & EXT4_SWAP_EXTENT_UNWRITE_DONOR)
+			ext4_ext_mark_unwritten(ex2);
+
+		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scary, eh? The second inode already points to the new
+		 * blocks, and it was successfully dirtied. But luckily an
+		 * error may happen only due to journal error, so the full
+		 * transaction will be aborted anyway.
+		 */
+		if (unlikely(*erp))
+			goto finish;
+
+		se->s_orig_ext++;
+		se->s_donor_ext++;
+
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+repeat:
+		ext4_free_ext_path(path1);
+		ext4_free_ext_path(path2);
+		path1 = path2 = NULL;
+	}
+	return replaced_count;
+}
+
+/*
+ * ext4_expose_extents_2 - convert unwritten extents to written in place
+ *
+ * Walks the extents covering [@lblk, @lblk + @count) of @inode.  Extents
+ * that are already written are skipped; unwritten ones are split at the
+ * range boundaries as needed and then marked written without touching
+ * their data blocks, "exposing" whatever is on disk.  A hole in the range
+ * fails with -ENODATA.  Split statistics are accumulated in @se.
+ *
+ * The caller must hold i_data_sem of @inode.  On error @*erp is set; the
+ * number of blocks processed so far is returned either way.
+ */
+int
+ext4_expose_extents_2(handle_t *handle, struct inode *inode,
+		      ext4_lblk_t lblk,
+		      ext4_lblk_t count, __u32 insn, int *erp, struct ext4_swap_extent *se)
+{
+	struct ext4_ext_path *path = NULL;
+	int replaced_count = 0;
+
+	*erp = ext4_es_remove_extent(inode, lblk, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex;
+		ext4_lblk_t e_blk;
+		int e_len, len;
+		int split = 0;
+
+		path = ext4_find_extent(inode, lblk, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path)) {
+			*erp = PTR_ERR(path);
+			path = NULL;
+			count = 0;
+			break;
+		}
+
+		ex = path[path->p_depth].p_ext;
+		if (unlikely(!ex)) {
+			*erp = -ENODATA;
+finish:
+			count = 0;
+			goto repeat;
+		}
+
+		e_blk = le32_to_cpu(ex->ee_block);
+		e_len = ext4_ext_get_actual_len(ex);
+
+		/* Hole: we do not allocate here, the range must be mapped */
+		if (!in_range(lblk, e_blk, e_len)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+
+		/* Already written: nothing to convert, just skip over it */
+		if (!ext4_ext_is_unwritten(ex)) {
+			ext4_lblk_t skip = e_blk + e_len;
+
+			if (skip > lblk + count)
+				skip = lblk + count;
+			count -= skip - lblk;
+			replaced_count += skip - lblk;
+			lblk += skip - lblk;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e_blk < lblk) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode,
+						&path, lblk, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_donor_ext++;
+		}
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e_blk + e_len - lblk)
+			len = e_blk + e_len - lblk;
+
+		if (len != e_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode,
+						&path, lblk + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+			se->s_donor_ext++;
+		}
+
+		/* ext4_split_extent_at() may result in leaf extent split,
+		 * path must be revalidated.
+		 */
+		if (split)
+			goto repeat;
+
+		*erp = ext4_ext_get_access(handle, inode, path + path->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		/* Mark the extent written: storing the actual length clears
+		 * the unwritten bit kept in ee_len.  ee_len is __le16 on
+		 * disk, so the host-order length must be converted with
+		 * cpu_to_le16() (cf. ext4_ext_mark_initialized()); storing
+		 * the raw int corrupts the extent on big-endian machines.
+		 */
+		ex->ee_len = cpu_to_le16(e_len);
+		se->s_donor_ext++;
+
+		ext4_ext_try_to_merge(handle, inode, path, ex);
+		*erp = ext4_ext_dirty(handle, inode, path +
+				      path->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		lblk += len;
+		replaced_count += len;
+		count -= len;
+
+repeat:
+		ext4_free_ext_path(path);
+		path = NULL;
+	}
+	return replaced_count;
+}
+
 /*
  * ext4_clu_mapped - determine whether any block in a logical cluster has
  *                   been mapped to a physical cluster
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index af70c299538b..22983db0a162 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1426,6 +1426,63 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return err;
 	}
 
+	case EXT4_IOC_SWAP_EXT: {
+		struct ext4_swap_extent se;
+		struct fd donor;
+		int err;
+
+		/* Acceptor must be open for both read and write */
+		if (!(filp->f_mode & FMODE_READ) ||
+		    !(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		/* Fixed: the casts named nonexistent tags ("struct
+		 * swap_extent" / "struct ext_swap_extent" below), silently
+		 * declaring new incomplete types.  copy_*_user() took them
+		 * as void * anyway, but the cast must use the real type.
+		 */
+		if (copy_from_user(&se,
+			(struct ext4_swap_extent __user *)arg, sizeof(se)))
+			return -EFAULT;
+		/* Output fields always start from a clean slate */
+		se.moved_len = 0;
+		se.s_dirty = 0;
+		se.s_donor_ext = 0;
+		se.s_orig_ext = 0;
+		se.s_pad = 0;
+
+		if (se.instructions & ~(EXT4_SWAP_EXTENT_ASSERT_DONOR|
+					EXT4_SWAP_EXTENT_UNWRITE_DONOR|
+					EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR))
+			return -EOPNOTSUPP;
+		/* For now we still implement only forced instructions */
+		if ((se.instructions & ~EXT4_SWAP_EXTENT_UNWRITE_DONOR) !=
+		    (EXT4_SWAP_EXTENT_ASSERT_DONOR|EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR))
+			return -EOPNOTSUPP;
+
+		donor = fdget(se.donor_fd);
+		if (!donor.file)
+			return -EBADF;
+
+		if (!(donor.file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			goto sext_out;
+		}
+
+		if (ext4_has_feature_bigalloc(sb) || IS_DAX(inode)) {
+			err = -EOPNOTSUPP;
+			goto sext_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto sext_out;
+
+		err = ext4_ioc_swap_extents(filp, donor.file, se.orig_start,
+					    se.donor_start, se.len, se.instructions, &se);
+		mnt_drop_write_file(filp);
+
+		/* Report statistics and moved_len even on failure */
+		if (copy_to_user((struct ext4_swap_extent __user *)arg,
+				 &se, sizeof(se)))
+			err = -EFAULT;
+sext_out:
+		fdput(donor);
+		return err;
+	}
+
 	case EXT4_IOC_MOVE_EXT: {
 		struct move_extent me;
 		struct fd donor;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 546204e286ee..aa9b6329e4af 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -701,3 +701,311 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 
 	return ret;
 }
+
+/*
+ * swap_one_extent - swap blocks for a single donor extent under its own
+ * journal handle.
+ *
+ * Starts a handle sized by ext4_chunk_trans_blocks() for @len blocks plus
+ * @nexts extra credits for a possibly fragmented acceptor range, takes
+ * i_data_sem of both inodes and runs ext4_swap_extents_2().  Retries the
+ * whole transaction on -ENOSPC and, a limited number of times, on -EBUSY
+ * after forcing a journal commit.  Returns the number of blocks swapped;
+ * the error, if any, is reported through @*err.
+ */
+static int
+swap_one_extent(struct file *o_filp, struct inode *donor_inode,
+		ext4_lblk_t orig_blk, ext4_lblk_t donor_blk,
+		ext4_lblk_t len, __u32 insn, int *err, struct ext4_swap_extent *se,
+		unsigned int nexts)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	handle_t *handle;
+	int jblocks, retries = 0;
+	int replaced_count = 0;
+	struct super_block *sb = orig_inode->i_sb;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and donor_inode may change each different metadata blocks.
+	 */
+again:
+	*err = 0;
+	/* XXX. Check. 1 block takes ext4_writepage_trans_blocks() (wouldbe)
+	 * but extent can span 1 more bitmap block, that's why +1. And acceptor can have heavily
+	 * fragmented area, up to len pages.
+	 */
+	jblocks = ext4_chunk_trans_blocks(donor_inode, len) + nexts;
+	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	replaced_count = ext4_swap_extents_2(handle, orig_inode, donor_inode,
+					   orig_blk, donor_blk, len,
+					   insn, err, se);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
+
+	/* Account journal credits consumed (requested minus remaining) */
+	se->s_dirty += handle->h_requested_credits - handle->h_total_credits;
+
+	ext4_journal_stop(handle);
+	if (*err == -ENOSPC &&
+	    ext4_should_retry_alloc(sb, &retries))
+		goto again;
+	/* Buffer was busy because probably is pinned to journal transaction,
+	 * force transaction commit may help to free it.
+	 */
+	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
+		goto again;
+	return replaced_count;
+}
+
+/*
+ * expose_one_extent - run ext4_expose_extents_2() for one range under a
+ * freshly started journal handle.
+ *
+ * Used for the in-place (orig == donor) case: converts unwritten extents
+ * in [@donor_blk, @donor_blk + @len) to written.  Takes i_data_sem of
+ * @donor_inode, retries on -ENOSPC and (a limited number of times) on
+ * -EBUSY after forcing a journal commit.  Returns the number of blocks
+ * processed; the error, if any, is reported through @*err.
+ */
+static int
+expose_one_extent(struct inode *donor_inode,
+		  ext4_lblk_t donor_blk,
+		  ext4_lblk_t len, __u32 insn, int *err, struct ext4_swap_extent *se)
+{
+	handle_t *handle;
+	int jblocks, retries = 0;
+	int replaced_count = 0;
+	struct super_block *sb = donor_inode->i_sb;
+
+again:
+	*err = 0;
+	jblocks = ext4_chunk_trans_blocks(donor_inode, len);
+	handle = ext4_journal_start(donor_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	down_write(&EXT4_I(donor_inode)->i_data_sem);
+	replaced_count = ext4_expose_extents_2(handle, donor_inode,
+					       donor_blk, len,
+					       insn, err, se);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+	/* Account journal credits consumed (requested minus remaining) */
+	se->s_dirty += handle->h_requested_credits - handle->h_total_credits;
+	ext4_journal_stop(handle);
+	if (*err == -ENOSPC &&
+	    ext4_should_retry_alloc(sb, &retries))
+		goto again;
+	/* Buffer was busy because probably is pinned to journal transaction,
+	 * force transaction commit may help to free it.
+	 */
+	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
+		goto again;
+	return replaced_count;
+}
+
+/* Estimate extra journal credits needed to rewrite the extents of @inode
+ * overlapping [@start, @start + @len): sums one ext4_chunk_trans_blocks()
+ * worth per extent.  On lookup failure or when a hole is found, @*erp is
+ * set and the credits accumulated so far are returned.
+ */
+static unsigned int count_credits(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len,
+				  int *erp)
+{
+	unsigned int nexts = 0;
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	ext4_lblk_t end = start + len;
+	ext4_lblk_t e_blk, e_len;
+
+	*erp = 0;
+
+	while (start < end) {
+		path = ext4_find_extent(inode, start, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path)) {
+			*erp = PTR_ERR(path);
+			return nexts;
+		}
+		ex = path[path->p_depth].p_ext;
+		if (unlikely(!ex)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+		e_blk = le32_to_cpu(ex->ee_block);
+		e_len = ext4_ext_get_actual_len(ex);
+		/* Hole in the range: caller treats this as an error */
+		if (!in_range(start, e_blk, e_len)) {
+			*erp = -ENODATA;
+			goto finish;
+		}
+		/* Charge only the part of the extent inside [start, end) */
+		if (e_blk + e_len <= end)
+			nexts += ext4_chunk_trans_blocks(inode, e_blk + e_len - start);
+		else
+			nexts += ext4_chunk_trans_blocks(inode, end - start);
+		start = e_blk + e_len;
+		ext4_free_ext_path(path);
+	}
+	return nexts;
+
+finish:
+	ext4_free_ext_path(path);
+	return nexts;
+}
+
+/*
+ * ext4_ioc_swap_extents - swap a range of blocks between two files
+ *
+ * Top-level worker for EXT4_IOC_SWAP_EXT.  Swaps @len blocks starting at
+ * logical block @orig_blk of @o_filp (the acceptor) with the blocks at
+ * @donor_blk of @d_filp (the donor), walking the donor extent tree one
+ * extent at a time.  When donor and acceptor are the same inode,
+ * unwritten donor extents are converted to written in place instead.
+ * Statistics and the swapped length are returned through @se.
+ *
+ * Both files must be regular files on the same non-bigalloc, non-DAX,
+ * non-data-journaled, unencrypted filesystem with 4k blocks.
+ */
+int
+ext4_ioc_swap_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		      __u64 donor_blk, __u64 len, __u32 insn, struct ext4_swap_extent *se)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	struct inode *donor_inode = file_inode(d_filp);
+	struct ext4_ext_path *path = NULL;
+	ext4_lblk_t d_end, d_start = donor_blk;
+	ext4_lblk_t o_start = orig_blk;
+	int bits = orig_inode->i_blkbits;
+	int ret;
+
+	if (orig_inode->i_sb != donor_inode->i_sb)
+		return -EINVAL;
+
+	if (donor_inode->i_blkbits != bits)
+		return -EINVAL;
+
+	/* Lazy to audit the case of block != 4096. Some day. */
+	if (bits != 12)
+		return -EINVAL;
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode))
+		return -EINVAL;
+
+	/* TODO: it's not obvious how to swap blocks for inodes with full
+	 * journaling enabled
+	 */
+	if (ext4_should_journal_data(orig_inode) ||
+	    ext4_should_journal_data(donor_inode))
+		return -EOPNOTSUPP;
+
+	if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode))
+		return -EOPNOTSUPP;
+
+	/* Actually we should assert presence of dirty/writeback pages in the region
+	 * and immediately bail out in the case we see them. But I did not find any public interface
+	 * to do this.
+	 */
+	if (orig_inode != donor_inode) {
+		ret = filemap_write_and_wait_range(orig_inode->i_mapping,
+						   (loff_t)orig_blk << bits,
+						   ((loff_t)(orig_blk + len) << bits) - 1);
+		if (ret)
+			return ret;
+	}
+
+	ret = filemap_write_and_wait_range(donor_inode->i_mapping,
+					   (donor_blk << bits),
+					   ((loff_t)(donor_blk + len) << bits) - 1);
+	if (ret)
+		return ret;
+
+	if (orig_inode != donor_inode && (insn & EXT4_SWAP_EXTENT_ALLOC_ACCEPTOR)) {
+		/* Fill holes in acceptor.
+		 * NOTE: holes are still not implemented so that this insn is mandatory
+		 */
+		ret = ext4_fallocate(o_filp, 0, (loff_t)orig_blk << bits,
+				     (loff_t)len << bits);
+		if (ret)
+			return ret;
+	}
+
+	/* Protect orig and donor inodes against a truncate */
+	if (orig_inode != donor_inode)
+		lock_two_nondirectories(orig_inode, donor_inode);
+	else
+		inode_lock(donor_inode);
+
+	/* Protect extent tree against block allocations via delalloc */
+	if (orig_inode != donor_inode)
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	else
+		down_write(&EXT4_I(donor_inode)->i_data_sem);
+
+	/* Check the filesystem environment whether move_extent can be done */
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
+	if (ret)
+		goto out;
+
+	d_end = d_start + len;
+
+	while (d_start < d_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk;
+		int cur_len;
+		int rc;
+
+		ret = get_ext_path(donor_inode, d_start, &path);
+		if (ret)
+			goto out;
+		ex = path[path->p_depth].p_ext;
+		if (!ex) {
+			ret = -ENODATA;
+			goto out;
+		}
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* We did not implement !EXT4_SWAP_EXTENT_ASSERT_DONOR, so that donor
+		 * must not have holes and unwritten extents in donor area
+		 */
+		if (d_start > cur_blk + cur_len - 1 || d_start < cur_blk) {
+			ret = -ENODATA;
+			goto out;
+		}
+		if (cur_blk < d_start) {
+			/* Trim the leading part of the extent before d_start.
+			 * Fixed: the original subtracted (cur_blk - d_start),
+			 * a negative value, which *grew* cur_len past the end
+			 * of the extent.
+			 */
+			cur_len -= d_start - cur_blk;
+			cur_blk = d_start;
+		}
+		if (cur_blk + cur_len > d_end)
+			cur_len = d_end - cur_blk;
+		/* Why one extent? I dunno. I have no idea how transaction credits work */
+		if (ext4_ext_is_unwritten(ex)) {
+			if (orig_inode != donor_inode) {
+				ret = -ENODATA;
+				goto out;
+			}
+			up_write(&EXT4_I(donor_inode)->i_data_sem);
+			rc = expose_one_extent(donor_inode, d_start, cur_len, insn, &ret, se);
+			down_write(&EXT4_I(donor_inode)->i_data_sem);
+		} else if (orig_inode != donor_inode) {
+			unsigned int nexts = count_credits(orig_inode, o_start, cur_len, &ret);
+
+			if (ret < 0)
+				goto out;
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
+			rc = swap_one_extent(o_filp, donor_inode,
+					     o_start, d_start,
+					     cur_len, insn, &ret, se, nexts);
+			ext4_double_down_write_data_sem(orig_inode, donor_inode);
+		} else {
+			/* Same inode, extent already written: nothing to do */
+			ret = 0;
+			rc = cur_len;
+		}
+		if (ret < 0)
+			break;
+		o_start += rc;
+		d_start += rc;
+	}
+	se->moved_len = d_start - donor_blk;
+
+out:
+	if (se->moved_len) {
+		if (orig_inode != donor_inode)
+			ext4_discard_preallocations(orig_inode, 0);
+		ext4_discard_preallocations(donor_inode, 0);
+	}
+
+	ext4_free_ext_path(path);
+	if (orig_inode != donor_inode)
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
+	else
+		up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+	if (se->moved_len) {
+		/* Drop invalid page cache. Until now concurrent reads could return data mixed
+		 * at random from donor and acceptor. This is not our problem - user must take
+		 * care of blocking swapped range. But upon exit from syscall everything must stay
+		 * coherent.
+		 */
+		if (orig_inode != donor_inode)
+			truncate_pagecache_range(orig_inode, (loff_t)orig_blk << bits,
+						 (((loff_t)orig_blk + se->moved_len) << bits) - 1);
+		truncate_pagecache_range(donor_inode, (loff_t)donor_blk << bits,
+					 (((loff_t)donor_blk + se->moved_len) << bits) - 1);
+	}
+
+	if (orig_inode != donor_inode)
+		unlock_two_nondirectories(orig_inode, donor_inode);
+	else
+		inode_unlock(donor_inode);
+
+	return ret;
+}


More information about the Devel mailing list