[Devel] [PATCH VZ9 3/3] fs: fuse: relax serialization of fallocate

Alexey Kuznetsov kuznet at virtuozzo.com
Fri Oct 25 17:25:34 MSK 2024


Drop inode_lock while executing fallocate.

This is a kludge. Why this cannot be a genuine thing.

Traditionally all synchchronous linux file operations are
strictly serialized. The axiom is that file modifications
are to be atomic in the sense that if two writes are issued
from different threads simultaneously they can be applied
in unpredictable order, but nevertheless only as whole:
either one goes first or the other one. This patch violates
this property for fallocate.

Note that AIO calls do not respect this property, though
they are strutted f.e. if AIO expands file size it is
forced to be synchronous and fully serialized, but the property
is still violated. We do the same thing for fallocate.

Why serialization does not hurt local fs. Because fallocate
like write through page cache is not actual disk operation.
It is done in cache and only commit makes actual disk access.

What would be a honest solution. First, get rid threaded IO
in applications, it is deficient in any case, just wastes resources
blocking of inode_mutex. We could instead introduse zero_range/punch_hole
to aio/io_uring, essentially it is aio_write(zeros) yet.
But this is too radical. Second, we could implement something
like cached fallocate, it is doable, moreover fuse already
has this code used to background writeback, but it is still chumbersome.
Probably we will go this way one day. Third, the weakest,
we can introduce "area lock" for affected area, which will block
writes/reads/truncates/fallocates affecting overlapped areas.
It is possible, but not as a hotfix, the fix would have impact
on usual io paths.

Current opinion: stay with this fix for a while. If we (or someone)
has tests deliberately testing atomicity, they will fail, but
other than that it is going to be just fine. Key check:
if application issues fallocate from threads and simulteneosly
does AIO, and it will not break if threaded synchronous
zero_range is equivalent to aio_write(zeros), we are good to go.

Affects: #VSTOR-94452

Signed-off-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
---
 fs/fuse/file.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e2133cc..41225f5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -23,6 +23,9 @@
 #include <linux/file.h>
 #include <linux/jhash.h>
 
+unsigned int relax_fallocate = 1;
+module_param(relax_fallocate, uint, 0644);
+
 struct workqueue_struct *fuse_fput_wq;
 static DEFINE_SPINLOCK(fuse_fput_lock);
 static LIST_HEAD(fuse_fput_head);
@@ -1546,7 +1549,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
 	int err = 0;
 	ssize_t res = 0;
 
-	if (inode->i_size < pos + iov_iter_count(ii))
+	if (!fc->writeback_cache && inode->i_size < pos + iov_iter_count(ii))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	do {
@@ -1585,7 +1588,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
 	if (res > 0)
 		fuse_write_update_size(inode, pos);
 
-	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+	if (!fc->writeback_cache)
+		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 	fuse_invalidate_attr(inode);
 
 	return res > 0 ? res : err;
@@ -3370,13 +3374,18 @@ static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
 	return ret;
 }
 
-static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
+static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end, int revoke_lock)
 {
 	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 
 	if (!err) {
-		fuse_sync_writes(inode);
-		fuse_read_dio_wait(get_fuse_inode(inode));
+		pgoff_t idx_from = start >> PAGE_SHIFT;
+		pgoff_t idx_to = end >> PAGE_SHIFT;
+
+		if (!revoke_lock || fuse_range_is_writeback(inode, idx_from, idx_to)) {
+			fuse_sync_writes(inode);
+			fuse_read_dio_wait(get_fuse_inode(inode));
+		}
 	}
 
 	return err;
@@ -3400,6 +3409,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	bool block_faults = FUSE_IS_DAX(inode) &&
 		(!(mode & FALLOC_FL_KEEP_SIZE) ||
 		 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
+	int revoke_lock = 0;
 
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 		     FALLOC_FL_ZERO_RANGE))
@@ -3414,12 +3424,15 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err)
 			goto out;
-	}
+	} else if (relax_fallocate && fm->fc->close_wait &&
+		   ((mode & FALLOC_FL_KEEP_SIZE) ||
+		    offset + length <= i_size_read(inode)))
+		revoke_lock = 1;
 
 	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
 		loff_t endbyte = offset + length - 1;
 
-		err = fuse_writeback_range(inode, offset, endbyte);
+		err = fuse_writeback_range(inode, offset, endbyte, revoke_lock);
 		if (err)
 			goto out;
 	}
@@ -3435,9 +3448,12 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (err)
 		goto out;
 
-	if (!(mode & FALLOC_FL_KEEP_SIZE))
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && fm->fc->writeback_cache)
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
+	if (revoke_lock)
+		inode_unlock(inode);
+
 	args.opcode = FUSE_FALLOCATE;
 	args.io_inode = inode;
 	args.nodeid = ff->nodeid;
@@ -3445,6 +3461,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	args.in_args[0].size = sizeof(inarg);
 	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fm, &args);
+	if (revoke_lock)
+		inode_lock(inode);
 	if (err == -ENOSYS) {
 		fm->fc->no_fallocate = 1;
 		err = -EOPNOTSUPP;
@@ -3466,7 +3484,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	fuse_invalidate_attr(inode);
 
 out:
-	if (!(mode & FALLOC_FL_KEEP_SIZE))
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && fm->fc->writeback_cache)
 		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	if (block_faults)
@@ -3512,7 +3530,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 		return -EXDEV;
 
 	inode_lock(inode_in);
-	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1, 0);
 	inode_unlock(inode_in);
 	if (err)
 		return err;
@@ -3541,7 +3559,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * To fix this a i_mmap_sem style lock could be used to prevent new
 	 * faults while the copy is ongoing.
 	 */
-	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1, 0);
 	if (err)
 		goto out;
 
-- 
1.8.3.1



More information about the Devel mailing list