[Devel] [PATCH RHEL9 COMMIT] fs/fuse: relax serialization of fallocate

Konstantin Khorenko khorenko at virtuozzo.com
Fri Oct 25 19:33:06 MSK 2024


The commit is pushed to "branch-rh9-5.14.0-427.37.1.vz9.78.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.37.1.vz9.78.1
------>
commit cfdd1dfd9bfbf5eff20798cc994b8f66321e3a86
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Fri Oct 25 22:25:34 2024 +0800

    fs/fuse: relax serialization of fallocate
    
    Drop inode_lock while executing fallocate.
    
    This is a kludge. Why this cannot be a genuine thing.
    
    Traditionally all synchchronous linux file operations are
    strictly serialized. The axiom is that file modifications
    are to be atomic in the sense that if two writes are issued
    from different threads simultaneously they can be applied
    in unpredictable order, but nevertheless only as whole:
    either one goes first or the other one. This patch violates
    this property for fallocate.
    
    Note that AIO calls do not respect this property, though
    they are strutted f.e. if AIO expands file size it is
    forced to be synchronous and fully serialized, but the property
    is still violated. We do the same thing for fallocate.
    
    Why serialization does not hurt local fs. Because fallocate
    like write through page cache is not actual disk operation.
    It is done in cache and only commit makes actual disk access.
    
    What would be a honest solution. First, get rid threaded IO
    in applications, it is deficient in any case, just wastes resources
    blocking of inode_mutex. We could instead introduse zero_range/punch_hole
    to aio/io_uring, essentially it is aio_write(zeros) yet.
    But this is too radical. Second, we could implement something
    like cached fallocate, it is doable, moreover fuse already
    has this code used to background writeback, but it is still chumbersome.
    Probably we will go this way one day. Third, the weakest,
    we can introduce "area lock" for affected area, which will block
    writes/reads/truncates/fallocates affecting overlapped areas.
    It is possible, but not as a hotfix, the fix would have impact
    on usual io paths.
    
    Current opinion: stay with this fix for a while. If we (or someone)
    has tests deliberately testing atomicity, they will fail, but
    other than that it is going to be just fine. Key check:
    if application issues fallocate from threads and simultaneosly
    does AIO, and it will not break if threaded synchronous
    zero_range is equivalent to aio_write(zeros), we are good to go.
    
    Affects: #VSTOR-94452
    https://virtuozzo.atlassian.net/browse/VSTOR-94452
    
    Signed-off-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
    Feature: vStorage
---
 fs/fuse/file.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e2133ccde857..7bf7c9e45b98 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -23,6 +23,9 @@
 #include <linux/file.h>
 #include <linux/jhash.h>
 
+unsigned int relax_fallocate = 1;
+module_param(relax_fallocate, uint, 0644);
+
 struct workqueue_struct *fuse_fput_wq;
 static DEFINE_SPINLOCK(fuse_fput_lock);
 static LIST_HEAD(fuse_fput_head);
@@ -3370,13 +3373,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
-static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
+static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end, int revoke_lock)
 {
 	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 
 	if (!err) {
-		fuse_sync_writes(inode);
-		fuse_read_dio_wait(get_fuse_inode(inode));
+		pgoff_t idx_from = start >> PAGE_SHIFT;
+		pgoff_t idx_to = end >> PAGE_SHIFT;
+
+		if (!revoke_lock || fuse_range_is_writeback(inode, idx_from, idx_to)) {
+			fuse_sync_writes(inode);
+			fuse_read_dio_wait(get_fuse_inode(inode));
+		}
 	}
 
 	return err;
@@ -3400,6 +3408,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	bool block_faults = FUSE_IS_DAX(inode) &&
 		(!(mode & FALLOC_FL_KEEP_SIZE) ||
 		 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
+	int revoke_lock = 0;
 
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 		     FALLOC_FL_ZERO_RANGE))
@@ -3414,12 +3423,15 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err)
 			goto out;
-	}
+	} else if (relax_fallocate && fm->fc->close_wait &&
+		   ((mode & FALLOC_FL_KEEP_SIZE) ||
+		    offset + length <= i_size_read(inode)))
+		revoke_lock = 1;
 
 	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
 		loff_t endbyte = offset + length - 1;
 
-		err = fuse_writeback_range(inode, offset, endbyte);
+		err = fuse_writeback_range(inode, offset, endbyte, revoke_lock);
 		if (err)
 			goto out;
 	}
@@ -3438,6 +3450,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
+	if (revoke_lock)
+		inode_unlock(inode);
+
 	args.opcode = FUSE_FALLOCATE;
 	args.io_inode = inode;
 	args.nodeid = ff->nodeid;
@@ -3445,6 +3460,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	args.in_args[0].size = sizeof(inarg);
 	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fm, &args);
+	if (revoke_lock)
+		inode_lock(inode);
 	if (err == -ENOSYS) {
 		fm->fc->no_fallocate = 1;
 		err = -EOPNOTSUPP;
@@ -3512,7 +3529,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 		return -EXDEV;
 
 	inode_lock(inode_in);
-	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1, 0);
 	inode_unlock(inode_in);
 	if (err)
 		return err;
@@ -3541,7 +3558,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * To fix this a i_mmap_sem style lock could be used to prevent new
 	 * faults while the copy is ongoing.
 	 */
-	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1, 0);
 	if (err)
 		goto out;
 


More information about the Devel mailing list