[Devel] [PATCH RHEL9 COMMIT] fs: fuse: kio: resolve race condition fallocate vs shrink

Konstantin Khorenko khorenko at virtuozzo.com
Thu Nov 14 00:15:31 MSK 2024


The commit is pushed to "branch-rh9-5.14.0-427.37.1.vz9.78.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.37.1.vz9.78.3
------>
commit ead9b9aec946bcf8f4395f89e82310d180221f0b
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Fri Nov 8 00:01:10 2024 +0800

    fs: fuse: kio: resolve race condition fallocate vs shrink
    
    It was missed kio/pcs module has warning on out of mutex execution
    of fallocate request. It is not a bug per se, yet the situation
    is dubious. The rationale was that logically fallocate(zerorange) is
    _not_ different of aio direct write of zeros to the same area,
    and that write used to be executed out of inode_lock. So asserting
    the lock in fallocate might be a noise.
    
    Yet we must comply completely. There is one difference, aio takes
    dio_write counter, which blocks concurrent truncates until write
    is completed. This is important mostly for shrinks, as expanding
    writes are not executed asynchronously, fallocates likewise.
    
    So, we take dio_write for async fallocate out of inode_lock,
    making it 100% equivalent to write.
    
    NOTE: the situation with expanding writes/fallocates is complicated.
    We might even have a zero day bug here, I do not see one now, but this does
    not mean we did not have there any holes. Just a note.
    
    Affects: #VSTOR-94829
    https://virtuozzo.atlassian.net/browse/VSTOR-94829
    
    Signed-off-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
    Feature: vStorage
---
 fs/fuse/file.c                     | 11 +++++++++--
 fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 21 ++++++++++++++++-----
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7bf7c9e45b98..195b77114d02 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3450,8 +3450,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
-	if (revoke_lock)
+	if (revoke_lock) {
+		/* This blocks truncates which might happen after
+		 * inode_lock is released
+		 */
+		fuse_write_dio_begin(fi);
 		inode_unlock(inode);
+	}
 
 	args.opcode = FUSE_FALLOCATE;
 	args.io_inode = inode;
@@ -3460,8 +3465,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	args.in_args[0].size = sizeof(inarg);
 	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fm, &args);
-	if (revoke_lock)
+	if (revoke_lock) {
 		inode_lock(inode);
+		fuse_write_dio_end(fi);
+	}
 	if (err == -ENOSYS) {
 		fm->fc->no_fallocate = 1;
 		err = -EOPNOTSUPP;
diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
index 1bd43ecf3b74..5b141fe5bbd4 100644
--- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -1019,19 +1019,19 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req)
 		break;
 	case FUSE_FALLOCATE: {
 		struct fuse_fallocate_in *inarg = (void*) args->in_args[0].value;
+		size_t sz = READ_ONCE(di->fileinfo.attr.size);
 
 		if (pfc->fc->no_fallocate) {
 			req->out.h.error = -EOPNOTSUPP;
 			goto error;
 		}
 
-		if (inarg->offset >= di->fileinfo.attr.size)
+		if (inarg->offset >= sz)
 			inarg->mode &= ~FALLOC_FL_ZERO_RANGE;
 
 		if (inarg->mode == FALLOC_FL_KEEP_SIZE)
 			break; /* NOPE */
 
-		WARN_ON_ONCE(!inode_is_locked(&fi->inode));
 		if (inarg->mode & (FALLOC_FL_ZERO_RANGE|FALLOC_FL_PUNCH_HOLE)) {
 			if ((inarg->offset & (PAGE_SIZE - 1)) || (inarg->length & (PAGE_SIZE - 1))) {
 				req->out.h.error = -EINVAL;
@@ -1040,10 +1040,21 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req)
 		}
 
 		if (inarg->mode & FALLOC_FL_KEEP_SIZE) {
-			if (inarg->offset > di->fileinfo.attr.size)
+			if (inarg->offset > sz)
 				break; /* NOPE */
-			if (inarg->offset + inarg->length > di->fileinfo.attr.size)
-				inarg->length = di->fileinfo.attr.size - inarg->offset;
+			if (inarg->offset + inarg->length > sz)
+				inarg->length = sz - inarg->offset;
+		} else {
+			/* attr.size can expand, but not shrink. If the file is expanded
+			 * by fallocate inode_lock cannot be dropped.
+			 *
+			 * Interesting new situation to consider: update i_size happens of exit
+			 * from allocating write/falloc and we have a window where attr.size is ahead
+			 * of i_size. It looks safe as allocating call is deemed to be synchronous,
+			 * and as i_size is still not advanced all the following ones are.
+			 */
+			WARN_ON_ONCE(inarg->offset + inarg->length > sz &&
+				     !inode_is_locked(&fi->inode));
 		}
 
 		ret = pcs_fuse_prep_rw(r);


More information about the Devel mailing list