[Devel] [PATCH RHEL10 COMMIT] fs/fuse: order FUSE_OPEN reply against FUSE_NOTIFY_INVAL_FILES

Konstantin Khorenko khorenko at virtuozzo.com
Fri Jun 19 13:34:04 MSK 2026


The commit is pushed to "branch-rh10-6.12.0-211.16.1.12.x.vz10-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh10-6.12.0-211.16.1.12.4.vz10
------>
commit c26931e7e7a48d9c1d89facd316654afecdb59cf
Author: Liu Kui <kui.liu at virtuozzo.com>
Date:   Fri Jun 19 11:55:34 2026 +0800

    fs/fuse: order FUSE_OPEN reply against FUSE_NOTIFY_INVAL_FILES
    
    The userspace daemon's send order of a FUSE_OPEN reply and a
    FUSE_NOTIFY_INVAL_FILES notification can be reversed by the time the
    kernel processes them. The kernel must distinguish the two cases:
    
      - reply sent before the notification: the open is stale and must
        be flagged FUSE_S_FAIL_IMMEDIATELY;
      - reply sent after the notification: the open is valid but must
        wait for fuse_inval_files_work() to complete before returning,
        or the caller hits IO errors on an inode still being invalidated.
    
    The FUSE_I_INVAL_FILES bit carries no ordering information and cannot
    tell these two cases apart. Add a per-fuse_dev write_seq that is bumped
    on every reply/notification write, saved to ff->open_seq on OPEN (and
    CREATE) replies and to fi->inval_files_seq on FUSE_NOTIFY_INVAL_FILES,
    and compared in fuse_link_rw_file() to flag stale opens.
    fuse_finish_open() then waits on FUSE_I_INVAL_FILES for the remaining
    case (reply sent after the notification).
    
    khorenko@ notes:
      fuse_link_rw_file() now runs only for close_wait connections. The
      rw_files list and the FUSE_NOTIFY_INVAL_FILES machinery are a
      vStorage-only extension, so populating rw_files for other mounts is
      pointless. fuse_request_fiemap() is adjusted to match: it skips files
      flagged FUSE_S_FAIL_IMMEDIATELY and falls back to write_files when
      rw_files is empty.
    
      The write_seq ordering is only meaningful when the daemon sends the
      FUSE_OPEN reply and the FUSE_NOTIFY_INVAL_FILES notification for a
      given inode over the same /dev/fuse instance and serializes writes on
      it. The vStorage daemon guarantees this, hence write_seq is kept
      per-fuse_dev and bumped without extra locking.
    
    Feature: vStorage
    https://virtuozzo.atlassian.net/browse/VSTOR-133093
    Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
---
 fs/fuse/dev.c    | 17 +++++++++----
 fs/fuse/dir.c    |  1 +
 fs/fuse/file.c   | 72 ++++++++++++++++++++++++++++++++++++++++----------------
 fs/fuse/fuse_i.h | 14 ++++++++++-
 fs/fuse/inode.c  | 23 +++++++++++-------
 5 files changed, 92 insertions(+), 35 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a3cb9c358fccf..937ea9418cc3c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2002,9 +2002,10 @@ static int fuse_notify_resend(struct fuse_conn *fc)
 	return 0;
 }
 
-static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
+static int fuse_notify_inval_files(struct fuse_dev *fud, unsigned int size,
 				   struct fuse_copy_state *cs)
 {
+	struct fuse_conn *fc = fud->fc;
 	struct fuse_notify_inval_files_out outarg;
 	int err = -EINVAL;
 
@@ -2018,7 +2019,7 @@ static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 
-	err = fuse_invalidate_files(fc, outarg.ino);
+	err = fuse_invalidate_files(fud, outarg.ino);
 
 	up_read(&fc->killsb);
 	return err;
@@ -2028,9 +2029,11 @@ static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
 	return err;
 }
 
-static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+static int fuse_notify(struct fuse_dev *fud, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
+	struct fuse_conn *fc = fud->fc;
+
 	/* Don't try to move pages (yet) */
 	cs->move_pages = 0;
 
@@ -2061,7 +2064,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		return fuse_notify_resend(fc);
 
 	case FUSE_NOTIFY_INVAL_FILES:
-		return fuse_notify_inval_files(fc, size, cs);
+		return fuse_notify_inval_files(fud, size, cs);
 
 	default:
 		fuse_copy_finish(cs);
@@ -2374,6 +2377,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 
+	/* Monotonic sequence number matching the daemon's reply/notification send order. */
+	fud->write_seq++;
+
 	err = -EINVAL;
 	if (nbytes < sizeof(struct fuse_out_header))
 		goto out;
@@ -2391,7 +2397,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 	 * and error contains notification code.
 	 */
 	if (!oh.unique) {
-		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
+		err = fuse_notify(fud, oh.error, nbytes - sizeof(oh), cs);
 		goto out;
 	}
 
@@ -2461,6 +2467,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 		list_del_init(&req->list);
 	spin_unlock(&fpq->lock);
 
+	req->args->reply_seq = fud->write_seq;
 	fuse_request_end(req);
 out:
 	return err ? err : nbytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 6637914c8f282..45ee4f1e4b683 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -693,6 +693,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
 	ff->fh = outopenp->fh;
 	ff->nodeid = outentry.nodeid;
 	ff->open_flags = outopenp->open_flags;
+	ff->open_seq = args.reply_seq;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
 	if (!inode) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e47784f091c20..40cded85037f0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -38,10 +38,11 @@ module_param(relax_fallocate, uint, 0644);
 
 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
 			  unsigned int open_flags, int opcode,
-			  struct fuse_open_out *outargp)
+			  struct fuse_open_out *outargp, u64 *reply_seq)
 {
 	struct fuse_open_in inarg;
 	FUSE_ARGS(args);
+	int err;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
@@ -62,7 +63,14 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
 	args.out_args[0].size = sizeof(*outargp);
 	args.out_args[0].value = outargp;
 
-	return fuse_simple_request(fm, &args);
+	err = fuse_simple_request(fm, &args);
+	/*
+	 * Propagate the dev write_seq so fuse_link_rw_file() can order
+	 * this open against any concurrent FUSE_NOTIFY_INVAL_FILES.
+	 */
+	*reply_seq = args.reply_seq;
+
+	return err;
 }
 
 struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
@@ -185,7 +193,7 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 		struct fuse_open_out *outargp = &ff->args->open_outarg;
 		int err;
 
-		err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
+		err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp, &ff->open_seq);
 		if (!err) {
 			ff->fh = outargp->fh;
 			ff->open_flags = outargp->open_flags;
@@ -238,6 +246,18 @@ static void fuse_link_write_file(struct file *file)
 	spin_unlock(&fi->lock);
 }
 
+static int fuse_wait_on_inval_files(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_mount(inode)->fc;
+
+	if (likely(!test_bit(FUSE_I_INVAL_FILES, &fi->state)))
+		return 0;
+
+	fuse_ktrace(fc, "waiting for invalidate_files on [%llu] to complete", fi->nodeid);
+	return wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
+}
+
 static void fuse_link_rw_file(struct file *file)
 {
 	struct inode *inode = file_inode(file);
@@ -245,14 +265,17 @@ static void fuse_link_rw_file(struct file *file)
 	struct fuse_file *ff = file->private_data;
 
 	spin_lock(&fi->lock);
-	if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
-		spin_lock(&ff->lock);
-		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
-		spin_unlock(&ff->lock);
-		fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff->fh, ff->nodeid);
-	}
 	if (list_empty(&ff->rw_entry))
 		list_add(&ff->rw_entry, &fi->rw_files);
+
+	/*
+	 * If this open reply was received before the last
+	 * FUSE_NOTIFY_INVAL_FILES on this inode, the open is
+	 * stale; fail it immediately.
+	 */
+	if (ff->open_seq < fi->inval_files_seq)
+		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+
 	spin_unlock(&fi->lock);
 }
 
@@ -278,9 +301,20 @@ int fuse_finish_open(struct inode *inode, struct file *file)
 	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 		fuse_link_write_file(file);
 
-	fuse_link_rw_file(file);
+	/* Only apply to vStorage */
+	if (fc->close_wait) {
+		fuse_link_rw_file(file);
 
-	return 0;
+		/*
+		 * Wait for fuse_inval_files_work() on this inode to complete
+		 * before returning, otherwise the caller can hit IO errors
+		 * on an inode still being invalidated.
+		 */
+		if (!test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+			err = fuse_wait_on_inval_files(inode);
+	}
+
+	return err;
 }
 
 static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
@@ -313,12 +347,9 @@ static int fuse_open(struct inode *inode, struct file *file)
 	if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
 		return -EINVAL;
 
-	if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
-		fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
-		err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
-		if (err)
-			return err;
-	}
+	err = fuse_wait_on_inval_files(inode);
+	if (err)
+		return err;
 
 	err = generic_file_open(inode, file);
 	if (err)
@@ -3833,14 +3864,15 @@ static int fuse_request_fiemap(struct inode *inode, u32 cur_max,
 	if (!list_empty(&fi->rw_files)) {
 		struct fuse_file *t_ff;
 		list_for_each_entry(t_ff, &fi->rw_files, rw_entry) {
-			if (!test_bit(FUSE_S_CLOSING, &t_ff->ff_state)) {
+			if (!test_bit(FUSE_S_CLOSING, &t_ff->ff_state) &&
+				!fuse_file_fail_immediately(t_ff)) {
 				ff = t_ff;
 				break;
 			}
 		}
-	}
-	if (!ff && !list_empty(&fi->write_files))
+	} else if (!list_empty(&fi->write_files)) {
 		ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
+	}
 	if (ff) {
 		fuse_file_get(ff);
 		inarg.fh = ff->fh;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 9dd84e9406d4c..8a86b62541ee5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -218,6 +218,9 @@ struct fuse_inode {
 
 	/** Entry on fc->inval_files_list list */
 	struct list_head inval_files_entry;
+
+	/** fud->write_seq when the last FUSE_NOTIFY_INVAL_FILES notification was received */
+	u64 inval_files_seq;
 };
 
 /** FUSE inode state bits */
@@ -315,6 +318,9 @@ struct fuse_file {
 
 	/** List of requests that may be killed **/
 	struct list_head revoke_list;
+
+	/** fud->write_seq when the FUSE_OPEN reply was received */
+	u64 open_seq;
 };
 
 /** FUSE file states (ff_state) */
@@ -379,6 +385,9 @@ struct fuse_args {
 
 	/** Fuse file used in the request or NULL*/
 	struct fuse_file *ff;
+
+	/** fud->write_seq when the reply was received */
+	u64 reply_seq;
 };
 
 struct fuse_args_pages {
@@ -623,6 +632,9 @@ struct fuse_dev {
 
 	/** list entry on fc->devices */
 	struct list_head entry;
+
+	/** Monotonic sequence number, bumped on every reply/notification write */
+	u64 write_seq;
 };
 
 enum fuse_dax_mode {
@@ -1602,7 +1614,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
  * File-system tells the kernel to invalidate all fuse-files (and cache)
  * for the given node id.
  */
-int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid);
+int fuse_invalidate_files(struct fuse_dev *fud, u64 nodeid);
 
 int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
 		 bool isdir);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1e0c86f4b37a2..ea158367a1efd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,6 +118,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->submount_lookup = NULL;
 	fi->i_size_unstable = 0;
 	fi->private = NULL;
+	fi->inval_files_seq = 0;
 	INIT_LIST_HEAD(&fi->rw_files);
 	INIT_LIST_HEAD(&fi->inval_files_entry);
 	mutex_init(&fi->mutex);
@@ -424,6 +425,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	fi->num_openers = 0;
+	fi->inval_files_seq = 0;
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
 	inode_set_mtime(inode, attr->mtime, attr->mtimensec);
@@ -623,7 +625,7 @@ static void fuse_inval_files_work(struct work_struct *w)
 
 		fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
 		list_del(&fi->inval_files_entry);
-		fuse_ktrace(fc, "invalidate_file on [%llu] starts", fi->nodeid);
+		fuse_ktrace(fc, "invalidate_files on [%llu] starts", fi->nodeid);
 
 		spin_lock(&fi->lock);
 		list_for_each_entry(ff, &fi->rw_files, rw_entry)
@@ -640,15 +642,16 @@ static void fuse_inval_files_work(struct work_struct *w)
 		wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
 		spin_unlock(&fi->lock);
 
-		fuse_ktrace(fc, "invalidate_file on [%llu] ends", fi->nodeid);
+		fuse_ktrace(fc, "invalidate_files on [%llu] ends", fi->nodeid);
 		iput(&fi->inode);
 	}
 
 	fuse_drop_waiting(fc);
 }
 
-int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
+int fuse_invalidate_files(struct fuse_dev *fud, u64 nodeid)
 {
+	struct fuse_conn *fc = fud->fc;
 	struct inode *inode;
 	struct fuse_inode *fi;
 	struct fuse_file *ff;
@@ -666,19 +669,20 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 
 	fi = get_fuse_inode(inode);
 
-	/* Mark that invalidate files is in progress */
+	/*
+	 * Save this notification's write_seq and fail every fuse_file
+	 * currently linked on the inode.
+	 */
 	spin_lock(&fi->lock);
+	fi->inval_files_seq = fud->write_seq;
+	list_for_each_entry(ff, &fi->rw_files, rw_entry)
+		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
 	if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
 		spin_unlock(&fi->lock);
 		iput(inode);
 		return 0;
 	}
 	set_bit(FUSE_I_INVAL_FILES, &fi->state);
-	list_for_each_entry(ff, &fi->rw_files, rw_entry) {
-		spin_lock(&ff->lock);
-		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
-		spin_unlock(&ff->lock);
-	}
 	fi->i_size_unstable = 1;
 	spin_unlock(&fi->lock);
 
@@ -1921,6 +1925,7 @@ struct fuse_dev *fuse_dev_alloc(void)
 
 	fud->pq.processing = pq;
 	fuse_pqueue_init(&fud->pq);
+	fud->write_seq = 0;
 
 	return fud;
 }


More information about the Devel mailing list