[Devel] [PATCH VZ10 v2] fs/fuse: order FUSE_OPEN reply against FUSE_NOTIFY_INVAL_FILES
Liu Kui
kui.liu at virtuozzo.com
Tue Jun 16 19:51:15 MSK 2026
The userspace daemon's send order of a FUSE_OPEN reply and a
FUSE_NOTIFY_INVAL_FILES notification can be reversed by the time the
kernel processes them. The kernel must distinguish the two cases:
- reply sent before the notification: the open is stale and must
be flagged FUSE_S_FAIL_IMMEDIATELY;
- reply sent after the notification: the open is valid but must
wait for fuse_inval_files_work() to complete before returning,
or the caller hits IO errors on an inode still being invalidated.
The FUSE_I_INVAL_FILES bit carries no ordering and cannot tell these
apart. Add a per-fuse_dev write_seq bumped on every reply/notification
write, saved to ff->open_seq on OPEN replies and fi->inval_files_seq
on FUSE_NOTIFY_INVAL_FILES, and compared in fuse_link_rw_file() to
flag stale opens. fuse_finish_open() then waits on FUSE_I_INVAL_FILES
for the remaining case.
https://virtuozzo.atlassian.net/browse/VSTOR-133093
Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
---
Changes in v2:
- Rename write_counter / *_ts fields to write_seq / *_seq throughout
(write_counter -> write_seq, reply_ts -> reply_seq, open_ts ->
open_seq, inval_file_ts -> inval_files_seq). The values are a
logical sequence number, not a wall-clock timestamp; "_seq" is the
kernel-idiomatic name for a Lamport-style monotonic counter. Also
pluralise inval_file_ts -> inval_files_seq to match the surrounding
inval_files_* family.
- Propagate args.reply_seq into ff->open_seq from fuse_create_open()
in dir.c (the create+open path missed it in v1).
- Gate the new fuse_link_rw_file() / fuse_wait_on_inval_files() block
in fuse_finish_open() behind fc->close_wait so only vStorage runs
the new ordering work.
- Also initialise fi->inval_files_seq in fuse_init_inode(), not only
in fuse_alloc_inode().
- Update kernel-doc and inline comments to use "sequence number" /
write_seq wording instead of "timestamp" / "write counter".
fs/fuse/dev.c | 17 +++++++++----
fs/fuse/dir.c | 1 +
fs/fuse/file.c | 65 +++++++++++++++++++++++++++++++++++-------------
fs/fuse/fuse_i.h | 14 ++++++++++-
fs/fuse/inode.c | 26 ++++++++++++-------
5 files changed, 91 insertions(+), 32 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4fcfd644dcf6..3304082a635b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2003,9 +2003,10 @@ static int fuse_notify_resend(struct fuse_conn *fc)
return 0;
}
-static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
+static int fuse_notify_inval_files(struct fuse_dev *fud, unsigned int size,
struct fuse_copy_state *cs)
{
+ struct fuse_conn *fc = fud->fc;
struct fuse_notify_inval_files_out outarg;
int err = -EINVAL;
@@ -2019,7 +2020,7 @@ static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
- err = fuse_invalidate_files(fc, outarg.ino);
+ err = fuse_invalidate_files(fud, outarg.ino);
up_read(&fc->killsb);
return err;
@@ -2029,9 +2030,11 @@ static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
return err;
}
-static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+static int fuse_notify(struct fuse_dev *fud, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
+ struct fuse_conn *fc = fud->fc;
+
/* Don't try to move pages (yet) */
cs->move_pages = 0;
@@ -2062,7 +2065,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
return fuse_notify_resend(fc);
case FUSE_NOTIFY_INVAL_FILES:
- return fuse_notify_inval_files(fc, size, cs);
+ return fuse_notify_inval_files(fud, size, cs);
default:
fuse_copy_finish(cs);
@@ -2375,6 +2378,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ /* Monotonic sequence number matching the daemon's reply/notification send order. */
+ fud->write_seq++;
+
err = -EINVAL;
if (nbytes < sizeof(struct fuse_out_header))
goto out;
@@ -2392,7 +2398,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
* and error contains notification code.
*/
if (!oh.unique) {
- err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
+ err = fuse_notify(fud, oh.error, nbytes - sizeof(oh), cs);
goto out;
}
@@ -2462,6 +2468,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
list_del_init(&req->list);
spin_unlock(&fpq->lock);
+ req->args->reply_seq = fud->write_seq;
fuse_request_end(req);
out:
return err ? err : nbytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 6637914c8f28..45ee4f1e4b68 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -693,6 +693,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
ff->fh = outopenp->fh;
ff->nodeid = outentry.nodeid;
ff->open_flags = outopenp->open_flags;
+ ff->open_seq = args.reply_seq;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, ATTR_TIMEOUT(&outentry), 0);
if (!inode) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index edea59ee4839..504492845159 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -45,10 +45,11 @@ static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
- struct fuse_open_out *outargp)
+ struct fuse_open_out *outargp, uint64_t *reply_seq)
{
struct fuse_open_in inarg;
FUSE_ARGS(args);
+ int err;
memset(&inarg, 0, sizeof(inarg));
inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
@@ -69,7 +70,14 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
args.out_args[0].size = sizeof(*outargp);
args.out_args[0].value = outargp;
- return fuse_simple_request(fm, &args);
+ err = fuse_simple_request(fm, &args);
+ /*
+ * Propagate the dev write_seq so fuse_link_rw_file() can order
+ * this open against any concurrent FUSE_NOTIFY_INVAL_FILES.
+ */
+ *reply_seq = args.reply_seq;
+
+ return err;
}
struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
@@ -192,7 +200,7 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
struct fuse_open_out *outargp = &ff->args->open_outarg;
int err;
- err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
+ err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp, &ff->open_seq);
if (!err) {
ff->fh = outargp->fh;
ff->open_flags = outargp->open_flags;
@@ -245,6 +253,18 @@ static void fuse_link_write_file(struct file *file)
spin_unlock(&fi->lock);
}
+static int fuse_wait_on_inval_files(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_mount(inode)->fc;
+
+ if (likely(!test_bit(FUSE_I_INVAL_FILES, &fi->state)))
+ return 0;
+
+ fuse_ktrace(fc, "waiting for invalidate_files on [%llu] to complete", fi->nodeid);
+ return wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
+}
+
static void fuse_link_rw_file(struct file *file)
{
struct inode *inode = file_inode(file);
@@ -252,14 +272,17 @@ static void fuse_link_rw_file(struct file *file)
struct fuse_file *ff = file->private_data;
spin_lock(&fi->lock);
- if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
- spin_lock(&ff->lock);
- set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
- spin_unlock(&ff->lock);
- fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff->fh, ff->nodeid);
- }
if (list_empty(&ff->rw_entry))
list_add(&ff->rw_entry, &fi->rw_files);
+
+ /*
+ * If this open reply was received before the last
+ * FUSE_NOTIFY_INVAL_FILES on this inode, the open is
+ * stale, fail it immediately.
+ */
+ if (ff->open_seq < fi->inval_files_seq)
+ set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+
spin_unlock(&fi->lock);
}
@@ -285,9 +308,20 @@ int fuse_finish_open(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
- fuse_link_rw_file(file);
+ /* Only apply to vStorage */
+ if (fc->close_wait) {
+ fuse_link_rw_file(file);
- return 0;
+ /*
+ * Wait for fuse_inval_files_work() on this inode to complete
+ * before returning, otherwise the caller can hit IO errors
+ * on an inode still being invalidated.
+ */
+ if (!test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+ err = fuse_wait_on_inval_files(inode);
+ }
+
+ return err;
}
static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
@@ -320,12 +354,9 @@ static int fuse_open(struct inode *inode, struct file *file)
if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
return -EINVAL;
- if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
- fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
- err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
- if (err)
- return err;
- }
+ err = fuse_wait_on_inval_files(inode);
+ if (err)
+ return err;
err = generic_file_open(inode, file);
if (err)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 966c8e6c2ab7..693ec756dcf3 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -218,6 +218,9 @@ struct fuse_inode {
/** Entry on fc->inval_files_list list */
struct list_head inval_files_entry;
+
+ /** fud->write_seq when the last FUSE_NOTIFY_INVAL_FILES notification was received */
+ uint64_t inval_files_seq;
};
/** FUSE inode state bits */
@@ -315,6 +318,9 @@ struct fuse_file {
/** List of requests that may be killed **/
struct list_head revoke_list;
+
+ /** fud->write_seq when the FUSE_OPEN reply was received */
+ uint64_t open_seq;
};
/** FUSE file states (ff_state) */
@@ -379,6 +385,9 @@ struct fuse_args {
/** Fuse file used in the request or NULL*/
struct fuse_file *ff;
+
+ /** fud->write_seq when the reply was received */
+ uint64_t reply_seq;
};
struct fuse_args_pages {
@@ -625,6 +634,9 @@ struct fuse_dev {
/** list entry on fc->devices */
struct list_head entry;
+
+ /** Monotonic sequence number, bumped on every reply/notification write */
+ uint64_t write_seq;
};
enum fuse_dax_mode {
@@ -1604,7 +1616,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
* File-system tells the kernel to invalidate all fuse-files (and cache)
* for the given node id.
*/
-int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid);
+int fuse_invalidate_files(struct fuse_dev *fud, u64 nodeid);
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1e0c86f4b37a..841dfaca96a7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,6 +118,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->submount_lookup = NULL;
fi->i_size_unstable = 0;
fi->private = NULL;
+ fi->inval_files_seq = 0;
INIT_LIST_HEAD(&fi->rw_files);
INIT_LIST_HEAD(&fi->inval_files_entry);
mutex_init(&fi->mutex);
@@ -424,6 +425,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
struct fuse_inode *fi = get_fuse_inode(inode);
fi->num_openers = 0;
+ fi->inval_files_seq = 0;
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
inode_set_mtime(inode, attr->mtime, attr->mtimensec);
@@ -623,7 +625,7 @@ static void fuse_inval_files_work(struct work_struct *w)
fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
list_del(&fi->inval_files_entry);
- fuse_ktrace(fc, "invalidate_file on [%llu] starts", fi->nodeid);
+ fuse_ktrace(fc, "invalidate_files on [%llu] starts", fi->nodeid);
spin_lock(&fi->lock);
list_for_each_entry(ff, &fi->rw_files, rw_entry)
@@ -640,15 +642,16 @@ static void fuse_inval_files_work(struct work_struct *w)
wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
spin_unlock(&fi->lock);
- fuse_ktrace(fc, "invalidate_file on [%llu] ends", fi->nodeid);
+ fuse_ktrace(fc, "invalidate_files on [%llu] ends", fi->nodeid);
iput(&fi->inode);
}
fuse_drop_waiting(fc);
}
-int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
+int fuse_invalidate_files(struct fuse_dev *fud, u64 nodeid)
{
+ struct fuse_conn *fc = fud->fc;
struct inode *inode;
struct fuse_inode *fi;
struct fuse_file *ff;
@@ -666,19 +669,23 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
fi = get_fuse_inode(inode);
- /* Mark that invalidate files is in progress */
+ /*
+ * Save this notification's write_seq and fail every fuse_file
+ * currently linked on the inode.
+ */
spin_lock(&fi->lock);
+ fi->inval_files_seq = fud->write_seq;
+ list_for_each_entry(ff, &fi->rw_files, rw_entry) {
+ spin_lock(&ff->lock);
+ set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+ spin_unlock(&ff->lock);
+ }
if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
spin_unlock(&fi->lock);
iput(inode);
return 0;
}
set_bit(FUSE_I_INVAL_FILES, &fi->state);
- list_for_each_entry(ff, &fi->rw_files, rw_entry) {
- spin_lock(&ff->lock);
- set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
- spin_unlock(&ff->lock);
- }
fi->i_size_unstable = 1;
spin_unlock(&fi->lock);
@@ -1921,6 +1928,7 @@ struct fuse_dev *fuse_dev_alloc(void)
fud->pq.processing = pq;
fuse_pqueue_init(&fud->pq);
+ fud->write_seq = 0;
return fud;
}
--
2.50.1 (Apple Git-155)
More information about the Devel
mailing list