[Devel] [PATCH VZ10] fs/fuse: order FUSE_OPEN reply against FUSE_NOTIFY_INVAL_FILES
Alexey Kuznetsov
kuznet at virtuozzo.com
Fri Jun 5 16:24:37 MSK 2026
Ack.
This solution is _not_ nice, but we were not able to find anything
better while brain storming the problem.
We have alternate solutions but all they are more expensive from
maintanance viewpoint.
On Fri, Jun 5, 2026 at 5:26 PM Liu Kui <kui.liu at virtuozzo.com> wrote:
>
> The userspace daemon's send order of a FUSE_OPEN reply and a
> FUSE_NOTIFY_INVAL_FILES notification can be reversed by the time the
> kernel processes them. The kernel must distinguish the two cases:
>
> - reply sent before the notification: the open is stale and must
> be flagged FUSE_S_FAIL_IMMEDIATELY;
> - reply sent after the notification: the open is valid but must
> wait for fuse_inval_files_work() to complete before returning,
> or the caller hits IO errors on an inode still being invalidated.
>
> The FUSE_I_INVAL_FILES bit carries no ordering and cannot tell these
> apart. Add a per-fuse_dev write_counter bumped on every reply/
> notification write, and saved to ff->open_ts on OPEN replies and
> fi->inval_file_ts on FUSE_NOTIFY_INVAL_FILES, and compare them in
> fuse_link_rw_file() to flag stale opens. fuse_finish_open() then
> waits on FUSE_I_INVAL_FILES for the remaining case.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-133093
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
> fs/fuse/dev.c | 17 ++++++++++----
> fs/fuse/file.c | 60 +++++++++++++++++++++++++++++++++++-------------
> fs/fuse/fuse_i.h | 14 ++++++++++-
> fs/fuse/inode.c | 25 ++++++++++++--------
> 4 files changed, 85 insertions(+), 31 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 4fcfd644dcf6..873d01163ead 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -2003,9 +2003,10 @@ static int fuse_notify_resend(struct fuse_conn *fc)
> return 0;
> }
>
> -static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
> +static int fuse_notify_inval_files(struct fuse_dev *fud, unsigned int size,
> struct fuse_copy_state *cs)
> {
> + struct fuse_conn *fc = fud->fc;
> struct fuse_notify_inval_files_out outarg;
> int err = -EINVAL;
>
> @@ -2019,7 +2020,7 @@ static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
>
> down_read(&fc->killsb);
>
> - err = fuse_invalidate_files(fc, outarg.ino);
> + err = fuse_invalidate_files(fud, outarg.ino);
>
> up_read(&fc->killsb);
> return err;
> @@ -2029,9 +2030,11 @@ static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
> return err;
> }
>
> -static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
> +static int fuse_notify(struct fuse_dev *fud, enum fuse_notify_code code,
> unsigned int size, struct fuse_copy_state *cs)
> {
> + struct fuse_conn *fc = fud->fc;
> +
> /* Don't try to move pages (yet) */
> cs->move_pages = 0;
>
> @@ -2062,7 +2065,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
> return fuse_notify_resend(fc);
>
> case FUSE_NOTIFY_INVAL_FILES:
> - return fuse_notify_inval_files(fc, size, cs);
> + return fuse_notify_inval_files(fud, size, cs);
>
> default:
> fuse_copy_finish(cs);
> @@ -2375,6 +2378,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
> struct fuse_req *req;
> struct fuse_out_header oh;
>
> + /* Monotonic timestamp matching the daemon's reply/notification send order. */
> + fud->write_counter++;
> +
> err = -EINVAL;
> if (nbytes < sizeof(struct fuse_out_header))
> goto out;
> @@ -2392,7 +2398,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
> * and error contains notification code.
> */
> if (!oh.unique) {
> - err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
> + err = fuse_notify(fud, oh.error, nbytes - sizeof(oh), cs);
> goto out;
> }
>
> @@ -2462,6 +2468,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
> list_del_init(&req->list);
> spin_unlock(&fpq->lock);
>
> + req->args->reply_ts = fud->write_counter;
> fuse_request_end(req);
> out:
> return err ? err : nbytes;
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index edea59ee4839..206e24a5dbce 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -45,10 +45,11 @@ static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
>
> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
> unsigned int open_flags, int opcode,
> - struct fuse_open_out *outargp)
> + struct fuse_open_out *outargp, uint64_t *reply_ts)
> {
> struct fuse_open_in inarg;
> FUSE_ARGS(args);
> + int err;
>
> memset(&inarg, 0, sizeof(inarg));
> inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
> @@ -69,7 +70,14 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
> args.out_args[0].size = sizeof(*outargp);
> args.out_args[0].value = outargp;
>
> - return fuse_simple_request(fm, &args);
> + err = fuse_simple_request(fm, &args);
> + /*
> + * Propagate the dev write timestamp so fuse_link_rw_file() can order
> + * this open against any concurrent FUSE_NOTIFY_INVAL_FILES.
> + */
> + *reply_ts = args.reply_ts;
> +
> + return err;
> }
>
> struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
> @@ -192,7 +200,7 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
> struct fuse_open_out *outargp = &ff->args->open_outarg;
> int err;
>
> - err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
> + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp, &ff->open_ts);
> if (!err) {
> ff->fh = outargp->fh;
> ff->open_flags = outargp->open_flags;
> @@ -245,6 +253,18 @@ static void fuse_link_write_file(struct file *file)
> spin_unlock(&fi->lock);
> }
>
> +static int fuse_wait_on_inval_files(struct inode *inode)
> +{
> + struct fuse_inode *fi = get_fuse_inode(inode);
> + struct fuse_conn *fc = get_fuse_mount(inode)->fc;
> +
> + if (likely(!test_bit(FUSE_I_INVAL_FILES, &fi->state)))
> + return 0;
> +
> + fuse_ktrace(fc, "waiting for invalidate_files on [%llu] to complete", fi->nodeid);
> + return wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
> +}
> +
> static void fuse_link_rw_file(struct file *file)
> {
> struct inode *inode = file_inode(file);
> @@ -252,14 +272,17 @@ static void fuse_link_rw_file(struct file *file)
> struct fuse_file *ff = file->private_data;
>
> spin_lock(&fi->lock);
> - if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
> - spin_lock(&ff->lock);
> - set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> - spin_unlock(&ff->lock);
> - fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff->fh, ff->nodeid);
> - }
> if (list_empty(&ff->rw_entry))
> list_add(&ff->rw_entry, &fi->rw_files);
> +
> + /*
> + * If this open reply was received before the last
> + * FUSE_NOTIFY_INVAL_FILES on this inode, the open is
> + * stale, fail it immediately.
> + */
> + if (ff->open_ts < fi->inval_files_ts)
> + set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> +
> spin_unlock(&fi->lock);
> }
>
> @@ -287,7 +310,15 @@ int fuse_finish_open(struct inode *inode, struct file *file)
>
> fuse_link_rw_file(file);
>
> - return 0;
> + /*
> + * Wait for fuse_inval_files_work() on this inode to complete
> + * before returning, otherwise the caller can hit IO errors
> + * on an inode still being invalidated.
> + */
> + if (!test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
> + err = fuse_wait_on_inval_files(inode);
> +
> + return err;
> }
>
> static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
> @@ -320,12 +351,9 @@ static int fuse_open(struct inode *inode, struct file *file)
> if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
> return -EINVAL;
>
> - if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
> - fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
> - err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
> - if (err)
> - return err;
> - }
> + err = fuse_wait_on_inval_files(inode);
> + if (err)
> + return err;
>
> err = generic_file_open(inode, file);
> if (err)
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 966c8e6c2ab7..925233427339 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -218,6 +218,9 @@ struct fuse_inode {
>
> /** Entry on fc->inval_files_list list */
> struct list_head inval_files_entry;
> +
> + /** fud->write_counter when the last FUSE_NOTIFY_INVAL_FILES notification was received */
> + uint64_t inval_files_ts;
> };
>
> /** FUSE inode state bits */
> @@ -315,6 +318,9 @@ struct fuse_file {
>
> /** List of requests that may be killed **/
> struct list_head revoke_list;
> +
> + /** fud->write_counter when the FUSE_OPEN reply was received */
> + uint64_t open_ts;
> };
>
> /** FUSE file states (ff_state) */
> @@ -379,6 +385,9 @@ struct fuse_args {
>
> /** Fuse file used in the request or NULL*/
> struct fuse_file *ff;
> +
> + /** fud->write_counter when the reply was received */
> + uint64_t reply_ts;
> };
>
> struct fuse_args_pages {
> @@ -625,6 +634,9 @@ struct fuse_dev {
>
> /** list entry on fc->devices */
> struct list_head entry;
> +
> + /** Monotonic timestamp, bumped on every reply/notification write */
> + uint64_t write_counter;
> };
>
> enum fuse_dax_mode {
> @@ -1604,7 +1616,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
> * File-system tells the kernel to invalidate all fuse-files (and cache)
> * for the given node id.
> */
> -int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid);
> +int fuse_invalidate_files(struct fuse_dev *fud, u64 nodeid);
>
> int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
> bool isdir);
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 1e0c86f4b37a..dc603c649eef 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -118,6 +118,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
> fi->submount_lookup = NULL;
> fi->i_size_unstable = 0;
> fi->private = NULL;
> + fi->inval_files_ts = 0;
> INIT_LIST_HEAD(&fi->rw_files);
> INIT_LIST_HEAD(&fi->inval_files_entry);
> mutex_init(&fi->mutex);
> @@ -623,7 +624,7 @@ static void fuse_inval_files_work(struct work_struct *w)
>
> fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
> list_del(&fi->inval_files_entry);
> - fuse_ktrace(fc, "invalidate_file on [%llu] starts", fi->nodeid);
> + fuse_ktrace(fc, "invalidate_files on [%llu] starts", fi->nodeid);
>
> spin_lock(&fi->lock);
> list_for_each_entry(ff, &fi->rw_files, rw_entry)
> @@ -640,15 +641,16 @@ static void fuse_inval_files_work(struct work_struct *w)
> wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
> spin_unlock(&fi->lock);
>
> - fuse_ktrace(fc, "invalidate_file on [%llu] ends", fi->nodeid);
> + fuse_ktrace(fc, "invalidate_files on [%llu] ends", fi->nodeid);
> iput(&fi->inode);
> }
>
> fuse_drop_waiting(fc);
> }
>
> -int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
> +int fuse_invalidate_files(struct fuse_dev *fud, u64 nodeid)
> {
> + struct fuse_conn *fc = fud->fc;
> struct inode *inode;
> struct fuse_inode *fi;
> struct fuse_file *ff;
> @@ -666,19 +668,23 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>
> fi = get_fuse_inode(inode);
>
> - /* Mark that invalidate files is in progress */
> + /*
> + * Save this notification's timestamp and fail every fuse_file
> + * currently linked on the inode.
> + */
> spin_lock(&fi->lock);
> + fi->inval_files_ts = fud->write_counter;
> + list_for_each_entry(ff, &fi->rw_files, rw_entry) {
> + spin_lock(&ff->lock);
> + set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> + spin_unlock(&ff->lock);
> + }
> if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> spin_unlock(&fi->lock);
> iput(inode);
> return 0;
> }
> set_bit(FUSE_I_INVAL_FILES, &fi->state);
> - list_for_each_entry(ff, &fi->rw_files, rw_entry) {
> - spin_lock(&ff->lock);
> - set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> - spin_unlock(&ff->lock);
> - }
> fi->i_size_unstable = 1;
> spin_unlock(&fi->lock);
>
> @@ -1921,6 +1927,7 @@ struct fuse_dev *fuse_dev_alloc(void)
>
> fud->pq.processing = pq;
> fuse_pqueue_init(&fud->pq);
> + fud->write_counter = 0;
>
> return fud;
> }
> --
> 2.50.1 (Apple Git-155)
More information about the Devel
mailing list