[Devel] [PATCH VZ10 v2] fs/fuse: revamp fuse_invalidate_files() to avoid blocking the userspace evloop

Konstantin Khorenko khorenko at virtuozzo.com
Thu Apr 9 12:11:10 MSK 2026


Just to clarify the status: no ack on this version of the patch (so not merging),
next version is pending.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 3/25/26 08:15, Kui Liu wrote:
> changes in v2
> 
> fix all problems found by AI reviews except
> 
> 3. Incorrect nodeid in trace output
> 
>     nodeid = get_node_id(&fi->inode) - FUSE_ROOT_ID;
> 
>     FUSE_ROOT_ID is 1. This subtracts 1 from every nodeid in trace messages, producing wrong values.
> Should just be
>     get_node_id(&fi->inode).
> 
> Using "get_node_id(&fi->inode) - FUSE_ROOT_ID" is intentional to make it align with userspace.
> 
> ------------------------------------------------------------------------------------------------------
> *From:* Liu Kui <kui.liu at virtuozzo.com>
> *Sent:* 25 March 2026 15:12
> *To:* devel at openvz.org <devel at openvz.org>
> *Cc:* Alexey Kuznetsov <kuznet at virtuozzo.com>; Andrey Zaitsev <azaitsev at virtuozzo.com>; Konstantin 
> Khorenko <khorenko at virtuozzo.com>; Kui Liu <kui.liu at virtuozzo.com>
> *Subject:* [PATCH VZ10 v2] fs/fuse: revamp fuse_invalidate_files() to avoid blocking the userspace evloop
> On large files, fuse_invalidate_files() can take a very long time to complete.
> This is caused by two slow operations that cannot be optimized:
>   - filemap_write_and_wait() when the file is under heavy write load, and
>   - invalidate_inode_pages2() when the page cache is heavily populated.
> 
> These long delays block the userspace evloop (which must not be blocked) and
> can trigger a shaman reboot in the worst case.
> 
> To fix this, the following changes are made:
> 
> 1. Move the execution of filemap_write_and_wait() and invalidate_inode_pages2()
>     into a dedicated kernel workqueue item.
> 
> 2. In fuse_invalidate_files(), only set the FUSE_I_INVAL_FILES bit in fi->state
>     and schedule the invalidation work for the fuse_inode.
> 
> 3. Block new opens of the file while the FUSE_I_INVAL_FILES bit is set.
>     The bit is cleared only after the file has been fully invalidated.
>     This is necessary because userspace views the file as fully invalidated
>     as soon as fuse_invalidate_files() returns.
> 
> Additionally, make the fuse trace function available in fuse module so
> that fuse_invalidate_files events can be traced and logged.
> 
> Related to
> https://virtuozzo.atlassian.net/browse/VSTOR-124254 <https://virtuozzo.atlassian.net/browse/VSTOR-124254>
> 
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
>   fs/fuse/dev.c                      |   2 +-
>   fs/fuse/file.c                     |  30 ++++++--
>   fs/fuse/fuse_i.h                   |  22 +++++-
>   fs/fuse/inode.c                    | 114 ++++++++++++++++++++++++-----
>   fs/fuse/kio/pcs/pcs_fuse_kdirect.c |  32 ++++++--
>   5 files changed, 165 insertions(+), 35 deletions(-)
> 
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index c1102069d032..4fcfd644dcf6 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -110,7 +110,7 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
>           return !fc->initialized || (for_background && fc->blocked);
>   }
> 
> -static void fuse_drop_waiting(struct fuse_conn *fc)
> +void fuse_drop_waiting(struct fuse_conn *fc)
>   {
>           /*
>            * lockess check of fc->connected is okay, because atomic_dec_and_test()
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 0860996c19ad..7bebe03dda5b 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -252,10 +252,11 @@ static void fuse_link_rw_file(struct file *file)
>           struct fuse_file *ff = file->private_data;
> 
>           spin_lock(&fi->lock);
> -       if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> +       if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
>                   spin_lock(&ff->lock);
>                   set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
>                   spin_unlock(&ff->lock);
> +               fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff- 
>  >fh, ff->nodeid);
>           }
>           if (list_empty(&ff->rw_entry))
>                   list_add(&ff->rw_entry, &fi->rw_files);
> @@ -319,6 +320,13 @@ static int fuse_open(struct inode *inode, struct file *file)
>           if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
>                   return -EINVAL;
> 
> +       if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
> +               fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
> +               err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
> +               if (err)
> +                       return err;
> +       }
> +
>           err = generic_file_open(inode, file);
>           if (err)
>                   return err;
> @@ -361,8 +369,6 @@ static int fuse_open(struct inode *inode, struct file *file)
>                   inode_unlock(inode);
> 
>           if (!err && fc->close_wait) {
> -               struct fuse_inode *fi = get_fuse_inode(inode);
> -
>                   inode_lock(inode);
>                   spin_lock(&fi->lock);
> 
> @@ -1409,6 +1415,12 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
>                           return err;
>           }
> 
> +       /*
> +        * Block read if the file had been invalidated.
> +        */
> +       if (fuse_file_fail_immediately(iocb->ki_filp->private_data))
> +               return -EIO;
> +
>           return generic_file_read_iter(iocb, to);
>   }
> 
> @@ -1794,6 +1806,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                           goto writethrough;
>                   }
> 
> +               /*
> +                * Block write if the file had been invalidated.
> +                */
> +               if (fuse_file_fail_immediately(file->private_data))
> +                       return -EIO;
> +
>                   return generic_file_write_iter(iocb, from);
>           }
> 
> @@ -2704,13 +2722,9 @@ static int fuse_writepages_fill(struct folio *folio,
> 
>           BUG_ON(wpa && !data->ff);
> 
> -       /* More than optimization: writeback pages to /dev/null; fused would
> -        * drop our FUSE_WRITE requests anyway, but it will be blocked while
> -        * sending NOTIFY_INVAL_FILES until we return!
> -        */
>           if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
>                   unlock_page(&folio->page);
> -               return 0;
> +               return -EIO;
>           }
> 
>           if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 853bf12e282d..35222f48cb5b 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -215,6 +215,9 @@ struct fuse_inode {
>                   atomic_t read_count;
>                   atomic_t write_count;
>           } dio;
> +
> +       /** Entry on fc->inval_files_list list */
> +       struct list_head inval_files_entry;
>   };
> 
>   /** FUSE inode state bits */
> @@ -1110,7 +1113,13 @@ struct fuse_conn {
>           } kio;
> 
>           int ktrace_level;
> -       struct fuse_ktrace * ktrace;
> +       struct fuse_ktrace *ktrace;
> +       void (*fuse_ktrace_fn)(struct fuse_conn *fc, const char *fmt, ...);
> +
> +       /* List of fuse_inodes to be invalidated by userspace */
> +       struct list_head inval_files_list;
> +       struct delayed_work inval_files_work;
> +
>           struct dentry *conn_ctl;
> 
>           /* New writepages go into this bucket */
> @@ -1122,6 +1131,14 @@ struct fuse_conn {
>   #endif
>   };
> 
> +#define fuse_ktrace(fc, fmt, args...) { \
> +       do { \
> +               struct fuse_conn *__fc = (fc); \
> +               if (__fc->fuse_ktrace_fn) \
> +                       __fc->fuse_ktrace_fn(__fc, "%s: " fmt, __func__, ## args); \
> +       } while (0); \
> +}
> +
>   /*
>    * Represents a mounted filesystem, potentially a submount.
>    *
> @@ -1552,7 +1569,7 @@ static inline void fuse_dio_wait(struct fuse_inode *fi)
> 
>   static inline bool fuse_file_fail_immediately(struct fuse_file *ff)
>   {
> -       return ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> +       return unlikely(ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
>   }
> 
>   /**
> @@ -1717,6 +1734,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
> 
>   struct fuse_kio_ops *fuse_kio_get(struct fuse_conn *fc, char *name);
>   void fuse_kio_put(struct fuse_kio_ops *ops);
> +void fuse_drop_waiting(struct fuse_conn *fc);
> 
>   /* passthrough.c */
>   static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index f167d275885b..2e6cf9edb04e 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -35,6 +35,8 @@ struct list_head fuse_conn_list;
>   DEFINE_MUTEX(fuse_mutex);
>   EXPORT_SYMBOL_GPL(fuse_mutex);
> 
> +struct workqueue_struct *fuse_inval_files_wq;
> +
>   static int fuse_ve_odirect;
> 
>   static int set_global_limit(const char *val, const struct kernel_param *kp);
> @@ -117,6 +119,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>           fi->i_size_unstable = 0;
>           fi->private = NULL;
>           INIT_LIST_HEAD(&fi->rw_files);
> +       INIT_LIST_HEAD(&fi->inval_files_entry);
>           mutex_init(&fi->mutex);
>           spin_lock_init(&fi->lock);
>           init_waitqueue_head(&fi->dio.waitq);
> @@ -603,12 +606,81 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
>                   mutex_unlock(&get_fuse_inode(inode)->mutex);
>   }
> 
> +static void fuse_inval_files_work(struct work_struct *w)
> +{
> +       struct fuse_conn *fc = container_of(w, struct fuse_conn, inval_files_work.work);
> +       struct list_head inval_files_list;
> +       struct list_head failed_list;
> +       struct fuse_file *ff;
> +       struct fuse_inode *fi;
> +       bool to_retry;
> +       int err;
> +
> +       INIT_LIST_HEAD(&inval_files_list);
> +       INIT_LIST_HEAD(&failed_list);
> +
> +       spin_lock(&fc->lock);
> +       list_splice_init(&fc->inval_files_list, &inval_files_list);
> +       to_retry = fc->connected;
> +       spin_unlock(&fc->lock);
> +
> +       while (!list_empty(&inval_files_list)) {
> +               u64 nodeid;
> +
> +               fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
> +               list_del(&fi->inval_files_entry);
> +               nodeid = get_node_id(&fi->inode) - FUSE_ROOT_ID;
> +               fuse_ktrace(fc, "invalidate_file on [%llu] starts", nodeid);
> +
> +               err = filemap_write_and_wait(fi->inode.i_mapping);
> +               if (err && err != -EIO && to_retry) {
> +                       fuse_ktrace(fc, "filemap_write_and_wait() on [%llu] returns err=%d", nodeid, err);
> +                       list_add_tail(&fi->inval_files_entry, &failed_list);
> +                       continue;
> +               }
> +
> +               spin_lock(&fi->lock);
> +               list_for_each_entry(ff, &fi->rw_files, rw_entry)
> +                       fuse_revoke_readpages(ff);
> +               spin_unlock(&fi->lock);
> +
> +               wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
> +
> +               err = invalidate_inode_pages2(fi->inode.i_mapping);
> +               if (err && to_retry) {
> +                       fuse_ktrace(fc, "invalidate_inode_pages2() on [%llu] returns err=%d", nodeid, 
> err);
> +                       list_add_tail(&fi->inval_files_entry, &failed_list);
> +                       continue;
> +               }
> +
> +               fuse_invalidate_attr(&fi->inode);
> +
> +               spin_lock(&fi->lock);
> +               clear_bit(FUSE_I_INVAL_FILES, &fi->state);
> +               wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
> +               spin_unlock(&fi->lock);
> +
> +               fuse_ktrace(fc, "invalidate_file on [%llu] ends", nodeid);
> +               iput(&fi->inode);
> +       }
> +
> +       if (!list_empty(&failed_list)) {
> +               spin_lock(&fc->lock);
> +               list_splice_init(&failed_list, &fc->inval_files_list);
> +               spin_unlock(&fc->lock);
> +               if (queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 1))
> +                       return;
> +       }
> +
> +       fuse_drop_waiting(fc);
> +}
> +
>   int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>   {
>           struct inode *inode;
>           struct fuse_inode *fi;
>           struct fuse_file *ff;
> -       int err, i;
> +       int i;
> 
>           if (!fc->async_read) {
>                   printk(KERN_ERR "Turn async_read ON to use "
> @@ -624,6 +696,11 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
> 
>           /* Mark that invalidate files is in progress */
>           spin_lock(&fi->lock);
> +       if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> +               spin_unlock(&fi->lock);
> +               iput(inode);
> +               return 0;
> +       }
>           set_bit(FUSE_I_INVAL_FILES, &fi->state);
>           list_for_each_entry(ff, &fi->rw_files, rw_entry) {
>                   spin_lock(&ff->lock);
> @@ -638,23 +715,14 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>           for (i = 0; i < FUSE_QHASH_SIZE; i++)
>                   wake_up_all(&fc->qhash[i].waitq);
> 
> -       err = filemap_write_and_wait(inode->i_mapping);
> -       if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
> -               spin_lock(&fi->lock);
> -               list_for_each_entry(ff, &fi->rw_files, rw_entry)
> -                       fuse_revoke_readpages(ff);
> -               spin_unlock(&fi->lock);
> -
> -               wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
> -               err = invalidate_inode_pages2(inode->i_mapping);
> -       }
> -
> -       if (!err)
> -               fuse_invalidate_attr(inode);
> +       atomic_inc(&fc->num_waiting);
> +       spin_lock(&fc->lock);
> +       list_add_tail(&fi->inval_files_entry, &fc->inval_files_list);
> +       spin_unlock(&fc->lock);
> +       if (!queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 0))
> +               fuse_drop_waiting(fc);
> 
> -       clear_bit(FUSE_I_INVAL_FILES, &fi->state);
> -       iput(inode);
> -       return err;
> +       return 0;
>   }
> 
>   static void fuse_umount_begin(struct super_block *sb)
> @@ -1308,6 +1376,9 @@ int fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
>           if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
>                   fuse_backing_files_init(fc);
> 
> +       INIT_LIST_HEAD(&fc->inval_files_list);
> +       INIT_DELAYED_WORK(&fc->inval_files_work, fuse_inval_files_work);
> +
>           INIT_LIST_HEAD(&fc->mounts);
>           list_add(&fm->fc_entry, &fc->mounts);
>           fm->fc = fc;
> @@ -2456,13 +2527,17 @@ static int __init fuse_fs_init(void)
>   {
>           int err;
> 
> +       fuse_inval_files_wq = alloc_workqueue("fuse_inval_files_wq", WQ_MEM_RECLAIM, 1);
> +       if (!fuse_inval_files_wq)
> +               goto out;
> +
>           fuse_inode_cachep = kmem_cache_create("fuse_inode",
>                           sizeof(struct fuse_inode), 0,
>                           SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
>                           fuse_inode_init_once);
>           err = -ENOMEM;
>           if (!fuse_inode_cachep)
> -               goto out;
> +               goto out1;
> 
>           err = register_fuseblk();
>           if (err)
> @@ -2478,6 +2553,8 @@ static int __init fuse_fs_init(void)
>           unregister_fuseblk();
>    out2:
>           kmem_cache_destroy(fuse_inode_cachep);
> + out1:
> +       destroy_workqueue(fuse_inval_files_wq);
>    out:
>           return err;
>   }
> @@ -2493,6 +2570,7 @@ static void fuse_fs_cleanup(void)
>            */
>           rcu_barrier();
>           kmem_cache_destroy(fuse_inode_cachep);
> +       destroy_workqueue(fuse_inval_files_wq);
>   }
> 
>   static struct kobject *fuse_kobj;
> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> index eafe2ee2313b..42cdca250cd9 100644
> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> @@ -158,6 +158,7 @@ MODULE_PARM_DESC(rdmaio_io_failing, "Enable/Disbale RDMA io failing");
> 
>   static int fuse_ktrace_setup(struct fuse_conn * fc);
>   static int fuse_ktrace_remove(struct fuse_conn *fc);
> +static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...);
> 
>   static struct kmem_cache *pcs_fuse_req_cachep;
>   static struct kmem_cache *pcs_ireq_cachep;
> @@ -1672,6 +1673,8 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
>                   goto err;
>           }
> 
> +       fc->fuse_ktrace_fn = kfuse_trace;
> +
>           return 0;
> 
>   err:
> @@ -1680,22 +1683,19 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
>           return ret;
>   }
> 
> -void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ...)
> +static void kfuse_tracer(struct fuse_conn *fc, unsigned long ip, const char *fmt, va_list va)
>   {
> -       struct fuse_ktrace * tr;
> -        va_list va;
> +       struct fuse_ktrace *tr;
>           int cpu;
> 
>           cpu = get_cpu();
>           tr = fc->ktrace;
>           if (tr) {
>                   u8 * buf = per_cpu_ptr(tr->buf, cpu);
> -               struct fuse_trace_hdr * t;
> +               struct fuse_trace_hdr *t;
>                   int len;
> 
> -               va_start(va, fmt);
>                   len = vsnprintf(buf, KTRACE_LOG_BUF_SIZE, fmt, va);
> -               va_end(va);
>                   t = fuse_trace_prepare(tr, FUSE_KTRACE_STRING, len + 1);
>                   if (t)
>                           memcpy(t + 1, buf, len + 1);
> @@ -1710,6 +1710,26 @@ void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ..
>           put_cpu();
>   }
> 
> +void __kfuse_trace(struct fuse_conn *fc, unsigned long ip, const char *fmt, ...)
> +{
> +       va_list va;
> +
> +       va_start(va, fmt);
> +       kfuse_tracer(fc, ip, fmt, va);
> +       va_end(va);
> +}
> +
> +static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...)
> +{
> +       va_list va;
> +
> +       if (fc->ktrace_level >= LOG_TRACE) {
> +               va_start(va, fmt);
> +               kfuse_tracer(fc, 0, fmt, va);
> +               va_end(va);
> +       }
> +}
> +
>   void pcs_kio_file_list(struct fuse_conn *fc, kio_file_itr kfile_cb, void *ctx)
>   {
>           struct fuse_file *ff;
> -- 
> 2.39.5 (Apple Git-154)
> 



More information about the Devel mailing list