[Devel] [PATCH VZ10 v3] fs/fuse: revamp fuse_invalidate_files() to avoid blocking the userspace evloop

Alexey Kuznetsov kuznet at virtuozzo.com
Fri Apr 24 13:16:10 MSK 2026


Ack

On Mon, Apr 20, 2026 at 12:11 PM Liu Kui <kui.liu at virtuozzo.com> wrote:
>
> On large files, fuse_invalidate_files() can take a very long time to complete.
> This is caused by two slow operations that cannot be optimized:
>  - filemap_write_and_wait() when the file is under heavy write load, and
>  - invalidate_inode_pages2() when the page cache is heavily populated.
>
> These long delays block the userspace evloop (which must not be blocked) and
> can trigger a shaman reboot in the worst case.
>
> To fix this, the following changes are made:
>
> 1. Move the slow cache invalidation work into a dedicated kernel workqueue
>    item and replace filemap_write_and_wait() + invalidate_inode_pages2() with
>    truncate_pagecache_range() to simplify cache invalidation.
>
> 2. In fuse_invalidate_files(), only set the FUSE_I_INVAL_FILES bit in fi->state
>    and schedule the invalidation work for the fuse_inode.
>
> 3. Block new opens of the file while the FUSE_I_INVAL_FILES bit is set.
>    The bit is cleared only after the file has been fully invalidated.
>    This is necessary because userspace views the file as fully invalidated
>    as soon as fuse_invalidate_files() returns.
>
> Additionally, make the fuse trace function available in the fuse module so
> that fuse_invalidate_files() events can be traced and logged.
>
> Related to
> https://virtuozzo.atlassian.net/browse/VSTOR-124254
>
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
>  fs/fuse/dev.c                      |  2 +-
>  fs/fuse/file.c                     | 40 ++++++++-----
>  fs/fuse/fuse_i.h                   | 21 ++++++-
>  fs/fuse/inode.c                    | 91 +++++++++++++++++++++++-------
>  fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 32 +++++++++--
>  5 files changed, 144 insertions(+), 42 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index c1102069d032..4fcfd644dcf6 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -110,7 +110,7 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
>         return !fc->initialized || (for_background && fc->blocked);
>  }
>
> -static void fuse_drop_waiting(struct fuse_conn *fc)
> +void fuse_drop_waiting(struct fuse_conn *fc)
>  {
>         /*
>          * lockess check of fc->connected is okay, because atomic_dec_and_test()
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 0860996c19ad..11fb3996a2ac 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -252,10 +252,11 @@ static void fuse_link_rw_file(struct file *file)
>         struct fuse_file *ff = file->private_data;
>
>         spin_lock(&fi->lock);
> -       if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> +       if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
>                 spin_lock(&ff->lock);
>                 set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
>                 spin_unlock(&ff->lock);
> +               fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff->fh, ff->nodeid);
>         }
>         if (list_empty(&ff->rw_entry))
>                 list_add(&ff->rw_entry, &fi->rw_files);
> @@ -319,6 +320,13 @@ static int fuse_open(struct inode *inode, struct file *file)
>         if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
>                 return -EINVAL;
>
> +       if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
> +               fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
> +               err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
> +               if (err)
> +                       return err;
> +       }
> +
>         err = generic_file_open(inode, file);
>         if (err)
>                 return err;
> @@ -361,8 +369,6 @@ static int fuse_open(struct inode *inode, struct file *file)
>                 inode_unlock(inode);
>
>         if (!err && fc->close_wait) {
> -               struct fuse_inode *fi = get_fuse_inode(inode);
> -
>                 inode_lock(inode);
>                 spin_lock(&fi->lock);
>
> @@ -1409,6 +1415,12 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
>                         return err;
>         }
>
> +       /*
> +        * Block read if the file had been invalidated.
> +        */
> +       if (fuse_file_fail_immediately(iocb->ki_filp->private_data))
> +               return -EIO;
> +
>         return generic_file_read_iter(iocb, to);
>  }
>
> @@ -1794,6 +1806,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                         goto writethrough;
>                 }
>
> +               /*
> +                * Block write if the file had been invalidated.
> +                */
> +               if (fuse_file_fail_immediately(file->private_data))
> +                       return -EIO;
> +
>                 return generic_file_write_iter(iocb, from);
>         }
>
> @@ -2488,8 +2506,12 @@ static int fuse_writepage_locked(struct folio *folio)
>         struct fuse_args_pages *ap;
>         struct folio *tmp_folio;
>         struct fuse_file *ff;
> -       int error = -ENOMEM;
> +       int error = -EIO;
> +
> +       if (test_bit(FUSE_I_INVAL_FILES, &fi->state))
> +               goto err;
>
> +       error = -ENOMEM;
>         tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
>         if (!tmp_folio)
>                 goto err;
> @@ -2704,13 +2726,9 @@ static int fuse_writepages_fill(struct folio *folio,
>
>         BUG_ON(wpa && !data->ff);
>
> -       /* More than optimization: writeback pages to /dev/null; fused would
> -        * drop our FUSE_WRITE requests anyway, but it will be blocked while
> -        * sending NOTIFY_INVAL_FILES until we return!
> -        */
>         if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
>                 unlock_page(&folio->page);
> -               return 0;
> +               return -EIO;
>         }
>
>         if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
> @@ -2936,10 +2954,6 @@ static int fuse_launder_folio(struct folio *folio)
>                 /* Serialize with pending writeback for the same page */
>                 fuse_wait_on_page_writeback(inode, folio->index);
>
> -               /* Return success if FUSE_NOTIFY_INVAL_FILES is in progress */
> -               if (test_bit(FUSE_I_INVAL_FILES, &get_fuse_inode(inode)->state))
> -                       return 0;
> -
>                 err = fuse_writepage_locked(folio);
>                 if (!err)
>                         fuse_wait_on_page_writeback(inode, folio->index);
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 853bf12e282d..966c8e6c2ab7 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -215,6 +215,9 @@ struct fuse_inode {
>                 atomic_t read_count;
>                 atomic_t write_count;
>         } dio;
> +
> +       /** Entry on fc->inval_files_list list */
> +       struct list_head inval_files_entry;
>  };
>
>  /** FUSE inode state bits */
> @@ -1110,7 +1113,13 @@ struct fuse_conn {
>         } kio;
>
>         int ktrace_level;
> -       struct fuse_ktrace * ktrace;
> +       struct fuse_ktrace *ktrace;
> +       void (*fuse_ktrace_fn)(struct fuse_conn *fc, const char *fmt, ...);
> +
> +       /* List of fuse_inodes to be invalidated by userspace */
> +       struct list_head inval_files_list;
> +       struct delayed_work inval_files_work;
> +
>         struct dentry *conn_ctl;
>
>         /* New writepages go into this bucket */
> @@ -1122,6 +1131,13 @@ struct fuse_conn {
>  #endif
>  };
>
> +#define fuse_ktrace(fc, fmt, args...)  \
> +do { \
> +       struct fuse_conn *__fc = (fc); \
> +       if (__fc->fuse_ktrace_fn) \
> +               __fc->fuse_ktrace_fn(__fc, "%s: " fmt, __func__, ## args); \
> +} while (0)
> +
>  /*
>   * Represents a mounted filesystem, potentially a submount.
>   *
> @@ -1552,7 +1568,7 @@ static inline void fuse_dio_wait(struct fuse_inode *fi)
>
>  static inline bool fuse_file_fail_immediately(struct fuse_file *ff)
>  {
> -       return ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> +       return unlikely(ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
>  }
>
>  /**
> @@ -1717,6 +1733,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
>
>  struct fuse_kio_ops *fuse_kio_get(struct fuse_conn *fc, char *name);
>  void fuse_kio_put(struct fuse_kio_ops *ops);
> +void fuse_drop_waiting(struct fuse_conn *fc);
>
>  /* passthrough.c */
>  static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index f167d275885b..d627302da0b4 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -35,6 +35,8 @@ struct list_head fuse_conn_list;
>  DEFINE_MUTEX(fuse_mutex);
>  EXPORT_SYMBOL_GPL(fuse_mutex);
>
> +struct workqueue_struct *fuse_inval_files_wq;
> +
>  static int fuse_ve_odirect;
>
>  static int set_global_limit(const char *val, const struct kernel_param *kp);
> @@ -117,6 +119,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>         fi->i_size_unstable = 0;
>         fi->private = NULL;
>         INIT_LIST_HEAD(&fi->rw_files);
> +       INIT_LIST_HEAD(&fi->inval_files_entry);
>         mutex_init(&fi->mutex);
>         spin_lock_init(&fi->lock);
>         init_waitqueue_head(&fi->dio.waitq);
> @@ -603,12 +606,55 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
>                 mutex_unlock(&get_fuse_inode(inode)->mutex);
>  }
>
> +static void fuse_inval_files_work(struct work_struct *w)
> +{
> +       struct fuse_conn *fc = container_of(w, struct fuse_conn, inval_files_work.work);
> +       struct list_head inval_files_list;
> +       struct fuse_file *ff;
> +       struct fuse_inode *fi;
> +
> +       INIT_LIST_HEAD(&inval_files_list);
> +
> +       spin_lock(&fc->lock);
> +       list_splice_init(&fc->inval_files_list, &inval_files_list);
> +       spin_unlock(&fc->lock);
> +
> +       while (!list_empty(&inval_files_list)) {
> +               u64 nodeid;
> +
> +               fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
> +               list_del(&fi->inval_files_entry);
> +               nodeid = get_node_id(&fi->inode) - FUSE_ROOT_ID;
> +               fuse_ktrace(fc, "invalidate_file on [%llu] starts", nodeid);
> +
> +               spin_lock(&fi->lock);
> +               list_for_each_entry(ff, &fi->rw_files, rw_entry)
> +                       fuse_revoke_readpages(ff);
> +               spin_unlock(&fi->lock);
> +
> +               wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
> +
> +               truncate_pagecache_range(&fi->inode, 0, -1);
> +               fuse_invalidate_attr(&fi->inode);
> +
> +               spin_lock(&fi->lock);
> +               clear_bit(FUSE_I_INVAL_FILES, &fi->state);
> +               wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
> +               spin_unlock(&fi->lock);
> +
> +               fuse_ktrace(fc, "invalidate_file on [%llu] ends", nodeid);
> +               iput(&fi->inode);
> +       }
> +
> +       fuse_drop_waiting(fc);
> +}
> +
>  int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>  {
>         struct inode *inode;
>         struct fuse_inode *fi;
>         struct fuse_file *ff;
> -       int err, i;
> +       int i;
>
>         if (!fc->async_read) {
>                 printk(KERN_ERR "Turn async_read ON to use "
> @@ -624,6 +670,11 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>
>         /* Mark that invalidate files is in progress */
>         spin_lock(&fi->lock);
> +       if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> +               spin_unlock(&fi->lock);
> +               iput(inode);
> +               return 0;
> +       }
>         set_bit(FUSE_I_INVAL_FILES, &fi->state);
>         list_for_each_entry(ff, &fi->rw_files, rw_entry) {
>                 spin_lock(&ff->lock);
> @@ -638,23 +689,14 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>         for (i = 0; i < FUSE_QHASH_SIZE; i++)
>                 wake_up_all(&fc->qhash[i].waitq);
>
> -       err = filemap_write_and_wait(inode->i_mapping);
> -       if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
> -               spin_lock(&fi->lock);
> -               list_for_each_entry(ff, &fi->rw_files, rw_entry)
> -                       fuse_revoke_readpages(ff);
> -               spin_unlock(&fi->lock);
> -
> -               wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
> -               err = invalidate_inode_pages2(inode->i_mapping);
> -       }
> -
> -       if (!err)
> -               fuse_invalidate_attr(inode);
> +       atomic_inc(&fc->num_waiting);
> +       spin_lock(&fc->lock);
> +       list_add_tail(&fi->inval_files_entry, &fc->inval_files_list);
> +       spin_unlock(&fc->lock);
> +       if (!queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 0))
> +               fuse_drop_waiting(fc);
>
> -       clear_bit(FUSE_I_INVAL_FILES, &fi->state);
> -       iput(inode);
> -       return err;
> +       return 0;
>  }
>
>  static void fuse_umount_begin(struct super_block *sb)
> @@ -1308,6 +1350,9 @@ int fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
>         if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
>                 fuse_backing_files_init(fc);
>
> +       INIT_LIST_HEAD(&fc->inval_files_list);
> +       INIT_DELAYED_WORK(&fc->inval_files_work, fuse_inval_files_work);
> +
>         INIT_LIST_HEAD(&fc->mounts);
>         list_add(&fm->fc_entry, &fc->mounts);
>         fm->fc = fc;
> @@ -2454,15 +2499,18 @@ static void fuse_inode_init_once(void *foo)
>
>  static int __init fuse_fs_init(void)
>  {
> -       int err;
> +       int err = -ENOMEM;
> +
> +       fuse_inval_files_wq = alloc_workqueue("fuse_inval_files_wq", WQ_MEM_RECLAIM, 1);
> +       if (!fuse_inval_files_wq)
> +               goto out;
>
>         fuse_inode_cachep = kmem_cache_create("fuse_inode",
>                         sizeof(struct fuse_inode), 0,
>                         SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
>                         fuse_inode_init_once);
> -       err = -ENOMEM;
>         if (!fuse_inode_cachep)
> -               goto out;
> +               goto out1;
>
>         err = register_fuseblk();
>         if (err)
> @@ -2478,6 +2526,8 @@ static int __init fuse_fs_init(void)
>         unregister_fuseblk();
>   out2:
>         kmem_cache_destroy(fuse_inode_cachep);
> + out1:
> +       destroy_workqueue(fuse_inval_files_wq);
>   out:
>         return err;
>  }
> @@ -2493,6 +2543,7 @@ static void fuse_fs_cleanup(void)
>          */
>         rcu_barrier();
>         kmem_cache_destroy(fuse_inode_cachep);
> +       destroy_workqueue(fuse_inval_files_wq);
>  }
>
>  static struct kobject *fuse_kobj;
> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> index eafe2ee2313b..42cdca250cd9 100644
> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> @@ -158,6 +158,7 @@ MODULE_PARM_DESC(rdmaio_io_failing, "Enable/Disbale RDMA io failing");
>
>  static int fuse_ktrace_setup(struct fuse_conn * fc);
>  static int fuse_ktrace_remove(struct fuse_conn *fc);
> +static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...);
>
>  static struct kmem_cache *pcs_fuse_req_cachep;
>  static struct kmem_cache *pcs_ireq_cachep;
> @@ -1672,6 +1673,8 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
>                 goto err;
>         }
>
> +       fc->fuse_ktrace_fn = kfuse_trace;
> +
>         return 0;
>
>  err:
> @@ -1680,22 +1683,19 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
>         return ret;
>  }
>
> -void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ...)
> +static void kfuse_tracer(struct fuse_conn *fc, unsigned long ip, const char *fmt, va_list va)
>  {
> -       struct fuse_ktrace * tr;
> -        va_list va;
> +       struct fuse_ktrace *tr;
>         int cpu;
>
>         cpu = get_cpu();
>         tr = fc->ktrace;
>         if (tr) {
>                 u8 * buf = per_cpu_ptr(tr->buf, cpu);
> -               struct fuse_trace_hdr * t;
> +               struct fuse_trace_hdr *t;
>                 int len;
>
> -               va_start(va, fmt);
>                 len = vsnprintf(buf, KTRACE_LOG_BUF_SIZE, fmt, va);
> -               va_end(va);
>                 t = fuse_trace_prepare(tr, FUSE_KTRACE_STRING, len + 1);
>                 if (t)
>                         memcpy(t + 1, buf, len + 1);
> @@ -1710,6 +1710,26 @@ void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ..
>         put_cpu();
>  }
>
> +void __kfuse_trace(struct fuse_conn *fc, unsigned long ip, const char *fmt, ...)
> +{
> +       va_list va;
> +
> +       va_start(va, fmt);
> +       kfuse_tracer(fc, ip, fmt, va);
> +       va_end(va);
> +}
> +
> +static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...)
> +{
> +       va_list va;
> +
> +       if (fc->ktrace_level >= LOG_TRACE) {
> +               va_start(va, fmt);
> +               kfuse_tracer(fc, 0, fmt, va);
> +               va_end(va);
> +       }
> +}
> +
>  void pcs_kio_file_list(struct fuse_conn *fc, kio_file_itr kfile_cb, void *ctx)
>  {
>         struct fuse_file *ff;
> --
> 2.39.5 (Apple Git-154)



More information about the Devel mailing list