[Devel] [PATCH VZ10 v3] fs/fuse: revamp fuse_invalidate_files() to avoid blocking the userspace evloop
Alexey Kuznetsov
kuznet at virtuozzo.com
Fri Apr 24 13:16:10 MSK 2026
Ack
On Mon, Apr 20, 2026 at 12:11 PM Liu Kui <kui.liu at virtuozzo.com> wrote:
>
> On large files, fuse_invalidate_files() can take a very long time to complete.
> This is caused by two slow operations that cannot be optimized:
> - filemap_write_and_wait() when the file is under heavy write load, and
> - invalidate_inode_pages2() when the page cache is heavily populated.
>
> These long delays block the userspace evloop (which must not be blocked) and
> can trigger a shaman reboot in the worst case.
>
> To fix this, the following changes are made:
>
> 1. Move the slow cache invalidation work into a dedicated kernel workqueue
> item and replace filemap_write_and_wait() + invalidate_inode_pages2() with
> truncate_pagecache_range() to simplify cache invalidation.
>
> 2. In fuse_invalidate_files(), only set the FUSE_I_INVAL_FILES bit in fi->state
> and schedule the invalidation work for the fuse_inode.
>
> 3. Block new opens of the file while the FUSE_I_INVAL_FILES bit is set.
> The bit is cleared only after the file has been fully invalidated.
> This is necessary because userspace views the file as fully invalidated
> as soon as fuse_invalidate_files() returns.
>
> Additionally, make the fuse trace function available in the fuse module so
> that fuse_invalidate_files() events can be traced and logged.
>
> Related to
> https://virtuozzo.atlassian.net/browse/VSTOR-124254
>
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
> fs/fuse/dev.c | 2 +-
> fs/fuse/file.c | 40 ++++++++-----
> fs/fuse/fuse_i.h | 21 ++++++-
> fs/fuse/inode.c | 91 +++++++++++++++++++++++-------
> fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 32 +++++++++--
> 5 files changed, 144 insertions(+), 42 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index c1102069d032..4fcfd644dcf6 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -110,7 +110,7 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
> return !fc->initialized || (for_background && fc->blocked);
> }
>
> -static void fuse_drop_waiting(struct fuse_conn *fc)
> +void fuse_drop_waiting(struct fuse_conn *fc)
> {
> /*
> * lockess check of fc->connected is okay, because atomic_dec_and_test()
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 0860996c19ad..11fb3996a2ac 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -252,10 +252,11 @@ static void fuse_link_rw_file(struct file *file)
> struct fuse_file *ff = file->private_data;
>
> spin_lock(&fi->lock);
> - if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> + if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
> spin_lock(&ff->lock);
> set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> spin_unlock(&ff->lock);
> + fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff->fh, ff->nodeid);
> }
> if (list_empty(&ff->rw_entry))
> list_add(&ff->rw_entry, &fi->rw_files);
> @@ -319,6 +320,13 @@ static int fuse_open(struct inode *inode, struct file *file)
> if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
> return -EINVAL;
>
> + if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
> + fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
> + err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
> + if (err)
> + return err;
> + }
> +
> err = generic_file_open(inode, file);
> if (err)
> return err;
> @@ -361,8 +369,6 @@ static int fuse_open(struct inode *inode, struct file *file)
> inode_unlock(inode);
>
> if (!err && fc->close_wait) {
> - struct fuse_inode *fi = get_fuse_inode(inode);
> -
> inode_lock(inode);
> spin_lock(&fi->lock);
>
> @@ -1409,6 +1415,12 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
> return err;
> }
>
> + /*
> + * Block read if the file had been invalidated.
> + */
> + if (fuse_file_fail_immediately(iocb->ki_filp->private_data))
> + return -EIO;
> +
> return generic_file_read_iter(iocb, to);
> }
>
> @@ -1794,6 +1806,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
> goto writethrough;
> }
>
> + /*
> + * Block write if the file had been invalidated.
> + */
> + if (fuse_file_fail_immediately(file->private_data))
> + return -EIO;
> +
> return generic_file_write_iter(iocb, from);
> }
>
> @@ -2488,8 +2506,12 @@ static int fuse_writepage_locked(struct folio *folio)
> struct fuse_args_pages *ap;
> struct folio *tmp_folio;
> struct fuse_file *ff;
> - int error = -ENOMEM;
> + int error = -EIO;
> +
> + if (test_bit(FUSE_I_INVAL_FILES, &fi->state))
> + goto err;
>
> + error = -ENOMEM;
> tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
> if (!tmp_folio)
> goto err;
> @@ -2704,13 +2726,9 @@ static int fuse_writepages_fill(struct folio *folio,
>
> BUG_ON(wpa && !data->ff);
>
> - /* More than optimization: writeback pages to /dev/null; fused would
> - * drop our FUSE_WRITE requests anyway, but it will be blocked while
> - * sending NOTIFY_INVAL_FILES until we return!
> - */
> if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> unlock_page(&folio->page);
> - return 0;
> + return -EIO;
> }
>
> if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
> @@ -2936,10 +2954,6 @@ static int fuse_launder_folio(struct folio *folio)
> /* Serialize with pending writeback for the same page */
> fuse_wait_on_page_writeback(inode, folio->index);
>
> - /* Return success if FUSE_NOTIFY_INVAL_FILES is in progress */
> - if (test_bit(FUSE_I_INVAL_FILES, &get_fuse_inode(inode)->state))
> - return 0;
> -
> err = fuse_writepage_locked(folio);
> if (!err)
> fuse_wait_on_page_writeback(inode, folio->index);
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 853bf12e282d..966c8e6c2ab7 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -215,6 +215,9 @@ struct fuse_inode {
> atomic_t read_count;
> atomic_t write_count;
> } dio;
> +
> + /** Entry on fc->inval_files_list list */
> + struct list_head inval_files_entry;
> };
>
> /** FUSE inode state bits */
> @@ -1110,7 +1113,13 @@ struct fuse_conn {
> } kio;
>
> int ktrace_level;
> - struct fuse_ktrace * ktrace;
> + struct fuse_ktrace *ktrace;
> + void (*fuse_ktrace_fn)(struct fuse_conn *fc, const char *fmt, ...);
> +
> + /* List of fuse_inodes to be invalidated by userspace */
> + struct list_head inval_files_list;
> + struct delayed_work inval_files_work;
> +
> struct dentry *conn_ctl;
>
> /* New writepages go into this bucket */
> @@ -1122,6 +1131,13 @@ struct fuse_conn {
> #endif
> };
>
> +#define fuse_ktrace(fc, fmt, args...) \
> +do { \
> + struct fuse_conn *__fc = (fc); \
> + if (__fc->fuse_ktrace_fn) \
> + __fc->fuse_ktrace_fn(__fc, "%s: " fmt, __func__, ## args); \
> +} while (0)
> +
> /*
> * Represents a mounted filesystem, potentially a submount.
> *
> @@ -1552,7 +1568,7 @@ static inline void fuse_dio_wait(struct fuse_inode *fi)
>
> static inline bool fuse_file_fail_immediately(struct fuse_file *ff)
> {
> - return ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
> + return unlikely(ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
> }
>
> /**
> @@ -1717,6 +1733,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
>
> struct fuse_kio_ops *fuse_kio_get(struct fuse_conn *fc, char *name);
> void fuse_kio_put(struct fuse_kio_ops *ops);
> +void fuse_drop_waiting(struct fuse_conn *fc);
>
> /* passthrough.c */
> static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index f167d275885b..d627302da0b4 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -35,6 +35,8 @@ struct list_head fuse_conn_list;
> DEFINE_MUTEX(fuse_mutex);
> EXPORT_SYMBOL_GPL(fuse_mutex);
>
> +struct workqueue_struct *fuse_inval_files_wq;
> +
> static int fuse_ve_odirect;
>
> static int set_global_limit(const char *val, const struct kernel_param *kp);
> @@ -117,6 +119,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
> fi->i_size_unstable = 0;
> fi->private = NULL;
> INIT_LIST_HEAD(&fi->rw_files);
> + INIT_LIST_HEAD(&fi->inval_files_entry);
> mutex_init(&fi->mutex);
> spin_lock_init(&fi->lock);
> init_waitqueue_head(&fi->dio.waitq);
> @@ -603,12 +606,55 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
> mutex_unlock(&get_fuse_inode(inode)->mutex);
> }
>
> +static void fuse_inval_files_work(struct work_struct *w)
> +{
> + struct fuse_conn *fc = container_of(w, struct fuse_conn, inval_files_work.work);
> + struct list_head inval_files_list;
> + struct fuse_file *ff;
> + struct fuse_inode *fi;
> +
> + INIT_LIST_HEAD(&inval_files_list);
> +
> + spin_lock(&fc->lock);
> + list_splice_init(&fc->inval_files_list, &inval_files_list);
> + spin_unlock(&fc->lock);
> +
> + while (!list_empty(&inval_files_list)) {
> + u64 nodeid;
> +
> + fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
> + list_del(&fi->inval_files_entry);
> + nodeid = get_node_id(&fi->inode) - FUSE_ROOT_ID;
> + fuse_ktrace(fc, "invalidate_file on [%llu] starts", nodeid);
> +
> + spin_lock(&fi->lock);
> + list_for_each_entry(ff, &fi->rw_files, rw_entry)
> + fuse_revoke_readpages(ff);
> + spin_unlock(&fi->lock);
> +
> + wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
> +
> + truncate_pagecache_range(&fi->inode, 0, -1);
> + fuse_invalidate_attr(&fi->inode);
> +
> + spin_lock(&fi->lock);
> + clear_bit(FUSE_I_INVAL_FILES, &fi->state);
> + wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
> + spin_unlock(&fi->lock);
> +
> + fuse_ktrace(fc, "invalidate_file on [%llu] ends", nodeid);
> + iput(&fi->inode);
> + }
> +
> + fuse_drop_waiting(fc);
> +}
> +
> int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
> {
> struct inode *inode;
> struct fuse_inode *fi;
> struct fuse_file *ff;
> - int err, i;
> + int i;
>
> if (!fc->async_read) {
> printk(KERN_ERR "Turn async_read ON to use "
> @@ -624,6 +670,11 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
>
> /* Mark that invalidate files is in progress */
> spin_lock(&fi->lock);
> + if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
> + spin_unlock(&fi->lock);
> + iput(inode);
> + return 0;
> + }
> set_bit(FUSE_I_INVAL_FILES, &fi->state);
> list_for_each_entry(ff, &fi->rw_files, rw_entry) {
> spin_lock(&ff->lock);
> @@ -638,23 +689,14 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
> for (i = 0; i < FUSE_QHASH_SIZE; i++)
> wake_up_all(&fc->qhash[i].waitq);
>
> - err = filemap_write_and_wait(inode->i_mapping);
> - if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
> - spin_lock(&fi->lock);
> - list_for_each_entry(ff, &fi->rw_files, rw_entry)
> - fuse_revoke_readpages(ff);
> - spin_unlock(&fi->lock);
> -
> - wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
> - err = invalidate_inode_pages2(inode->i_mapping);
> - }
> -
> - if (!err)
> - fuse_invalidate_attr(inode);
> + atomic_inc(&fc->num_waiting);
> + spin_lock(&fc->lock);
> + list_add_tail(&fi->inval_files_entry, &fc->inval_files_list);
> + spin_unlock(&fc->lock);
> + if (!queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 0))
> + fuse_drop_waiting(fc);
>
> - clear_bit(FUSE_I_INVAL_FILES, &fi->state);
> - iput(inode);
> - return err;
> + return 0;
> }
>
> static void fuse_umount_begin(struct super_block *sb)
> @@ -1308,6 +1350,9 @@ int fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_backing_files_init(fc);
>
> + INIT_LIST_HEAD(&fc->inval_files_list);
> + INIT_DELAYED_WORK(&fc->inval_files_work, fuse_inval_files_work);
> +
> INIT_LIST_HEAD(&fc->mounts);
> list_add(&fm->fc_entry, &fc->mounts);
> fm->fc = fc;
> @@ -2454,15 +2499,18 @@ static void fuse_inode_init_once(void *foo)
>
> static int __init fuse_fs_init(void)
> {
> - int err;
> + int err = -ENOMEM;
> +
> + fuse_inval_files_wq = alloc_workqueue("fuse_inval_files_wq", WQ_MEM_RECLAIM, 1);
> + if (!fuse_inval_files_wq)
> + goto out;
>
> fuse_inode_cachep = kmem_cache_create("fuse_inode",
> sizeof(struct fuse_inode), 0,
> SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
> fuse_inode_init_once);
> - err = -ENOMEM;
> if (!fuse_inode_cachep)
> - goto out;
> + goto out1;
>
> err = register_fuseblk();
> if (err)
> @@ -2478,6 +2526,8 @@ static int __init fuse_fs_init(void)
> unregister_fuseblk();
> out2:
> kmem_cache_destroy(fuse_inode_cachep);
> + out1:
> + destroy_workqueue(fuse_inval_files_wq);
> out:
> return err;
> }
> @@ -2493,6 +2543,7 @@ static void fuse_fs_cleanup(void)
> */
> rcu_barrier();
> kmem_cache_destroy(fuse_inode_cachep);
> + destroy_workqueue(fuse_inval_files_wq);
> }
>
> static struct kobject *fuse_kobj;
> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> index eafe2ee2313b..42cdca250cd9 100644
> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> @@ -158,6 +158,7 @@ MODULE_PARM_DESC(rdmaio_io_failing, "Enable/Disbale RDMA io failing");
>
> static int fuse_ktrace_setup(struct fuse_conn * fc);
> static int fuse_ktrace_remove(struct fuse_conn *fc);
> +static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...);
>
> static struct kmem_cache *pcs_fuse_req_cachep;
> static struct kmem_cache *pcs_ireq_cachep;
> @@ -1672,6 +1673,8 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
> goto err;
> }
>
> + fc->fuse_ktrace_fn = kfuse_trace;
> +
> return 0;
>
> err:
> @@ -1680,22 +1683,19 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
> return ret;
> }
>
> -void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ...)
> +static void kfuse_tracer(struct fuse_conn *fc, unsigned long ip, const char *fmt, va_list va)
> {
> - struct fuse_ktrace * tr;
> - va_list va;
> + struct fuse_ktrace *tr;
> int cpu;
>
> cpu = get_cpu();
> tr = fc->ktrace;
> if (tr) {
> u8 * buf = per_cpu_ptr(tr->buf, cpu);
> - struct fuse_trace_hdr * t;
> + struct fuse_trace_hdr *t;
> int len;
>
> - va_start(va, fmt);
> len = vsnprintf(buf, KTRACE_LOG_BUF_SIZE, fmt, va);
> - va_end(va);
> t = fuse_trace_prepare(tr, FUSE_KTRACE_STRING, len + 1);
> if (t)
> memcpy(t + 1, buf, len + 1);
> @@ -1710,6 +1710,26 @@ void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ..
> put_cpu();
> }
>
> +void __kfuse_trace(struct fuse_conn *fc, unsigned long ip, const char *fmt, ...)
> +{
> + va_list va;
> +
> + va_start(va, fmt);
> + kfuse_tracer(fc, ip, fmt, va);
> + va_end(va);
> +}
> +
> +static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...)
> +{
> + va_list va;
> +
> + if (fc->ktrace_level >= LOG_TRACE) {
> + va_start(va, fmt);
> + kfuse_tracer(fc, 0, fmt, va);
> + va_end(va);
> + }
> +}
> +
> void pcs_kio_file_list(struct fuse_conn *fc, kio_file_itr kfile_cb, void *ctx)
> {
> struct fuse_file *ff;
> --
> 2.39.5 (Apple Git-154)
More information about the Devel
mailing list