[Devel] [PATCH RHEL10 COMMIT] fs/fuse: revamp fuse_invalidate_files() to avoid blocking the userspace evloop

Konstantin Khorenko khorenko at virtuozzo.com
Fri May 15 18:05:44 MSK 2026


The commit is pushed to "branch-rh10-6.12.0-55.52.1.5.x.vz10-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh10-6.12.0-55.52.1.5.25.vz10
------>
commit 5a19728412f4801b054a07ad2810e4cdb795b10c
Author: Liu Kui <kui.liu at virtuozzo.com>
Date:   Mon Apr 20 11:58:11 2026 +0800

    fs/fuse: revamp fuse_invalidate_files() to avoid blocking the userspace evloop
    
    On large files, fuse_invalidate_files() can take a very long time to complete.
    This is caused by two slow operations that cannot be optimized:
     - filemap_write_and_wait() when the file is under heavy write load, and
     - invalidate_inode_pages2() when the page cache is heavily populated.
    
    These long delays block the userspace evloop (which must not be blocked) and
    can trigger a shaman reboot in the worst case.
    
    To fix this, the following changes are made:
    
    1. Move the slow cache invalidation work into a dedicated kernel workqueue
       item and replace filemap_write_and_wait() + invalidate_inode_pages2() with
       truncate_pagecache_range() to simplify cache invalidation.
    
    2. In fuse_invalidate_files(), only set the FUSE_I_INVAL_FILES bit in fi->state
       and schedule the invalidation work for the fuse_inode.
    
    3. Block new opens of the file while the FUSE_I_INVAL_FILES bit is set.
       The bit is cleared only after the file has been fully invalidated.
       This is necessary because userspace views the file as fully invalidated
       as soon as fuse_invalidate_files() returns.
    
    Additionally, make the fuse trace function available in the fuse module so
    that fuse_invalidate_files events can be traced and logged.
    
    khorenko@ notes: some items to be addressed later:
      1. review all the places and align trace messages to print the correct
         fi->inode value everywhere (with FUSE_ROOT_ID subtracted or not)
      2. consider increasing the max_active value for the fuse_inval_files_wq
         workqueue and possibly marking it WQ_UNBOUND.
    
    Related to
    https://virtuozzo.atlassian.net/browse/VSTOR-124254
    
    Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
    Acked-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
    
    Feature: vStorage
---
 fs/fuse/dev.c                      |  2 +-
 fs/fuse/file.c                     | 40 +++++++++++------
 fs/fuse/fuse_i.h                   | 21 ++++++++-
 fs/fuse/inode.c                    | 91 +++++++++++++++++++++++++++++---------
 fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 32 +++++++++++---
 5 files changed, 144 insertions(+), 42 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c1102069d032d..4fcfd644dcf6b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -110,7 +110,7 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
 	return !fc->initialized || (for_background && fc->blocked);
 }
 
-static void fuse_drop_waiting(struct fuse_conn *fc)
+void fuse_drop_waiting(struct fuse_conn *fc)
 {
 	/*
 	 * lockess check of fc->connected is okay, because atomic_dec_and_test()
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0860996c19ad3..11fb3996a2ac3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -252,10 +252,11 @@ static void fuse_link_rw_file(struct file *file)
 	struct fuse_file *ff = file->private_data;
 
 	spin_lock(&fi->lock);
-	if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
+	if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
 		spin_lock(&ff->lock);
 		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
 		spin_unlock(&ff->lock);
+		fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on [%llu] pending", ff->fh, ff->nodeid);
 	}
 	if (list_empty(&ff->rw_entry))
 		list_add(&ff->rw_entry, &fi->rw_files);
@@ -319,6 +320,13 @@ static int fuse_open(struct inode *inode, struct file *file)
 	if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
 		return -EINVAL;
 
+	if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
+		fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to complete", fi->nodeid);
+		err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, TASK_KILLABLE);
+		if (err)
+			return err;
+	}
+
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
@@ -361,8 +369,6 @@ static int fuse_open(struct inode *inode, struct file *file)
 		inode_unlock(inode);
 
 	if (!err && fc->close_wait) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-
 		inode_lock(inode);
 		spin_lock(&fi->lock);
 
@@ -1409,6 +1415,12 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return err;
 	}
 
+	/*
+	 * Fail the read with -EIO if the file has been invalidated.
+	 */
+	if (fuse_file_fail_immediately(iocb->ki_filp->private_data))
+		return -EIO;
+
 	return generic_file_read_iter(iocb, to);
 }
 
@@ -1794,6 +1806,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			goto writethrough;
 		}
 
+		/*
+		 * Fail the write with -EIO if the file has been invalidated.
+		 */
+		if (fuse_file_fail_immediately(file->private_data))
+			return -EIO;
+
 		return generic_file_write_iter(iocb, from);
 	}
 
@@ -2488,8 +2506,12 @@ static int fuse_writepage_locked(struct folio *folio)
 	struct fuse_args_pages *ap;
 	struct folio *tmp_folio;
 	struct fuse_file *ff;
-	int error = -ENOMEM;
+	int error = -EIO;
+
+	if (test_bit(FUSE_I_INVAL_FILES, &fi->state))
+		goto err;
 
+	error = -ENOMEM;
 	tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
 	if (!tmp_folio)
 		goto err;
@@ -2704,13 +2726,9 @@ static int fuse_writepages_fill(struct folio *folio,
 
 	BUG_ON(wpa && !data->ff);
 
-	/* More than optimization: writeback pages to /dev/null; fused would
-	 * drop our FUSE_WRITE requests anyway, but it will be blocked while
-	 * sending NOTIFY_INVAL_FILES until we return!
-	 */
 	if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
 		unlock_page(&folio->page);
-		return 0;
+		return -EIO;
 	}
 
 	if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
@@ -2936,10 +2954,6 @@ static int fuse_launder_folio(struct folio *folio)
 		/* Serialize with pending writeback for the same page */
 		fuse_wait_on_page_writeback(inode, folio->index);
 
-		/* Return success if FUSE_NOTIFY_INVAL_FILES is in progress */
-		if (test_bit(FUSE_I_INVAL_FILES, &get_fuse_inode(inode)->state))
-			return 0;
-
 		err = fuse_writepage_locked(folio);
 		if (!err)
 			fuse_wait_on_page_writeback(inode, folio->index);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 853bf12e282d9..966c8e6c2ab70 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -215,6 +215,9 @@ struct fuse_inode {
 		atomic_t read_count;
 		atomic_t write_count;
 	} dio;
+
+	/** Entry on fc->inval_files_list list */
+	struct list_head inval_files_entry;
 };
 
 /** FUSE inode state bits */
@@ -1110,7 +1113,13 @@ struct fuse_conn {
 	} kio;
 
 	int ktrace_level;
-	struct fuse_ktrace * ktrace;
+	struct fuse_ktrace *ktrace;
+	void (*fuse_ktrace_fn)(struct fuse_conn *fc, const char *fmt, ...);
+
+	/* List of fuse_inodes to be invalidated by userspace */
+	struct list_head inval_files_list;
+	struct delayed_work inval_files_work;
+
 	struct dentry *conn_ctl;
 
 	/* New writepages go into this bucket */
@@ -1122,6 +1131,13 @@ struct fuse_conn {
 #endif
 };
 
+#define fuse_ktrace(fc, fmt, args...)  \
+do { \
+	struct fuse_conn *__fc = (fc); \
+	if (__fc->fuse_ktrace_fn) \
+		__fc->fuse_ktrace_fn(__fc, "%s: " fmt, __func__, ## args); \
+} while (0)
+
 /*
  * Represents a mounted filesystem, potentially a submount.
  *
@@ -1552,7 +1568,7 @@ static inline void fuse_dio_wait(struct fuse_inode *fi)
 
 static inline bool fuse_file_fail_immediately(struct fuse_file *ff)
 {
-	return ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+	return unlikely(ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
 }
 
 /**
@@ -1717,6 +1733,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 
 struct fuse_kio_ops *fuse_kio_get(struct fuse_conn *fc, char *name);
 void fuse_kio_put(struct fuse_kio_ops *ops);
+void fuse_drop_waiting(struct fuse_conn *fc);
 
 /* passthrough.c */
 static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f167d275885bf..d627302da0b43 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -35,6 +35,8 @@ struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 EXPORT_SYMBOL_GPL(fuse_mutex);
 
+struct workqueue_struct *fuse_inval_files_wq;
+
 static int fuse_ve_odirect;
 
 static int set_global_limit(const char *val, const struct kernel_param *kp);
@@ -117,6 +119,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->i_size_unstable = 0;
 	fi->private = NULL;
 	INIT_LIST_HEAD(&fi->rw_files);
+	INIT_LIST_HEAD(&fi->inval_files_entry);
 	mutex_init(&fi->mutex);
 	spin_lock_init(&fi->lock);
 	init_waitqueue_head(&fi->dio.waitq);
@@ -603,12 +606,55 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
 		mutex_unlock(&get_fuse_inode(inode)->mutex);
 }
 
+static void fuse_inval_files_work(struct work_struct *w)
+{
+	struct fuse_conn *fc = container_of(w, struct fuse_conn, inval_files_work.work);
+	struct list_head inval_files_list;
+	struct fuse_file *ff;
+	struct fuse_inode *fi;
+
+	INIT_LIST_HEAD(&inval_files_list);
+
+	spin_lock(&fc->lock);
+	list_splice_init(&fc->inval_files_list, &inval_files_list);
+	spin_unlock(&fc->lock);
+
+	while (!list_empty(&inval_files_list)) {
+		u64 nodeid;
+
+		fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
+		list_del(&fi->inval_files_entry);
+		nodeid = get_node_id(&fi->inode) - FUSE_ROOT_ID;
+		fuse_ktrace(fc, "invalidate_file on [%llu] starts", nodeid);
+
+		spin_lock(&fi->lock);
+		list_for_each_entry(ff, &fi->rw_files, rw_entry)
+			fuse_revoke_readpages(ff);
+		spin_unlock(&fi->lock);
+
+		wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
+
+		truncate_pagecache_range(&fi->inode, 0, -1);
+		fuse_invalidate_attr(&fi->inode);
+
+		spin_lock(&fi->lock);
+		clear_bit(FUSE_I_INVAL_FILES, &fi->state);
+		wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
+		spin_unlock(&fi->lock);
+
+		fuse_ktrace(fc, "invalidate_file on [%llu] ends", nodeid);
+		iput(&fi->inode);
+	}
+
+	fuse_drop_waiting(fc);
+}
+
 int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 {
 	struct inode *inode;
 	struct fuse_inode *fi;
 	struct fuse_file *ff;
-	int err, i;
+	int i;
 
 	if (!fc->async_read) {
 		printk(KERN_ERR "Turn async_read ON to use "
@@ -624,6 +670,11 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 
 	/* Mark that invalidate files is in progress */
 	spin_lock(&fi->lock);
+	if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
+		spin_unlock(&fi->lock);
+		iput(inode);
+		return 0;
+	}
 	set_bit(FUSE_I_INVAL_FILES, &fi->state);
 	list_for_each_entry(ff, &fi->rw_files, rw_entry) {
 		spin_lock(&ff->lock);
@@ -638,23 +689,14 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 	for (i = 0; i < FUSE_QHASH_SIZE; i++)
 		wake_up_all(&fc->qhash[i].waitq);
 
-	err = filemap_write_and_wait(inode->i_mapping);
-	if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
-		spin_lock(&fi->lock);
-		list_for_each_entry(ff, &fi->rw_files, rw_entry)
-			fuse_revoke_readpages(ff);
-		spin_unlock(&fi->lock);
-
-		wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
-		err = invalidate_inode_pages2(inode->i_mapping);
-	}
-
-	if (!err)
-		fuse_invalidate_attr(inode);
+	atomic_inc(&fc->num_waiting);
+	spin_lock(&fc->lock);
+	list_add_tail(&fi->inval_files_entry, &fc->inval_files_list);
+	spin_unlock(&fc->lock);
+	if (!queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 0))
+		fuse_drop_waiting(fc);
 
-	clear_bit(FUSE_I_INVAL_FILES, &fi->state);
-	iput(inode);
-	return err;
+	return 0;
 }
 
 static void fuse_umount_begin(struct super_block *sb)
@@ -1308,6 +1350,9 @@ int fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		fuse_backing_files_init(fc);
 
+	INIT_LIST_HEAD(&fc->inval_files_list);
+	INIT_DELAYED_WORK(&fc->inval_files_work, fuse_inval_files_work);
+
 	INIT_LIST_HEAD(&fc->mounts);
 	list_add(&fm->fc_entry, &fc->mounts);
 	fm->fc = fc;
@@ -2454,15 +2499,18 @@ static void fuse_inode_init_once(void *foo)
 
 static int __init fuse_fs_init(void)
 {
-	int err;
+	int err = -ENOMEM;
+
+	fuse_inval_files_wq = alloc_workqueue("fuse_inval_files_wq", WQ_MEM_RECLAIM, 1);
+	if (!fuse_inval_files_wq)
+		goto out;
 
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
 			sizeof(struct fuse_inode), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
 			fuse_inode_init_once);
-	err = -ENOMEM;
 	if (!fuse_inode_cachep)
-		goto out;
+		goto out1;
 
 	err = register_fuseblk();
 	if (err)
@@ -2478,6 +2526,8 @@ static int __init fuse_fs_init(void)
 	unregister_fuseblk();
  out2:
 	kmem_cache_destroy(fuse_inode_cachep);
+ out1:
+	destroy_workqueue(fuse_inval_files_wq);
  out:
 	return err;
 }
@@ -2493,6 +2543,7 @@ static void fuse_fs_cleanup(void)
 	 */
 	rcu_barrier();
 	kmem_cache_destroy(fuse_inode_cachep);
+	destroy_workqueue(fuse_inval_files_wq);
 }
 
 static struct kobject *fuse_kobj;
diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
index eafe2ee2313b7..42cdca250cd99 100644
--- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -158,6 +158,7 @@ MODULE_PARM_DESC(rdmaio_io_failing, "Enable/Disbale RDMA io failing");
 
 static int fuse_ktrace_setup(struct fuse_conn * fc);
 static int fuse_ktrace_remove(struct fuse_conn *fc);
+static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...);
 
 static struct kmem_cache *pcs_fuse_req_cachep;
 static struct kmem_cache *pcs_ireq_cachep;
@@ -1672,6 +1673,8 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
 		goto err;
 	}
 
+	fc->fuse_ktrace_fn = kfuse_trace;
+
 	return 0;
 
 err:
@@ -1680,22 +1683,19 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
 	return ret;
 }
 
-void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ...)
+static void kfuse_tracer(struct fuse_conn *fc, unsigned long ip, const char *fmt, va_list va)
 {
-	struct fuse_ktrace * tr;
-        va_list va;
+	struct fuse_ktrace *tr;
 	int cpu;
 
 	cpu = get_cpu();
 	tr = fc->ktrace;
 	if (tr) {
 		u8 * buf = per_cpu_ptr(tr->buf, cpu);
-		struct fuse_trace_hdr * t;
+		struct fuse_trace_hdr *t;
 		int len;
 
-		va_start(va, fmt);
 		len = vsnprintf(buf, KTRACE_LOG_BUF_SIZE, fmt, va);
-		va_end(va);
 		t = fuse_trace_prepare(tr, FUSE_KTRACE_STRING, len + 1);
 		if (t)
 			memcpy(t + 1, buf, len + 1);
@@ -1710,6 +1710,26 @@ void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, ..
 	put_cpu();
 }
 
+void __kfuse_trace(struct fuse_conn *fc, unsigned long ip, const char *fmt, ...)
+{
+	va_list va;
+
+	va_start(va, fmt);
+	kfuse_tracer(fc, ip, fmt, va);
+	va_end(va);
+}
+
+static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...)
+{
+	va_list va;
+
+	if (fc->ktrace_level >= LOG_TRACE) {
+		va_start(va, fmt);
+		kfuse_tracer(fc, 0, fmt, va);
+		va_end(va);
+	}
+}
+
 void pcs_kio_file_list(struct fuse_conn *fc, kio_file_itr kfile_cb, void *ctx)
 {
 	struct fuse_file *ff;


More information about the Devel mailing list