[Devel] [PATCH RHEL8 COMMIT] fuse: queue work for aio_complete (v3)

Konstantin Khorenko khorenko at virtuozzo.com
Fri Apr 23 11:54:54 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.19
------>
commit e4c7e297ae1f0ae8701f69339b7aa8ad5218af03
Author: Maxim Patlasov <mpatlasov at virtuozzo.com>
Date:   Fri Apr 23 11:54:54 2021 +0300

    fuse: queue work for aio_complete (v3)
    
    We cannot simply call aio_complete() from fuse_aio_complete() becase rhel7
    doesn't call __fput from a separate context as it used to be on rhel6.
    Otherwise a deadlock for single-threaded fuse daemon can happen: if process
    who generated AIO is already close(2) the file, fput() called from aio_complete
    will be the last fput(); hence, it will send flush_mtime (or release) request
    to userspace who is busy now writing ACK for given AIO to in-kernel fuse.
    
    Changed in v2 (thanks azaitsev@ for suggestion):
     - call aio_complete() with increased f_count; this allows to avoid extra
       context switch when the user still holds the file opened.
    Changed in v3 (thanks pborzenkov@ for fixing):
     - avoid io->iocb use-after-free ('io->iocb' struct is already freed at
       the time fuse_fput_routine() is called. Use 'io->file' member to get
       proper 'struct file' instead.)
    
    https://jira.sw.ru/browse/PSBM-54547
    
    Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
---
 fs/fuse/dev.c    | 11 ++++++++++-
 fs/fuse/file.c   | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h |  1 +
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 13e15a4d7a81..564b76e5687d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -30,6 +30,7 @@ MODULE_ALIAS("devname:fuse");
 #define FUSE_REQ_ID_STEP (1ULL << 1)
 
 static struct kmem_cache *fuse_req_cachep;
+extern struct workqueue_struct *fuse_fput_wq;
 
 static struct fuse_dev *fuse_get_dev(struct file *file)
 {
@@ -2473,11 +2474,16 @@ static struct miscdevice fuse_miscdevice = {
 int __init fuse_dev_init(void)
 {
 	int err = -ENOMEM;
+
+	fuse_fput_wq = create_workqueue("fuse_fput");
+	if (!fuse_fput_wq)
+		goto out;
+
 	fuse_req_cachep = kmem_cache_create("fuse_request",
 					    sizeof(struct fuse_req),
 					    0, 0, NULL);
 	if (!fuse_req_cachep)
-		goto out;
+		goto out_destroq_wq;
 
 	err = misc_register(&fuse_miscdevice);
 	if (err)
@@ -2487,6 +2493,8 @@ int __init fuse_dev_init(void)
 
  out_cache_clean:
 	kmem_cache_destroy(fuse_req_cachep);
+ out_destroq_wq:
+	destroy_workqueue(fuse_fput_wq);
  out:
 	return err;
 }
@@ -2495,4 +2503,5 @@ void fuse_dev_cleanup(void)
 {
 	misc_deregister(&fuse_miscdevice);
 	kmem_cache_destroy(fuse_req_cachep);
+	destroy_workqueue(fuse_fput_wq);
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 13c0e453cd6f..925cf5fc55ba 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,13 @@
 #include <linux/falloc.h>
 #include <linux/uio.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/file.h>
+
+struct workqueue_struct *fuse_fput_wq;
+static DEFINE_SPINLOCK(fuse_fput_lock);
+static LIST_HEAD(fuse_fput_head);
+static void fuse_fput_routine(struct work_struct *);
+static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
 
 static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
 				      struct fuse_page_desc **desc)
@@ -783,6 +790,29 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
 	return io->bytes < 0 ? io->size : io->bytes;
 }
 
+static void fuse_fput_routine(struct work_struct *data)
+{
+	spin_lock(&fuse_fput_lock);
+	while (likely(!list_empty(&fuse_fput_head))) {
+		struct fuse_io_priv *io = list_entry(fuse_fput_head.next,
+						     struct fuse_io_priv,
+						     list);
+		struct file *file = io->iocb->ki_filp;
+
+		list_del(&io->list);
+		spin_unlock(&fuse_fput_lock);
+
+		/* hack: __fput() is not visible outside fs/file_table.c */
+		BUG_ON(atomic_long_read(&file->f_count));
+		atomic_long_inc(&file->f_count);
+		fput(file);
+
+		kref_put(&io->refcnt, fuse_io_release);
+		spin_lock(&fuse_fput_lock);
+	}
+	spin_unlock(&fuse_fput_lock);
+}
+
 /**
  * In case of short read, the caller sets 'pos' to the position of
  * actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -816,6 +846,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 
 	if (!left && !io->blocking) {
 		ssize_t res = fuse_get_res_by_io(io);
+		struct file *file = io->iocb->ki_filp;
 
 		if (res >= 0) {
 			struct inode *inode = file_inode(io->iocb->ki_filp);
@@ -834,7 +865,22 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			       io->iocb->ki_flags, io->iocb->ki_pos);
 		}
 
+		/* We have to bump f_count here to avoid deadlock for
+		 * single-threaded fuse daemon: if process who generated
+		 * AIO is already close(2) the file, fput() called from
+		 * aio_complete will be the last fput(); hence, it will send
+		 * flush_mtime (or release) request to userspace who is busy
+		 * now writing ACK for given AIO to in-kernel fuse */
+		get_file(file);
 		io->iocb->ki_complete(io->iocb, res, 0);
+
+		if (unlikely(atomic_long_dec_and_test(&file->f_count))) {
+			spin_lock(&fuse_fput_lock);
+			list_add(&io->list, &fuse_fput_head);
+			spin_unlock(&fuse_fput_lock);
+			queue_work(fuse_fput_wq, &fuse_fput_work);
+			return;
+		}
 	}
 
 	kref_put(&io->refcnt, fuse_io_release);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4f2cdcd1dc51..4acf347595d0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -340,6 +340,7 @@ struct fuse_io_priv {
 	struct kiocb *iocb;
 	struct completion *done;
 	bool blocking;
+	struct list_head list;
 };
 
 #define FUSE_IO_PRIV_SYNC(i) \


More information about the Devel mailing list