[Devel] [PATCH RHEL8 COMMIT] fuse: queue work for aio_complete (v3)
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Apr 23 11:54:54 MSK 2021
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.19
------>
commit e4c7e297ae1f0ae8701f69339b7aa8ad5218af03
Author: Maxim Patlasov <mpatlasov at virtuozzo.com>
Date: Fri Apr 23 11:54:54 2021 +0300
fuse: queue work for aio_complete (v3)
We cannot simply call aio_complete() from fuse_aio_complete() becase rhel7
doesn't call __fput from a separate context as it used to be on rhel6.
Otherwise a deadlock for single-threaded fuse daemon can happen: if process
who generated AIO is already close(2) the file, fput() called from aio_complete
will be the last fput(); hence, it will send flush_mtime (or release) request
to userspace who is busy now writing ACK for given AIO to in-kernel fuse.
Changed in v2 (thanks azaitsev@ for suggestion):
- call aio_complete() with increased f_count; this allows to avoid extra
context switch when the user still holds the file opened.
Changed in v3 (thanks pborzenkov@ for fixing):
- avoid io->iocb use-after-free ('io->iocb' struct is already freed at
the time fuse_fput_routine() is called. Use 'io->file' member to get
proper 'struct file' instead.)
https://jira.sw.ru/browse/PSBM-54547
Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
---
fs/fuse/dev.c | 11 ++++++++++-
fs/fuse/file.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
fs/fuse/fuse_i.h | 1 +
3 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 13e15a4d7a81..564b76e5687d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -30,6 +30,7 @@ MODULE_ALIAS("devname:fuse");
#define FUSE_REQ_ID_STEP (1ULL << 1)
static struct kmem_cache *fuse_req_cachep;
+extern struct workqueue_struct *fuse_fput_wq;
static struct fuse_dev *fuse_get_dev(struct file *file)
{
@@ -2473,11 +2474,16 @@ static struct miscdevice fuse_miscdevice = {
int __init fuse_dev_init(void)
{
int err = -ENOMEM;
+
+ fuse_fput_wq = create_workqueue("fuse_fput");
+ if (!fuse_fput_wq)
+ goto out;
+
fuse_req_cachep = kmem_cache_create("fuse_request",
sizeof(struct fuse_req),
0, 0, NULL);
if (!fuse_req_cachep)
- goto out;
+ goto out_destroq_wq;
err = misc_register(&fuse_miscdevice);
if (err)
@@ -2487,6 +2493,8 @@ int __init fuse_dev_init(void)
out_cache_clean:
kmem_cache_destroy(fuse_req_cachep);
+ out_destroq_wq:
+ destroy_workqueue(fuse_fput_wq);
out:
return err;
}
@@ -2495,4 +2503,5 @@ void fuse_dev_cleanup(void)
{
misc_deregister(&fuse_miscdevice);
kmem_cache_destroy(fuse_req_cachep);
+ destroy_workqueue(fuse_fput_wq);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 13c0e453cd6f..925cf5fc55ba 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,13 @@
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/file.h>
+
+struct workqueue_struct *fuse_fput_wq;
+static DEFINE_SPINLOCK(fuse_fput_lock);
+static LIST_HEAD(fuse_fput_head);
+static void fuse_fput_routine(struct work_struct *);
+static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
struct fuse_page_desc **desc)
@@ -783,6 +790,29 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
return io->bytes < 0 ? io->size : io->bytes;
}
+static void fuse_fput_routine(struct work_struct *data)
+{
+ spin_lock(&fuse_fput_lock);
+ while (likely(!list_empty(&fuse_fput_head))) {
+ struct fuse_io_priv *io = list_entry(fuse_fput_head.next,
+ struct fuse_io_priv,
+ list);
+ struct file *file = io->iocb->ki_filp;
+
+ list_del(&io->list);
+ spin_unlock(&fuse_fput_lock);
+
+ /* hack: __fput() is not visible outside fs/file_table.c */
+ BUG_ON(atomic_long_read(&file->f_count));
+ atomic_long_inc(&file->f_count);
+ fput(file);
+
+ kref_put(&io->refcnt, fuse_io_release);
+ spin_lock(&fuse_fput_lock);
+ }
+ spin_unlock(&fuse_fput_lock);
+}
+
/**
* In case of short read, the caller sets 'pos' to the position of
* actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -816,6 +846,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
if (!left && !io->blocking) {
ssize_t res = fuse_get_res_by_io(io);
+ struct file *file = io->iocb->ki_filp;
if (res >= 0) {
struct inode *inode = file_inode(io->iocb->ki_filp);
@@ -834,7 +865,22 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
io->iocb->ki_flags, io->iocb->ki_pos);
}
+ /* We have to bump f_count here to avoid deadlock for
+ * single-threaded fuse daemon: if process who generated
+ * AIO is already close(2) the file, fput() called from
+ * aio_complete will be the last fput(); hence, it will send
+ * flush_mtime (or release) request to userspace who is busy
+ * now writing ACK for given AIO to in-kernel fuse */
+ get_file(file);
io->iocb->ki_complete(io->iocb, res, 0);
+
+ if (unlikely(atomic_long_dec_and_test(&file->f_count))) {
+ spin_lock(&fuse_fput_lock);
+ list_add(&io->list, &fuse_fput_head);
+ spin_unlock(&fuse_fput_lock);
+ queue_work(fuse_fput_wq, &fuse_fput_work);
+ return;
+ }
}
kref_put(&io->refcnt, fuse_io_release);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4f2cdcd1dc51..4acf347595d0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -340,6 +340,7 @@ struct fuse_io_priv {
struct kiocb *iocb;
struct completion *done;
bool blocking;
+ struct list_head list;
};
#define FUSE_IO_PRIV_SYNC(i) \
More information about the Devel
mailing list