[Devel] [PATCH RHEL7 COMMIT] fuse: queue work for aio_complete (v3)
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Nov 3 01:57:31 PDT 2016
The commit is pushed to "branch-rh7-3.10.0-327.36.1.vz7.19.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.36.1.vz7.19.5
------>
commit 1a258c698ada93f8cbca12ed6717093f849ab5b4
Author: Maxim Patlasov <mpatlasov at virtuozzo.com>
Date: Thu Nov 3 12:57:30 2016 +0400
fuse: queue work for aio_complete (v3)
We cannot simply call aio_complete() from fuse_aio_complete() becase rhel7
doesn't call __fput from a separate context as it used to be on rhel6.
Otherwise a deadlock for single-threaded fuse daemon can happen: if process
who generated AIO is already close(2) the file, fput() called from aio_complete
will be the last fput(); hence, it will send flush_mtime (or release) request
to userspace who is busy now writing ACK for given AIO to in-kernel fuse.
Changed in v2 (thanks azaitsev@ for suggestion):
- call aio_complete() with increased f_count; this allows to avoid extra
context switch when the user still holds the file opened.
Changed in v3 (thanks pborzenkov@ for fixing):
- avoid io->iocb use-after-free ('io->iocb' struct is already freed at
the time fuse_fput_routine() is called. Use 'io->file' member to get
proper 'struct file' instead.)
https://jira.sw.ru/browse/PSBM-54547
Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
---
fs/fuse/dev.c | 11 ++++++++++-
fs/fuse/file.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
fs/fuse/fuse_i.h | 1 +
3 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0b6d000..d6029ca 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,6 +25,7 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
static struct kmem_cache *fuse_req_cachep;
+extern struct workqueue_struct *fuse_fput_wq;
static struct fuse_conn *fuse_get_conn(struct file *file)
{
@@ -2243,11 +2244,16 @@ static struct miscdevice fuse_miscdevice = {
int __init fuse_dev_init(void)
{
int err = -ENOMEM;
+
+ fuse_fput_wq = create_workqueue("fuse_fput");
+ if (!fuse_fput_wq)
+ goto out;
+
fuse_req_cachep = kmem_cache_create("fuse_request",
sizeof(struct fuse_req),
0, 0, NULL);
if (!fuse_req_cachep)
- goto out;
+ goto out_destroq_wq;
err = misc_register(&fuse_miscdevice);
if (err)
@@ -2257,6 +2263,8 @@ int __init fuse_dev_init(void)
out_cache_clean:
kmem_cache_destroy(fuse_req_cachep);
+ out_destroq_wq:
+ destroy_workqueue(fuse_fput_wq);
out:
return err;
}
@@ -2265,4 +2273,5 @@ void fuse_dev_cleanup(void)
{
misc_deregister(&fuse_miscdevice);
kmem_cache_destroy(fuse_req_cachep);
+ destroy_workqueue(fuse_fput_wq);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 49ee3de..796638a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,13 @@
#include <linux/falloc.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/virtinfo.h>
+#include <linux/file.h>
+
+struct workqueue_struct *fuse_fput_wq;
+static DEFINE_SPINLOCK(fuse_fput_lock);
+static LIST_HEAD(fuse_fput_head);
+static void fuse_fput_routine(struct work_struct *);
+static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
static const struct file_operations fuse_direct_io_file_operations;
static void fuse_sync_writes(struct inode *inode);
@@ -761,6 +768,29 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
+static void fuse_fput_routine(struct work_struct *data)
+{
+ spin_lock(&fuse_fput_lock);
+ while (likely(!list_empty(&fuse_fput_head))) {
+ struct fuse_io_priv *io = list_entry(fuse_fput_head.next,
+ struct fuse_io_priv,
+ list);
+ struct file *file = io->file;
+
+ list_del(&io->list);
+ spin_unlock(&fuse_fput_lock);
+
+ /* hack: __fput() is not visible outside fs/file_table.c */
+ BUG_ON(atomic_long_read(&file->f_count));
+ atomic_long_inc(&file->f_count);
+ fput(file);
+
+ kfree(io);
+ spin_lock(&fuse_fput_lock);
+ }
+ spin_unlock(&fuse_fput_lock);
+}
+
/**
* In case of short read, the caller sets 'pos' to the position of
* actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -792,6 +822,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
if (!left) {
long res;
+ struct file *file = io->iocb->ki_filp;
if (io->err)
res = io->err;
@@ -801,7 +832,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
res = io->bytes < 0 ? io->size : io->bytes;
if (!is_sync_kiocb(io->iocb)) {
- struct path *path = &io->iocb->ki_filp->f_path;
+ struct path *path = &file->f_path;
struct inode *inode = path->dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -819,8 +850,25 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
io, err, pos, io->err, io->bytes,
io->size, is_sync_kiocb(io->iocb), res,
io->iocb->ki_opcode, io->iocb->ki_pos);
+
+ /* We have to bump f_count here to avoid deadlock for
+ * single-threaded fuse daemon: if process who generated
+ * AIO is already close(2) the file, fput() called from
+ * aio_complete will be the last fput(); hence, it will send
+ * flush_mtime (or release) request to userspace who is busy
+ * now writing ACK for given AIO to in-kernel fuse */
+ get_file(file);
+ BUG_ON(io->file != io->iocb->ki_filp);
aio_complete(io->iocb, res, 0);
- kfree(io);
+
+ if (unlikely(atomic_long_dec_and_test(&file->f_count))) {
+ spin_lock(&fuse_fput_lock);
+ list_add(&io->list, &fuse_fput_head);
+ spin_unlock(&fuse_fput_lock);
+ queue_work(fuse_fput_wq, &fuse_fput_work);
+ } else {
+ kfree(io);
+ }
}
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24aaeb7..1d24bf6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -277,6 +277,7 @@ struct fuse_io_priv {
int err;
struct kiocb *iocb;
struct file *file;
+ struct list_head list;
};
/**
More information about the Devel
mailing list