[Devel] [PATCH RHEL7 COMMIT] kernel: call task_work_run() before exit_task_namespaces()
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jul 20 11:41:59 MSK 2017
The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.11
------>
commit 8ec11163ae371bdf08f3e7f5e2420b82ea71c32d
Author: Andrei Vagin <avagin at virtuozzo.com>
Date: Thu Jul 20 12:41:59 2017 +0400
kernel: call task_work_run() before exit_task_namespaces()
This patch solves a problem for a following case. We have a container (a
group of processes in pid and mount namespaces) with a fuse mount. An
init process exits and the kernel kills all process in its pid
namespace. There is a fuse daemon, which handle the fuse mount.
Currently the kernel kills this process and closes all its file
descriptors, but __fput() for them is postponed and they will be
called from a task_work. Then the kernel starts destroying the mount
namespace and the fuse mount, it sees that a control descriptor for
this mount is alive and sends a request to a fuse daemon:
$ cat /proc/4353/task/4355/stack
[<ffffffffa04c3451>] request_wait_answer+0x91/0x270 [fuse]
[<ffffffffa04c36b7>] __fuse_request_send+0x87/0xe0 [fuse]
[<ffffffffa04c6c47>] fuse_request_check_and_send+0x27/0x30 [fuse]
[<ffffffffa04c6c60>] fuse_request_send+0x10/0x20 [fuse]
[<ffffffffa04d2f35>] fuse_put_super+0x55/0xc0 [fuse]
[<ffffffff81218b32>] generic_shutdown_super+0x72/0xf0
[<ffffffff81218f12>] kill_anon_super+0x12/0x20
[<ffffffffa04d2577>] fuse_kill_sb_anon+0x47/0x50 [fuse]
[<ffffffff812194a9>] deactivate_locked_super+0x49/0x80
[<ffffffff81219526>] deactivate_super+0x46/0x60
[<ffffffff81237145>] mntput_no_expire+0xc5/0x120
[<ffffffff812371c4>] mntput+0x24/0x40
[<ffffffff812372f8>] namespace_unlock+0x118/0x130
[<ffffffff81239f2b>] put_mnt_ns+0x4b/0x60
[<ffffffff810b786b>] free_nsproxy+0x1b/0x90
[<ffffffff810b7a0a>] switch_task_namespaces+0x5a/0x70
[<ffffffff810b7ae0>] exit_task_namespaces+0x10/0x20
[<ffffffff8108c883>] do_exit+0x2f3/0xb20
[<ffffffff8108d12f>] do_group_exit+0x3f/0xa0
[<ffffffff8109e760>] get_signal_to_deliver+0x1d0/0x6d0
[<ffffffff8102a357>] do_signal+0x57/0x6b0
[<ffffffff8102aa0f>] do_notify_resume+0x5f/0xb0
[<ffffffff8169273d>] int_signal+0x12/0x17
[<ffffffffffffffff>] 0xffffffffffffffff
But we know that a fuse daemon is already dead and the control
descriptor isn't closed completely, because __fput() was postponed.
This patch calls task_work_run() before destroying namespaces to
complete closing all process files.
In the upstream kernel deactivate_super() is called from a task_work too, but
we can't backport these changes from the upstream, because they are too big.
https://jira.sw.ru/browse/PSBM-68266
Signed-off-by: Andrei Vagin <avagin at virtuozzo.com>
---
include/linux/task_work.h | 9 +++++++--
kernel/exit.c | 9 +++++++++
kernel/task_work.c | 4 ++--
3 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index ca5a1cf..b3af76d 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -14,11 +14,16 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
-void task_work_run(void);
+void __task_work_run(bool exiting);
+
+static inline void task_work_run(void)
+{
+ return __task_work_run(false);
+}
static inline void exit_task_work(struct task_struct *task)
{
- task_work_run();
+ __task_work_run(true);
}
#endif /* _LINUX_TASK_WORK_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3c83db2..ea54a73 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -827,6 +827,15 @@ void do_exit(long code)
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
+
+ /*
+ * task_work_run() has to be called before exit_task_namespaces(),
+ * because fuse_abort_conn() is called from __fput(). If it will not
+ * be executed, we can hang in request_wait_answer(). We have seen this
+ * situation when a process was the last member of a mount namespace
+ * and the mount namespace has a vstorage fuse mount.
+ */
+ task_work_run();
exit_task_namespaces(tsk);
exit_task_work(tsk);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c9..f0000c4 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -46,7 +46,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
return work;
}
-void task_work_run(void)
+void __task_work_run(bool exiting)
{
struct task_struct *task = current;
struct callback_head *work, *head, *next;
@@ -58,7 +58,7 @@ void task_work_run(void)
*/
do {
work = ACCESS_ONCE(task->task_works);
- head = !work && (task->flags & PF_EXITING) ?
+ head = !work && exiting ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
More information about the Devel
mailing list