[Devel] [PATCH VZ9] ve: fix task_ve use-after-free from reading /proc/pid/stat

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Tue Jul 9 12:34:55 MSK 2024


A task can exit and become a zombie, and outlive its ve. Because exited
task is moved out of its css set, so it does not hold any reference back
to ve.

More specifically that can happen in two cases:

1) If task is not inside pid namespace of ve and zap_pid_ns_processes
   will not reap/collect the task;
2) If task is an init of root pid namespace of ve, nothing guarantees
   that it can't become a zombie;

And if someone reads e.g.: /proc/pid/stat for such a task it may lead to
a crash on accessing freed ->task_ve memory.

https://virtuozzo.atlassian.net/browse/PSBM-157285

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 kernel/ve/ve.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e6d4a6f387e3e..ec14691970d6e 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -845,6 +845,14 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	}
 	rcu_read_unlock();
 
+	/*
+	 * To prevent use-after-free (e.g. reading /proc/pid/stat) in case
+	 * zombie container init outlives the ve_struct.
+	 *
+	 * Note: synchronize_rcu in ve_drop_context makes it rcu safe.
+	 */
+	rcu_assign_pointer(current->task_ve, &ve0);
+
 	/*
 	 * At this point all userspace tasks in container are dead.
 	 */
@@ -1199,6 +1207,32 @@ static void ve_attach(struct cgroup_taskset *tset)
 	}
 }
 
+/*
+ * When a container task exits it is moved out of its css set, see cgroup_exit.
+ * So the task effectively puts its reference to ve_struct of the container,
+ * and now nothing prevents ve_struct from being freed while the task is still
+ * there.
+ *
+ * We need to make sure that this task's ->task_ve is not pointing to the
+ * container ve before putting the reference, else the use of ->task_ve from
+ * the task can lead to use-after-free (e.g. reading /proc/pid/stat).
+ *
+ * One exception to that rule is the init process of the container, it needs
+ * ->task_ve in ve_stop_ns / ve_exit_ns, and holds a reference to ve_struct.
+ */
+static void ve_exit(struct task_struct *task)
+{
+	struct ve_struct *ve = task->task_ve;
+
+	/*
+	 * Pairs with synchronize_rcu in ve_grab_context and ve_drop_context
+	 */
+	rcu_read_lock();
+	if (!ve->ve_ns || ve->ve_ns->pid_ns_for_children->child_reaper != task)
+		rcu_assign_pointer(task->task_ve, &ve0);
+	rcu_read_unlock();
+}
+
 static int ve_state_show(struct seq_file *sf, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(sf);
@@ -1906,6 +1940,7 @@ struct cgroup_subsys ve_cgrp_subsys = {
 	.css_free	= ve_destroy,
 	.can_attach	= ve_can_attach,
 	.attach		= ve_attach,
+	.exit		= ve_exit,
 	.legacy_cftypes	= ve_cftypes,
 };
 
-- 
2.45.2



More information about the Devel mailing list