[Devel] [PATCH vz9 2/2] ve/cgroups: drop lock when stopping workqueue to avoid dead lock

Alexander Atanasov alexander.atanasov at virtuozzo.com
Mon Jan 30 08:00:03 MSK 2023


Rework is_running variable into state to avoid guessing the state,
to be able to do more strict checks in what state the ve is at
and to verify state transitions.

This allows to reduce the size of critical sections that needs to
take lock. All entry points check for good state before proceeding.
So when the state is updated we can drop lock at places that require
to do so.

The deadlock is between ve->op_sem and kernfs_rwsem  (lockdep tracked
via kn->active). It is ab-ba deadlock. When using the sysfs
kernfs_rwsem is taken first, then ve code takes ve->op_sem.
When ve code is called from inside the kernel ve->op_sem is taken
first but it can reach back into kernfs code and deadlock.
reboot and release_agent teardown is the reported case.

https://jira.sw.ru/browse/PSBM-144580
Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
---
 include/linux/ve.h | 26 ++++++++++++++++++-
 kernel/ve/ve.c     | 65 +++++++++++++++++++++++++++++++---------------
 2 files changed, 69 insertions(+), 22 deletions(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index a023d9a8d14a..8e173628b532 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -27,6 +27,14 @@ struct user_namespace;
 struct cn_private;
 struct vfsmount;
 
+#define VE_STATE_STARTING 	0
+#define VE_STATE_RUNNING 	1
+#define VE_STATE_STOPPING 	2
+#define VE_STATE_STOPPED 	3
+#define VE_STATE_DEAD		4
+
+#define VE_IS_RUNNING(ve) ((ve)->state == VE_STATE_RUNNING)
+
 struct ve_struct {
 	struct cgroup_subsys_state	css;
 
@@ -35,7 +43,7 @@ struct ve_struct {
 	struct list_head	ve_list;
 
 	envid_t			veid;
-	int			is_running;
+	int			state;
 	u8			is_pseudosuper:1;
 
 	struct rw_semaphore	op_sem;
@@ -144,6 +152,18 @@ extern int nr_ve;
 extern unsigned int sysctl_ve_mount_nr;
 
 #ifdef CONFIG_VE
+static inline void ve_set_state(struct ve_struct *ve, int new_state)
+{
+	WARN_ONCE(ve->state == VE_STATE_DEAD, "VE is already DEAD");
+	WARN_ONCE(new_state == VE_STATE_RUNNING && ve->state != VE_STATE_STARTING,
+		"Invalid transition to running from %d\n", ve->state);
+	WARN_ONCE(new_state == VE_STATE_STOPPING && ve->state != VE_STATE_RUNNING,
+		"Invalid transition to stopping from %d\n", ve->state);
+	WARN_ONCE(new_state == VE_STATE_STOPPED && ve->state != VE_STATE_STOPPING,
+		"Invalid transition to stooped from %d\n", ve->state);
+	ve->state = new_state;
+}
+
 void ve_add_to_release_list(struct cgroup *cgrp);
 void ve_rm_from_release_list(struct cgroup *cgrp);
 
@@ -229,6 +249,10 @@ extern bool is_ve_init_net(const struct net *net);
 static inline void ve_stop_ns(struct pid_namespace *ns) { }
 static inline void ve_exit_ns(struct pid_namespace *ns) { }
 
+
+static inline void ve_set_state(struct ve_struct *ve, int new_state) {}
+
+
 #define ve_feature_set(ve, f)		{ true; }
 
 static inline bool current_user_ns_initial(void)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 80865161670e..8afef0e631d5 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -70,7 +70,7 @@ struct ve_struct ve0 = {
 
 	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
 
-	.is_running		= 1,
+	.state			= VE_STATE_RUNNING,
 	.is_pseudosuper		= 1,
 
 	.init_cred		= &init_cred,
@@ -662,7 +662,7 @@ void ve_add_to_release_list(struct cgroup *cgrp)
 	if (!ve)
 		ve = &ve0;
 
-	if (!ve->is_running) {
+	if (!VE_IS_RUNNING(ve)) {
 		rcu_read_unlock();
 		return;
 	}
@@ -718,7 +718,7 @@ static int ve_start_container(struct ve_struct *ve)
 
 	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
 
-	if (ve->is_running || ve_ns)
+	if (ve->state != VE_STATE_STARTING || ve_ns)
 		return -EBUSY;
 
 	if (tsk->task_ve != ve || !is_child_reaper(task_pid(tsk)))
@@ -766,7 +766,7 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err)
 		goto err_release_agent_setup;
 
-	ve->is_running = 1;
+	ve_set_state(ve, VE_STATE_RUNNING);
 
 	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
 
@@ -805,19 +805,31 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
 	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
 		goto unlock;
 	/*
-	 * Here the VE changes its state into "not running".
+	 * Here the VE changes its state into stopping.
 	 * op_sem works as barrier for vzctl ioctls.
 	 * ve_mutex works as barrier for ve_can_attach().
 	 */
-	ve->is_running = 0;
+	ve_set_state(ve, VE_STATE_STOPPING);
 	synchronize_rcu();
 
+	/*
+	 * Drop the lock - all entry points must check state before proceeding.
+	 * The goal is to avoid a deadlock between sysfs access from userspace and
+	 * removing sysfs entries from inside the kernel - i.e. order of ve->op_sem
+	 * and kernfs_rwsem (lockdep checked via kn->active)
+	 */
+
+	up_write(&ve->op_sem);
+
 	/*
 	 * release_agent works on top of umh_worker, so we must make sure, that
 	 * ve workqueue is stopped first.
 	 */
+
 	ve_workqueue_stop(ve);
 
+	down_write(&ve->op_sem);
+
 	/*
 	 * Neither it can be in pseudosuper state
 	 * anymore, setup it again if needed.
@@ -857,6 +869,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 	ve_list_del(ve);
 	ve_drop_context(ve);
+	ve_set_state(ve, VE_STATE_STOPPED);
 	up_write(&ve->op_sem);
 
 	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
@@ -957,6 +970,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	if (!ve->sched_lat_ve.cur)
 		goto err_lat;
 
+	ve->state = VE_STATE_STARTING;
 	ve->features = VE_FEATURES_DEF;
 
 	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
@@ -1096,6 +1110,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
 	kfree(ve->binfmt_misc);
 #endif
 	free_percpu(ve->sched_lat_ve.cur);
+	ve_set_state(ve, VE_STATE_DEAD);
 	kmem_cache_free(ve_cachep, ve);
 }
 
@@ -1132,7 +1147,7 @@ static int ve_is_attachable(struct cgroup_taskset *tset)
 	task = cgroup_taskset_first(tset, &css);
 	ve = css_to_ve(css);
 
-	if (ve->is_running)
+	if (VE_IS_RUNNING(ve))
 		return 0;
 
 	if (!ve->veid) {
@@ -1179,7 +1194,7 @@ static void ve_attach(struct cgroup_taskset *tset)
 		struct ve_struct *ve = css_to_ve(css);
 
 		/* this probihibts ptracing of task entered to VE from host system */
-		if (ve->is_running && task->mm)
+		if (VE_IS_RUNNING(ve) && task->mm)
 			task->mm->vps_dumpable = VD_VE_ENTER_TASK;
 
 		/* Drop OOM protection. */
@@ -1204,14 +1219,18 @@ static int ve_state_show(struct seq_file *sf, void *v)
 	struct ve_struct *ve = css_to_ve(css);
 
 	down_read(&ve->op_sem);
-	if (ve->is_running)
+	if (ve->state == VE_STATE_STARTING)
+		seq_puts(sf, "STARTING");
+	else if (ve->state == VE_STATE_RUNNING)
 		seq_puts(sf, "RUNNING");
-	else if (!cgroup_is_populated(css->cgroup) && !ve->ve_ns)
-		seq_puts(sf, "STOPPED");
-	else if (ve->ve_ns)
+	else if (ve->state == VE_STATE_STOPPING)
 		seq_puts(sf, "STOPPING");
+	else if (ve->state == VE_STATE_STOPPED)
+		seq_puts(sf, "STOPPED");
+	else if (ve->state == VE_STATE_DEAD)
+		seq_puts(sf, "DEAD");
 	else
-		seq_puts(sf, "STARTING");
+		seq_puts(sf, "UNKNOWN");
 	seq_putc(sf, '\n');
 	up_read(&ve->op_sem);
 
@@ -1227,9 +1246,13 @@ static ssize_t ve_state_write(struct kernfs_open_file *of, char *buf,
 	int ret = -EINVAL;
 
 	if (!strcmp(buf, "START")) {
-		down_write(&ve->op_sem);
-		ret = ve_start_container(ve);
-		up_write(&ve->op_sem);
+		if (ve->state != VE_STATE_STARTING)
+			ret = -EBUSY;
+		else {
+			down_write(&ve->op_sem);
+			ret = ve_start_container(ve);
+			up_write(&ve->op_sem);
+		}
 	}
 	return ret ? ret : nbytes;
 }
@@ -1254,7 +1277,7 @@ static int ve_id_write(struct cgroup_subsys_state *css, struct cftype *cft, u64
 	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
 
 	/* FIXME: check veid is uniqul */
-	if (ve->is_running || ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve_ns) {
 		if (ve->veid != val)
 			err = -EBUSY;
 	} else
@@ -1289,7 +1312,7 @@ static int ve_pseudosuper_write(struct cgroup_subsys_state *css, struct cftype *
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (val && (ve->is_running || ve->ve_ns)) {
+	if (val && (VE_IS_RUNNING(ve) || ve->ve_ns)) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
@@ -1325,7 +1348,7 @@ static int ve_features_write(struct cgroup_subsys_state *css, struct cftype *cft
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (ve->is_running || ve->ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
@@ -1348,7 +1371,7 @@ static int ve_netns_max_nr_write(struct cgroup_subsys_state *css, struct cftype
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (ve->is_running || ve->ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
@@ -1699,7 +1722,7 @@ static int ve_aio_max_nr_write(struct cgroup_subsys_state *css,
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (ve->is_running || ve->ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
-- 
2.31.1



More information about the Devel mailing list