[Devel] [PATCH RHEL9 COMMIT] ve/cgroups: Drop lock when stopping workqueue to avoid dead lock

Konstantin Khorenko khorenko at virtuozzo.com
Wed Mar 1 18:53:37 MSK 2023


The commit is pushed to "branch-rh9-5.14.0-162.6.1.vz9.18.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-162.6.1.vz9.18.11
------>
commit e027bba11f17becbb9c1ea136ab66cdc2e449ffa
Author: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
Date:   Thu Feb 2 22:26:11 2023 +0200

    ve/cgroups: Drop lock when stopping workqueue to avoid dead lock
    
    Rework is_running variable into state to avoid guessing the state,
    be able to do more strict checks in what state the VE is in
    and verify state transitions.
    
    This allows to reduce the size of critical sections that need to
    take lock. All entry points check for good state before proceeding.
    
    The deadlock is between ve->op_sem and kernfs_rwsem (lockdep tracked
    via kn->active). It is AB-BA deadlock. When using the sysfs
    kernfs_rwsem is taken first, then VE code takes ve->op_sem.
    
    When VE code is called from inside the kernel ve->op_sem is taken
    first but it can reach back into kernfs code and deadlock.
    
    reboot and release_agent teardown is the reported case.
    
    Note: libvzctl only cares if the CT is stopped or it is assummed
          running, so
           - STOPPED internal CT state is set on the appropriate error paths
           - "STOPPED" is reported to userspace in case the internal
             CT state is either STOPPED or DEAD.
    
    v2:
      * Improved state handling on error paths and fixed show state to be
        compatible with vzctl.
    
      * libvzctl only cares if the ve is stopped or it is assummed running -
        fix this by reporting and updating the states so libvzctl gets what
        it expects.
    
    v3:
      * ve_state_show() is rewritten to use switch/case
      * ve_state_write() hunk dropped - VE_STATE_STARTING is checked inside
        ve_start_container() under op_sem and that is enough
      * ve_set_state(): added CT name in the warning output except for the
        case the prev state == DEAD and ve_name is already freed
    
    https://jira.sw.ru/browse/PSBM-144580
    Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
---
 include/linux/ve.h | 29 +++++++++++++++++++++++-
 kernel/ve/ve.c     | 66 +++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index 678cd9b6a94a..8ca0e7a15f25 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -27,6 +27,14 @@ struct user_namespace;
 struct cn_private;
 struct vfsmount;
 
+#define VE_STATE_STARTING	0
+#define VE_STATE_RUNNING	1
+#define VE_STATE_STOPPING	2
+#define VE_STATE_STOPPED	3
+#define VE_STATE_DEAD		4
+
+#define VE_IS_RUNNING(ve) ((ve)->state == VE_STATE_RUNNING)
+
 struct ve_struct {
 	struct cgroup_subsys_state	css;
 
@@ -35,7 +43,7 @@ struct ve_struct {
 	struct list_head	ve_list;
 
 	envid_t			veid;
-	int			is_running;
+	int			state;
 	u8			is_pseudosuper:1;
 
 	struct rw_semaphore	op_sem;
@@ -144,6 +152,23 @@ extern int nr_ve;
 extern unsigned int sysctl_ve_mount_nr;
 
 #ifdef CONFIG_VE
+static inline void ve_set_state(struct ve_struct *ve, int new_state)
+{
+	if (ve->state == VE_STATE_DEAD) {
+		WARN_ONCE(1, "CT: already DEAD\n");
+	} else {
+		WARN_ONCE(new_state == VE_STATE_RUNNING &&
+			  ve->state != VE_STATE_STARTING,
+			  "CT#%s: Invalid transition to running from %d\n",
+			  ve_name(ve), ve->state);
+		WARN_ONCE(new_state == VE_STATE_STOPPING &&
+			  ve->state != VE_STATE_RUNNING,
+			  "CT#%s: Invalid transition to stopping from %d\n",
+			  ve_name(ve), ve->state);
+	}
+	ve->state = new_state;
+}
+
 void ve_add_to_release_list(struct cgroup *cgrp);
 void ve_rm_from_release_list(struct cgroup *cgrp);
 
@@ -228,6 +253,8 @@ extern bool is_ve_init_net(const struct net *net);
 static inline void ve_stop_ns(struct pid_namespace *ns) { }
 static inline void ve_exit_ns(struct pid_namespace *ns) { }
 
+static inline void ve_set_state(struct ve_struct *ve, int new_state) {}
+
 #define ve_feature_set(ve, f)		{ true; }
 
 static inline bool current_user_ns_initial(void)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 55d45b5f2fbf..c56e0849fa3b 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -69,7 +69,7 @@ struct ve_struct ve0 = {
 
 	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
 
-	.is_running		= 1,
+	.state			= VE_STATE_RUNNING,
 	.is_pseudosuper		= 1,
 
 	.init_cred		= &init_cred,
@@ -647,7 +647,7 @@ void ve_add_to_release_list(struct cgroup *cgrp)
 	if (!ve)
 		ve = &ve0;
 
-	if (!ve->is_running) {
+	if (!VE_IS_RUNNING(ve)) {
 		rcu_read_unlock();
 		return;
 	}
@@ -703,7 +703,7 @@ static int ve_start_container(struct ve_struct *ve)
 
 	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
 
-	if (ve->is_running || ve_ns)
+	if (ve->state != VE_STATE_STARTING || ve_ns)
 		return -EBUSY;
 
 	if (tsk->task_ve != ve || !is_child_reaper(task_pid(tsk)))
@@ -751,7 +751,7 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err)
 		goto err_release_agent_setup;
 
-	ve->is_running = 1;
+	ve_set_state(ve, VE_STATE_RUNNING);
 
 	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
 
@@ -772,6 +772,7 @@ static int ve_start_container(struct ve_struct *ve)
 err_kthreadd:
 	ve_list_del(ve);
 err_list:
+	ve_set_state(ve, VE_STATE_STOPPED);
 	ve_drop_context(ve);
 	return err;
 }
@@ -790,19 +791,29 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
 	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
 		goto unlock;
 	/*
-	 * Here the VE changes its state into "not running".
+	 * Here the VE changes its state into stopping.
 	 * op_sem works as barrier for vzctl ioctls.
 	 * ve_mutex works as barrier for ve_can_attach().
 	 */
-	ve->is_running = 0;
+	ve_set_state(ve, VE_STATE_STOPPING);
 	synchronize_rcu();
 
+	/*
+	 * Drop the lock - all entry points must check state before proceeding.
+	 * The goal is to avoid a deadlock between sysfs access from userspace
+	 * and removing sysfs entries from inside the kernel - i.e. order of
+	 * ve->op_sem and kernfs_rwsem (lockdep checked via kn->active).
+	 */
+	up_write(&ve->op_sem);
+
 	/*
 	 * release_agent works on top of umh_worker, so we must make sure, that
 	 * ve workqueue is stopped first.
 	 */
 	ve_workqueue_stop(ve);
 
+	down_write(&ve->op_sem);
+
 	/*
 	 * Neither it can be in pseudosuper state
 	 * anymore, setup it again if needed.
@@ -842,6 +853,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 	ve_list_del(ve);
 	ve_drop_context(ve);
+	ve_set_state(ve, VE_STATE_STOPPED);
 	up_write(&ve->op_sem);
 
 	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
@@ -942,6 +954,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	if (!ve->sched_lat_ve.cur)
 		goto err_lat;
 
+	ve->state = VE_STATE_STARTING;
 	ve->features = VE_FEATURES_DEF;
 
 	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
@@ -1011,6 +1024,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 err_lat:
 	kmem_cache_free(ve_cachep, ve);
 err_ve:
+	ve_set_state(ve, VE_STATE_STOPPED);
 	return ERR_PTR(err);
 }
 
@@ -1081,6 +1095,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
 	kfree(ve->binfmt_misc);
 #endif
 	free_percpu(ve->sched_lat_ve.cur);
+	ve_set_state(ve, VE_STATE_DEAD);
 	kmem_cache_free(ve_cachep, ve);
 }
 
@@ -1117,7 +1132,7 @@ static int ve_is_attachable(struct cgroup_taskset *tset)
 	task = cgroup_taskset_first(tset, &css);
 	ve = css_to_ve(css);
 
-	if (ve->is_running)
+	if (VE_IS_RUNNING(ve))
 		return 0;
 
 	if (!ve->veid) {
@@ -1164,7 +1179,7 @@ static void ve_attach(struct cgroup_taskset *tset)
 		struct ve_struct *ve = css_to_ve(css);
 
 		/* this probihibts ptracing of task entered to VE from host system */
-		if (ve->is_running && task->mm)
+		if (VE_IS_RUNNING(ve) && task->mm)
 			task->mm->vps_dumpable = VD_VE_ENTER_TASK;
 
 		/* Drop OOM protection. */
@@ -1189,14 +1204,23 @@ static int ve_state_show(struct seq_file *sf, void *v)
 	struct ve_struct *ve = css_to_ve(css);
 
 	down_read(&ve->op_sem);
-	if (ve->is_running)
-		seq_puts(sf, "RUNNING");
-	else if (!cgroup_is_populated(css->cgroup) && !ve->ve_ns)
-		seq_puts(sf, "STOPPED");
-	else if (ve->ve_ns)
-		seq_puts(sf, "STOPPING");
-	else
-		seq_puts(sf, "STARTING");
+        switch (ve->state) {
+                case VE_STATE_STARTING:
+                        seq_puts(sf, "STARTING");
+                        break;
+                case VE_STATE_RUNNING:
+                        seq_puts(sf, "RUNNING");
+                        break;
+                case VE_STATE_STOPPING:
+                        seq_puts(sf, "STOPPING");
+                        break;
+                case VE_STATE_STOPPED:
+                case VE_STATE_DEAD:
+                        seq_puts(sf, "STOPPED");
+                        break;
+                default:
+                        seq_puts(sf, "UNKNOWN");
+        }
 	seq_putc(sf, '\n');
 	up_read(&ve->op_sem);
 
@@ -1239,7 +1263,7 @@ static int ve_id_write(struct cgroup_subsys_state *css, struct cftype *cft, u64
 	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
 
 	/* FIXME: check veid is uniqul */
-	if (ve->is_running || ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve_ns) {
 		if (ve->veid != val)
 			err = -EBUSY;
 	} else
@@ -1274,7 +1298,7 @@ static int ve_pseudosuper_write(struct cgroup_subsys_state *css, struct cftype *
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (val && (ve->is_running || ve->ve_ns)) {
+	if (val && (VE_IS_RUNNING(ve) || ve->ve_ns)) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
@@ -1310,7 +1334,7 @@ static int ve_features_write(struct cgroup_subsys_state *css, struct cftype *cft
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (ve->is_running || ve->ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
@@ -1333,7 +1357,7 @@ static int ve_netns_max_nr_write(struct cgroup_subsys_state *css, struct cftype
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (ve->is_running || ve->ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}
@@ -1684,7 +1708,7 @@ static int ve_aio_max_nr_write(struct cgroup_subsys_state *css,
 		return -EPERM;
 
 	down_write(&ve->op_sem);
-	if (ve->is_running || ve->ve_ns) {
+	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
 		up_write(&ve->op_sem);
 		return -EBUSY;
 	}


More information about the Devel mailing list