[Devel] [PATCH v2 vz9 2/2] ve/cgroups: drop lock when stopping workqueue to avoid dead lock

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Fri Feb 10 12:58:56 MSK 2023


Though I like the idea of explicit states, you must be really careful 
when you rework them as vzctl uses those states to wait for container to 
start or to stop and changing anything may break vzctl.

On 03.02.2023 04:26, Alexander Atanasov wrote:
> Rework is_running variable into state to avoid guessing the state,
> be able to do more strict checks in what state the ve is in
> and verify state transitions.
> 
> This allows to reduce the size of critical sections that needs to
> take lock. All entry points check for good state before proceeding.
> 
> The deadlock is between ve->op_sem and kernfs_rwsem  (lockdep tracked
> via kn->active). It is ab-ba deadlock. When using the sysfs
> kernfs_rwsem is taken first, then ve code takes ve->op_sem.
> When ve code is called from inside the kernel ve->op_sem is taken
> first but it can reach back into kernfs code and deadlock.
> reboot and release_agent teardown is the reported case.
> 
> https://jira.sw.ru/browse/PSBM-144580
> Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
> ---
>   include/linux/ve.h | 26 ++++++++++++++++++-
>   kernel/ve/ve.c     | 65 +++++++++++++++++++++++++++++++---------------
>   2 files changed, 69 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index a023d9a8d14a..8e173628b532 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -27,6 +27,14 @@ struct user_namespace;
>   struct cn_private;
>   struct vfsmount;
>   
> +#define VE_STATE_STARTING 	0
> +#define VE_STATE_RUNNING 	1
> +#define VE_STATE_STOPPING 	2
> +#define VE_STATE_STOPPED 	3
> +#define VE_STATE_DEAD		4
> +
> +#define VE_IS_RUNNING(ve) ((ve)->state == VE_STATE_RUNNING)
> +
>   struct ve_struct {
>   	struct cgroup_subsys_state	css;
>   
> @@ -35,7 +43,7 @@ struct ve_struct {
>   	struct list_head	ve_list;
>   
>   	envid_t			veid;
> -	int			is_running;
> +	int			state;
>   	u8			is_pseudosuper:1;
>   
>   	struct rw_semaphore	op_sem;
> @@ -144,6 +152,18 @@ extern int nr_ve;
>   extern unsigned int sysctl_ve_mount_nr;
>   
>   #ifdef CONFIG_VE
> +static inline void ve_set_state(struct ve_struct *ve, int new_state)
> +{
> +	WARN_ONCE(ve->state == VE_STATE_DEAD, "VE is already DEAD");
> +	WARN_ONCE(new_state == VE_STATE_RUNNING && ve->state != VE_STATE_STARTING,
> +		"Invalid transition to running from %d\n", ve->state);
> +	WARN_ONCE(new_state == VE_STATE_STOPPING && ve->state != VE_STATE_RUNNING,
> +		"Invalid transition to stopping from %d\n", ve->state);
> +	WARN_ONCE(new_state == VE_STATE_STOPPED && ve->state != VE_STATE_STOPPING,
> +		"Invalid transition to stooped from %d\n", ve->state);
> +	ve->state = new_state;
> +}
> +
>   void ve_add_to_release_list(struct cgroup *cgrp);
>   void ve_rm_from_release_list(struct cgroup *cgrp);
>   
> @@ -229,6 +249,10 @@ extern bool is_ve_init_net(const struct net *net);
>   static inline void ve_stop_ns(struct pid_namespace *ns) { }
>   static inline void ve_exit_ns(struct pid_namespace *ns) { }
>   
> +
> +static inline void ve_set_state(struct ve_struct *ve, int new_state) {}
> +
> +

Don't add excess newlines.

@@ -249,9 +252,9 @@ extern bool is_ve_init_net(const struct net *net);
  static inline void ve_stop_ns(struct pid_namespace *ns) { }
  static inline void ve_exit_ns(struct pid_namespace *ns) { }

-
-static inline void ve_set_state(struct ve_struct *ve, int new_state) {}
-
+static inline void ve_set_state(struct ve_struct *ve, int new_state)
+{
+}

>   #define ve_feature_set(ve, f)		{ true; }
>   
>   static inline bool current_user_ns_initial(void)
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 80865161670e..8afef0e631d5 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -70,7 +70,7 @@ struct ve_struct ve0 = {
>   
>   	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
>   
> -	.is_running		= 1,
> +	.state			= VE_STATE_RUNNING,
>   	.is_pseudosuper		= 1,
>   
>   	.init_cred		= &init_cred,
> @@ -662,7 +662,7 @@ void ve_add_to_release_list(struct cgroup *cgrp)
>   	if (!ve)
>   		ve = &ve0;
>   
> -	if (!ve->is_running) {
> +	if (!VE_IS_RUNNING(ve)) {
>   		rcu_read_unlock();
>   		return;
>   	}
> @@ -718,7 +718,7 @@ static int ve_start_container(struct ve_struct *ve)
>   
>   	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
>   
> -	if (ve->is_running || ve_ns)
> +	if (ve->state != VE_STATE_STARTING || ve_ns)
>   		return -EBUSY;
>   
>   	if (tsk->task_ve != ve || !is_child_reaper(task_pid(tsk)))
> @@ -766,7 +766,7 @@ static int ve_start_container(struct ve_struct *ve)
>   	if (err)
>   		goto err_release_agent_setup;
>   
> -	ve->is_running = 1;
> +	ve_set_state(ve, VE_STATE_RUNNING);
>   
>   	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
>   
> @@ -805,19 +805,31 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
>   	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
>   		goto unlock;
>   	/*
> -	 * Here the VE changes its state into "not running".
> +	 * Here the VE changes its state into stopping.
>   	 * op_sem works as barrier for vzctl ioctls.
>   	 * ve_mutex works as barrier for ve_can_attach().
>   	 */
> -	ve->is_running = 0;
> +	ve_set_state(ve, VE_STATE_STOPPING);
>   	synchronize_rcu();
>   
> +	/*
> +	 * Drop the lock - all entry points must check state before proceeding.
> +	 * The goal is to avoid a deadlock between sysfs access from userspace and
> +	 * removing sysfs entries from inside the kernel - i.e. order of ve->op_sem
> +	 * and kernfs_rwsem (lockdep checked via kn->active)
> +	 */
> +
> +	up_write(&ve->op_sem);
> +
>   	/*
>   	 * release_agent works on top of umh_worker, so we must make sure, that
>   	 * ve workqueue is stopped first.
>   	 */
> +
>   	ve_workqueue_stop(ve);
>   
> +	down_write(&ve->op_sem);
> +
>   	/*
>   	 * Neither it can be in pseudosuper state
>   	 * anymore, setup it again if needed.
> @@ -857,6 +869,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>   	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>   	ve_list_del(ve);
>   	ve_drop_context(ve);
> +	ve_set_state(ve, VE_STATE_STOPPED);
>   	up_write(&ve->op_sem);
>   
>   	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
> @@ -957,6 +970,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
>   	if (!ve->sched_lat_ve.cur)
>   		goto err_lat;
>   
> +	ve->state = VE_STATE_STARTING;

That breaks original behaviour of ve_state_show which in its turn would 
break vzctl. I assume we need to set STOPPED state here and rework other 
logic accordingly. We must also set STARTING state at the begining of 
ve_start_container.

Also don't forget about error paths, If you set some state you should 
revert change on error paths with good care, e.g. ve_drop_context were 
effectively returning from STARTING to STOPPED in case of fail in 
ve_start_container.

>   	ve->features = VE_FEATURES_DEF;
>   
>   	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
> @@ -1096,6 +1110,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
>   	kfree(ve->binfmt_misc);
>   #endif
>   	free_percpu(ve->sched_lat_ve.cur);
> +	ve_set_state(ve, VE_STATE_DEAD);
>   	kmem_cache_free(ve_cachep, ve);
>   }
>   
> @@ -1132,7 +1147,7 @@ static int ve_is_attachable(struct cgroup_taskset *tset)
>   	task = cgroup_taskset_first(tset, &css);
>   	ve = css_to_ve(css);
>   
> -	if (ve->is_running)
> +	if (VE_IS_RUNNING(ve))
>   		return 0;
>   
>   	if (!ve->veid) {
> @@ -1179,7 +1194,7 @@ static void ve_attach(struct cgroup_taskset *tset)
>   		struct ve_struct *ve = css_to_ve(css);
>   
>   		/* this probihibts ptracing of task entered to VE from host system */
> -		if (ve->is_running && task->mm)
> +		if (VE_IS_RUNNING(ve) && task->mm)
>   			task->mm->vps_dumpable = VD_VE_ENTER_TASK;
>   
>   		/* Drop OOM protection. */
> @@ -1204,14 +1219,18 @@ static int ve_state_show(struct seq_file *sf, void *v)
>   	struct ve_struct *ve = css_to_ve(css);
>   
>   	down_read(&ve->op_sem);
> -	if (ve->is_running)
> +	if (ve->state == VE_STATE_STARTING)
> +		seq_puts(sf, "STARTING");
> +	else if (ve->state == VE_STATE_RUNNING)
>   		seq_puts(sf, "RUNNING");
> -	else if (!cgroup_is_populated(css->cgroup) && !ve->ve_ns)
> -		seq_puts(sf, "STOPPED");
> -	else if (ve->ve_ns)
> +	else if (ve->state == VE_STATE_STOPPING)
>   		seq_puts(sf, "STOPPING");
> +	else if (ve->state == VE_STATE_STOPPED)
> +		seq_puts(sf, "STOPPED");
> +	else if (ve->state == VE_STATE_DEAD)
> +		seq_puts(sf, "DEAD");
>   	else
> -		seq_puts(sf, "STARTING");
> +		seq_puts(sf, "UNKNOWN");
>   	seq_putc(sf, '\n');
>   	up_read(&ve->op_sem);
>   
> @@ -1227,9 +1246,13 @@ static ssize_t ve_state_write(struct kernfs_open_file *of, char *buf,
>   	int ret = -EINVAL;
>   
>   	if (!strcmp(buf, "START")) {
> -		down_write(&ve->op_sem);
> -		ret = ve_start_container(ve);
> -		up_write(&ve->op_sem);
> +		if (ve->state != VE_STATE_STARTING)
> +			ret = -EBUSY;
> +		else {
> +			down_write(&ve->op_sem);
> +			ret = ve_start_container(ve);
> +			up_write(&ve->op_sem);
> +		}
>   	}
>   	return ret ? ret : nbytes;
>   }
> @@ -1254,7 +1277,7 @@ static int ve_id_write(struct cgroup_subsys_state *css, struct cftype *cft, u64
>   	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
>   
>   	/* FIXME: check veid is uniqul */
> -	if (ve->is_running || ve_ns) {
> +	if (VE_IS_RUNNING(ve) || ve_ns) {
>   		if (ve->veid != val)
>   			err = -EBUSY;
>   	} else
> @@ -1289,7 +1312,7 @@ static int ve_pseudosuper_write(struct cgroup_subsys_state *css, struct cftype *
>   		return -EPERM;
>   
>   	down_write(&ve->op_sem);
> -	if (val && (ve->is_running || ve->ve_ns)) {
> +	if (val && (VE_IS_RUNNING(ve) || ve->ve_ns)) {
>   		up_write(&ve->op_sem);
>   		return -EBUSY;
>   	}
> @@ -1325,7 +1348,7 @@ static int ve_features_write(struct cgroup_subsys_state *css, struct cftype *cft
>   		return -EPERM;
>   
>   	down_write(&ve->op_sem);
> -	if (ve->is_running || ve->ve_ns) {
> +	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
>   		up_write(&ve->op_sem);
>   		return -EBUSY;
>   	}
> @@ -1348,7 +1371,7 @@ static int ve_netns_max_nr_write(struct cgroup_subsys_state *css, struct cftype
>   		return -EPERM;
>   
>   	down_write(&ve->op_sem);
> -	if (ve->is_running || ve->ve_ns) {
> +	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
>   		up_write(&ve->op_sem);
>   		return -EBUSY;
>   	}
> @@ -1699,7 +1722,7 @@ static int ve_aio_max_nr_write(struct cgroup_subsys_state *css,
>   		return -EPERM;
>   
>   	down_write(&ve->op_sem);
> -	if (ve->is_running || ve->ve_ns) {
> +	if (VE_IS_RUNNING(ve) || ve->ve_ns) {
>   		up_write(&ve->op_sem);
>   		return -EBUSY;
>   	}

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.


More information about the Devel mailing list