[Devel] [PATCH v2 vz9 2/2] ve/cgroups: drop lock when stopping workqueue to avoid dead lock
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Fri Feb 10 12:58:56 MSK 2023
Though I like the idea of explicit states, you must be really careful
when you rework them as vzctl uses those states to wait for container to
start or to stop and changing anything may break vzctl.
On 03.02.2023 04:26, Alexander Atanasov wrote:
> Rework is_running variable into state to avoid guessing the state,
> be able to do more strict checks in what state the ve is in
> and verify state transitions.
>
> This allows to reduce the size of critical sections that needs to
> take lock. All entry points check for good state before proceeding.
>
> The deadlock is between ve->op_sem and kernfs_rwsem (lockdep tracked
> via kn->active). It is ab-ba deadlock. When using the sysfs
> kernfs_rwsem is taken first, then ve code takes ve->op_sem.
> When ve code is called from inside the kernel ve->op_sem is taken
> first but it can reach back into kernfs code and deadlock.
> reboot and release_agent teardown is the reported case.
>
> https://jira.sw.ru/browse/PSBM-144580
> Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
> ---
> include/linux/ve.h | 26 ++++++++++++++++++-
> kernel/ve/ve.c | 65 +++++++++++++++++++++++++++++++---------------
> 2 files changed, 69 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index a023d9a8d14a..8e173628b532 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -27,6 +27,14 @@ struct user_namespace;
> struct cn_private;
> struct vfsmount;
>
> +#define VE_STATE_STARTING 0
> +#define VE_STATE_RUNNING 1
> +#define VE_STATE_STOPPING 2
> +#define VE_STATE_STOPPED 3
> +#define VE_STATE_DEAD 4
> +
> +#define VE_IS_RUNNING(ve) ((ve)->state == VE_STATE_RUNNING)
> +
> struct ve_struct {
> struct cgroup_subsys_state css;
>
> @@ -35,7 +43,7 @@ struct ve_struct {
> struct list_head ve_list;
>
> envid_t veid;
> - int is_running;
> + int state;
> u8 is_pseudosuper:1;
>
> struct rw_semaphore op_sem;
> @@ -144,6 +152,18 @@ extern int nr_ve;
> extern unsigned int sysctl_ve_mount_nr;
>
> #ifdef CONFIG_VE
> +static inline void ve_set_state(struct ve_struct *ve, int new_state)
> +{
> + WARN_ONCE(ve->state == VE_STATE_DEAD, "VE is already DEAD");
> + WARN_ONCE(new_state == VE_STATE_RUNNING && ve->state != VE_STATE_STARTING,
> + "Invalid transition to running from %d\n", ve->state);
> + WARN_ONCE(new_state == VE_STATE_STOPPING && ve->state != VE_STATE_RUNNING,
> + "Invalid transition to stopping from %d\n", ve->state);
> + WARN_ONCE(new_state == VE_STATE_STOPPED && ve->state != VE_STATE_STOPPING,
> + "Invalid transition to stooped from %d\n", ve->state);
> + ve->state = new_state;
> +}
> +
> void ve_add_to_release_list(struct cgroup *cgrp);
> void ve_rm_from_release_list(struct cgroup *cgrp);
>
> @@ -229,6 +249,10 @@ extern bool is_ve_init_net(const struct net *net);
> static inline void ve_stop_ns(struct pid_namespace *ns) { }
> static inline void ve_exit_ns(struct pid_namespace *ns) { }
>
> +
> +static inline void ve_set_state(struct ve_struct *ve, int new_state) {}
> +
> +
Don't add excess newlines.
@@ -249,9 +252,9 @@ extern bool is_ve_init_net(const struct net *net);
static inline void ve_stop_ns(struct pid_namespace *ns) { }
static inline void ve_exit_ns(struct pid_namespace *ns) { }
-
-static inline void ve_set_state(struct ve_struct *ve, int new_state) {}
-
+static inline void ve_set_state(struct ve_struct *ve, int new_state)
+{
+}
> #define ve_feature_set(ve, f) { true; }
>
> static inline bool current_user_ns_initial(void)
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 80865161670e..8afef0e631d5 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -70,7 +70,7 @@ struct ve_struct ve0 = {
>
> RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
>
> - .is_running = 1,
> + .state = VE_STATE_RUNNING,
> .is_pseudosuper = 1,
>
> .init_cred = &init_cred,
> @@ -662,7 +662,7 @@ void ve_add_to_release_list(struct cgroup *cgrp)
> if (!ve)
> ve = &ve0;
>
> - if (!ve->is_running) {
> + if (!VE_IS_RUNNING(ve)) {
> rcu_read_unlock();
> return;
> }
> @@ -718,7 +718,7 @@ static int ve_start_container(struct ve_struct *ve)
>
> ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
>
> - if (ve->is_running || ve_ns)
> + if (ve->state != VE_STATE_STARTING || ve_ns)
> return -EBUSY;
>
> if (tsk->task_ve != ve || !is_child_reaper(task_pid(tsk)))
> @@ -766,7 +766,7 @@ static int ve_start_container(struct ve_struct *ve)
> if (err)
> goto err_release_agent_setup;
>
> - ve->is_running = 1;
> + ve_set_state(ve, VE_STATE_RUNNING);
>
> printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
>
> @@ -805,19 +805,31 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
> if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
> goto unlock;
> /*
> - * Here the VE changes its state into "not running".
> + * Here the VE changes its state into stopping.
> * op_sem works as barrier for vzctl ioctls.
> * ve_mutex works as barrier for ve_can_attach().
> */
> - ve->is_running = 0;
> + ve_set_state(ve, VE_STATE_STOPPING);
> synchronize_rcu();
>
> + /*
> + * Drop the lock - all entry points must check state before proceeding.
> + * The goal is to avoid a deadlock between sysfs access from userspace and
> + * removing sysfs entries from inside the kernel - i.e. order of ve->op_sem
> + * and kernfs_rwsem (lockdep checked via kn->active)
> + */
> +
> + up_write(&ve->op_sem);
> +
> /*
> * release_agent works on top of umh_worker, so we must make sure, that
> * ve workqueue is stopped first.
> */
> +
> ve_workqueue_stop(ve);
>
> + down_write(&ve->op_sem);
> +
> /*
> * Neither it can be in pseudosuper state
> * anymore, setup it again if needed.
> @@ -857,6 +869,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
> ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> ve_list_del(ve);
> ve_drop_context(ve);
> + ve_set_state(ve, VE_STATE_STOPPED);
> up_write(&ve->op_sem);
>
> printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
> @@ -957,6 +970,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
> if (!ve->sched_lat_ve.cur)
> goto err_lat;
>
> + ve->state = VE_STATE_STARTING;
That breaks original behaviour of ve_state_show which in its turn would
break vzctl. I assume we need to set STOPPED state here and rework other
logic accordingly. We must also set STARTING state at the begining of
ve_start_container.
Also don't forget about error paths, If you set some state you should
revert change on error paths with good care, e.g. ve_drop_context were
effectively returning from STARTING to STOPPED in case of fail in
ve_start_container.
> ve->features = VE_FEATURES_DEF;
>
> INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
> @@ -1096,6 +1110,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
> kfree(ve->binfmt_misc);
> #endif
> free_percpu(ve->sched_lat_ve.cur);
> + ve_set_state(ve, VE_STATE_DEAD);
> kmem_cache_free(ve_cachep, ve);
> }
>
> @@ -1132,7 +1147,7 @@ static int ve_is_attachable(struct cgroup_taskset *tset)
> task = cgroup_taskset_first(tset, &css);
> ve = css_to_ve(css);
>
> - if (ve->is_running)
> + if (VE_IS_RUNNING(ve))
> return 0;
>
> if (!ve->veid) {
> @@ -1179,7 +1194,7 @@ static void ve_attach(struct cgroup_taskset *tset)
> struct ve_struct *ve = css_to_ve(css);
>
> /* this probihibts ptracing of task entered to VE from host system */
> - if (ve->is_running && task->mm)
> + if (VE_IS_RUNNING(ve) && task->mm)
> task->mm->vps_dumpable = VD_VE_ENTER_TASK;
>
> /* Drop OOM protection. */
> @@ -1204,14 +1219,18 @@ static int ve_state_show(struct seq_file *sf, void *v)
> struct ve_struct *ve = css_to_ve(css);
>
> down_read(&ve->op_sem);
> - if (ve->is_running)
> + if (ve->state == VE_STATE_STARTING)
> + seq_puts(sf, "STARTING");
> + else if (ve->state == VE_STATE_RUNNING)
> seq_puts(sf, "RUNNING");
> - else if (!cgroup_is_populated(css->cgroup) && !ve->ve_ns)
> - seq_puts(sf, "STOPPED");
> - else if (ve->ve_ns)
> + else if (ve->state == VE_STATE_STOPPING)
> seq_puts(sf, "STOPPING");
> + else if (ve->state == VE_STATE_STOPPED)
> + seq_puts(sf, "STOPPED");
> + else if (ve->state == VE_STATE_DEAD)
> + seq_puts(sf, "DEAD");
> else
> - seq_puts(sf, "STARTING");
> + seq_puts(sf, "UNKNOWN");
> seq_putc(sf, '\n');
> up_read(&ve->op_sem);
>
> @@ -1227,9 +1246,13 @@ static ssize_t ve_state_write(struct kernfs_open_file *of, char *buf,
> int ret = -EINVAL;
>
> if (!strcmp(buf, "START")) {
> - down_write(&ve->op_sem);
> - ret = ve_start_container(ve);
> - up_write(&ve->op_sem);
> + if (ve->state != VE_STATE_STARTING)
> + ret = -EBUSY;
> + else {
> + down_write(&ve->op_sem);
> + ret = ve_start_container(ve);
> + up_write(&ve->op_sem);
> + }
> }
> return ret ? ret : nbytes;
> }
> @@ -1254,7 +1277,7 @@ static int ve_id_write(struct cgroup_subsys_state *css, struct cftype *cft, u64
> ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
>
> /* FIXME: check veid is uniqul */
> - if (ve->is_running || ve_ns) {
> + if (VE_IS_RUNNING(ve) || ve_ns) {
> if (ve->veid != val)
> err = -EBUSY;
> } else
> @@ -1289,7 +1312,7 @@ static int ve_pseudosuper_write(struct cgroup_subsys_state *css, struct cftype *
> return -EPERM;
>
> down_write(&ve->op_sem);
> - if (val && (ve->is_running || ve->ve_ns)) {
> + if (val && (VE_IS_RUNNING(ve) || ve->ve_ns)) {
> up_write(&ve->op_sem);
> return -EBUSY;
> }
> @@ -1325,7 +1348,7 @@ static int ve_features_write(struct cgroup_subsys_state *css, struct cftype *cft
> return -EPERM;
>
> down_write(&ve->op_sem);
> - if (ve->is_running || ve->ve_ns) {
> + if (VE_IS_RUNNING(ve) || ve->ve_ns) {
> up_write(&ve->op_sem);
> return -EBUSY;
> }
> @@ -1348,7 +1371,7 @@ static int ve_netns_max_nr_write(struct cgroup_subsys_state *css, struct cftype
> return -EPERM;
>
> down_write(&ve->op_sem);
> - if (ve->is_running || ve->ve_ns) {
> + if (VE_IS_RUNNING(ve) || ve->ve_ns) {
> up_write(&ve->op_sem);
> return -EBUSY;
> }
> @@ -1699,7 +1722,7 @@ static int ve_aio_max_nr_write(struct cgroup_subsys_state *css,
> return -EPERM;
>
> down_write(&ve->op_sem);
> - if (ve->is_running || ve->ve_ns) {
> + if (VE_IS_RUNNING(ve) || ve->ve_ns) {
> up_write(&ve->op_sem);
> return -EBUSY;
> }
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
More information about the Devel
mailing list