[Devel] [PATCH RH9] ve/cgroup: fix cgroup file active count vs cgroup_mutex deadlock
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jul 7 19:12:57 MSK 2022
Sasha, can you please review the patch after you finish the current activity?
Thank you!
--
Best regards,
Konstantin Khorenko,
Virtuozzo Linux Kernel Team
On 04.07.2022 13:35, Pavel Tikhomirov wrote:
> Any lock B taken in write callback of kernfs file, for instance ve.state
> cgroup file, gets a lockdep dependency from this kernfs file refcount
> increment A, same is valid for any locks C which in their turn are taken
> from under B in other code paths.
>
> Meaning that if we wait for a refcount A to become released under B (or
> C) while from other thread we hold the refcount A and try to lock B
> (which is held by other thread which tries to lock C) we would get ABBA
> (or ABBCCA) deadlock.
>
> In cgroup_rmdir() we wait for kernfs refcount release under cgroup_mutex
> so the cgroup_mutex can't be in position B or C.
>
> Though we have two places where the above rule is violated:
>
> First, cgroup_mutex is taken in ve_start_container() which is under the
> "ve.state" file kernfs refcount held, which puts the lock in position B.
>
> Second, cgroup_mutex is taken in cgroup_unmark_ve_roots() which is under
> ve->op_sem mutex, and ve->op_sem mutex in it's turn is taken from the
> ve_state_write() which is also under "ve.state" file kernfs refcount
> held, which puts the lock in position C.
>
> So we need to move cgroup_mutex both from under the refcount and from
> under ve->op_sem. We do this by splitting the actual cgroup file
> structure modification code (which requires cgroup_mutex) into separate
> functions which on container start called through task_work to remove
> dependencies and on container stop called before taking ve->op_sem lock.
>
> https://jira.sw.ru/browse/PSBM-140700
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
>
> Feature: cgroup: per-CT cgroup release_agent
> ---
> include/linux/cgroup.h | 2 +
> include/linux/ve.h | 1 +
> kernel/cgroup/cgroup.c | 157 ++++++++++++++++++++++++++++-------------
> kernel/ve/ve.c | 26 ++++---
> 4 files changed, 129 insertions(+), 57 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index d2586a8397f3..9b4d11c97aec 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -891,6 +891,8 @@ void cgroup1_release_agent(struct work_struct *work);
> #ifdef CONFIG_VE
> extern int cgroup_mark_ve_roots(struct ve_struct *ve);
> void cgroup_unmark_ve_roots(struct ve_struct *ve);
> +int ve_release_agent_setup(struct ve_struct *ve);
> +void ve_release_agent_teardown(struct ve_struct *ve);
> struct ve_struct *cgroup_ve_owner(struct cgroup *cgrp);
> #endif
>
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index d64ed3a7d3a4..18b0e30366b7 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -113,6 +113,7 @@ struct ve_struct {
> /* Should take rcu_read_lock and check ve->is_running before queue */
> struct workqueue_struct *wq;
> struct work_struct release_agent_work;
> + struct callback_head ve_release_agent_setup_head;
>
> /*
> * List of data, private for each root cgroup in
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 30f10afaecd4..46b4ac62e897 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -2094,43 +2094,101 @@ static inline bool ve_check_root_cgroups(struct css_set *cset)
> return false;
> }
>
> -static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> - struct cftype *cft, bool activate);
> -
> int cgroup_mark_ve_roots(struct ve_struct *ve)
> {
> struct cgrp_cset_link *link;
> struct css_set *cset;
> struct cgroup *cgrp;
> - struct cftype *cft;
> - int err = 0;
> -
> - cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
> - BUG_ON(!cft || cft->file_offset);
> -
> - mutex_lock(&cgroup_mutex);
> - spin_lock_irq(&css_set_lock);
>
> /*
> - * We can safely use ve->ve_ns without rcu_read_lock here, as we are
> - * always called _after_ ve_grab_context under ve->op_sem, so we've
> - * just set ve_ns and nobody else can modify it under us.
> + * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
> + * locking as we do it from container init at container start after
> + * ve_grab_context and only container init can tear those down.
> */
> - cset = rcu_dereference_protected(ve->ve_ns,
> - lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
> + cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
> + BUG_ON(!cset);
>
> + spin_lock_irq(&css_set_lock);
> if (ve_check_root_cgroups(cset)) {
> spin_unlock_irq(&css_set_lock);
> - mutex_unlock(&cgroup_mutex);
> return -EINVAL;
> }
>
> + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> + cgrp = link->cgrp;
> +
> + if (!is_virtualized_cgroup(cgrp))
> + continue;
> +
> + rcu_assign_pointer(cgrp->ve_owner, ve);
> + set_bit(CGRP_VE_ROOT, &cgrp->flags);
> + }
> + spin_unlock_irq(&css_set_lock);
> +
> + /* FIXME: implies cpu cgroup in cset, which is not always right */
> + link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
> + synchronize_rcu();
> + return 0;
> +}
> +
> +void cgroup_unmark_ve_roots(struct ve_struct *ve)
> +{
> + struct cgrp_cset_link *link;
> + struct css_set *cset;
> + struct cgroup *cgrp;
> +
> + /*
> + * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
> + * locking as we do it from container init at container start after
> + * ve_grab_context and only container init can tear those down.
> + */
> + cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
> + BUG_ON(!cset);
> +
> + spin_lock_irq(&css_set_lock);
> + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> + cgrp = link->cgrp;
> +
> + if (!is_virtualized_cgroup(cgrp))
> + continue;
> +
> + /* Set by cgroup_mark_ve_roots */
> + if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
> + continue;
> +
> + rcu_assign_pointer(cgrp->ve_owner, NULL);
> + clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> + }
> + spin_unlock_irq(&css_set_lock);
> + /* ve_owner == NULL will be visible */
> + synchronize_rcu();
> +}
> +
> +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> + struct cftype *cft, bool activate);
> +
> +void ve_release_agent_setup_work(struct callback_head *head)
> +{
> + struct cgrp_cset_link *link;
> + struct ve_struct *ve;
> + struct css_set *cset;
> + struct cgroup *cgrp;
> + struct cftype *cft;
> + int err;
> +
> + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
> + BUG_ON(!cft || cft->file_offset);
> +
> + ve = container_of(head, struct ve_struct,
> + ve_release_agent_setup_head);
> + cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
> +
> /*
> * We want to traverse the cset->cgrp_links list and
> * call cgroup_add_file() function which will call
> * memory allocation functions with GFP_KERNEL flag.
> - * We can't continue to hold css_set_lock, but it's
> - * safe to hold cgroup_mutex.
> + * We can't lock css_set_lock, instead it's safe to
> + * hold cgroup_mutex.
> *
> * It's safe to hold only cgroup_mutex and traverse
> * the cset list just because in all scenarious where
> @@ -2144,18 +2202,17 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
> * check which prevents any list modifications if someone is still
> * holding the refcnt to cset.
> * See copy_cgroup_ns() function it's taking refcnt's by get_css_set().
> - *
> */
> - spin_unlock_irq(&css_set_lock);
> -
> + mutex_lock(&cgroup_mutex);
> list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> cgrp = link->cgrp;
>
> if (!is_virtualized_cgroup(cgrp))
> continue;
>
> - rcu_assign_pointer(cgrp->ve_owner, ve);
> - set_bit(CGRP_VE_ROOT, &cgrp->flags);
> + /* Set by cgroup_mark_ve_roots */
> + if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
> + continue;
>
> /*
> * The cgroupns can hold reference on cgroups which are already
> @@ -2171,17 +2228,29 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
> }
> }
> }
> -
> - link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
> mutex_unlock(&cgroup_mutex);
> - synchronize_rcu();
> +}
>
> - if (err)
> - cgroup_unmark_ve_roots(ve);
> - return err;
> +int ve_release_agent_setup(struct ve_struct *ve)
> +{
> + /*
> + * Add release_agent files to ve root cgroups via task_work.
> + * This is to resolve deadlock between cgroup file refcount and
> + * cgroup_mutex, we can't take cgroup_mutex under the refcount
> + * or under locks, which are under the refcount (ve->op_sem).
> + *
> + * It's safe to use ve->ve_ns->cgroup_ns->root_cset inside task_work
> + * without extra locking as we do it from container init before exiting
> + * to userspace just after container start and only container init can
> + * tear those down.
> + */
> + init_task_work(&ve->ve_release_agent_setup_head,
> + ve_release_agent_setup_work);
> + return task_work_add(current, &ve->ve_release_agent_setup_head,
> + TWA_RESUME);
> }
>
> -void cgroup_unmark_ve_roots(struct ve_struct *ve)
> +void ve_release_agent_teardown(struct ve_struct *ve)
> {
> struct cgrp_cset_link *link;
> struct css_set *cset;
> @@ -2191,36 +2260,28 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
> cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
> BUG_ON(!cft || cft->file_offset);
>
> - mutex_lock(&cgroup_mutex);
> -
> /*
> - * We can safely use ve->ve_ns without rcu_read_lock here, as we are
> - * always called _before_ ve_drop_context under ve->op_sem, so we
> - * did not change ve_ns yet and nobody else can modify it under us.
> + * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
> + * locking as we do it from container init at container stop before
> + * ve_drop_context and only container init can tear those down.
> */
> - cset = rcu_dereference_protected(ve->ve_ns,
> - lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
> - BUG_ON(!cset);
> + cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
>
> - /*
> - * Traversing @cgrp_links without @css_set_lock is safe here for
> - * the same reasons as in cgroup_mark_ve_roots().
> - */
> + mutex_lock(&cgroup_mutex);
> list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> cgrp = link->cgrp;
>
> if (!is_virtualized_cgroup(cgrp))
> continue;
>
> + /* Set by cgroup_mark_ve_roots */
> + if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
> + continue;
> +
> if (!cgroup_is_dead(cgrp))
> cgroup_rm_file(cgrp, cft);
> - rcu_assign_pointer(cgrp->ve_owner, NULL);
> - clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> }
> -
> mutex_unlock(&cgroup_mutex);
> - /* ve_owner == NULL will be visible */
> - synchronize_rcu();
> }
>
> struct cgroup_subsys_state *css_ve_root1(struct cgroup_subsys_state *css)
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index d8c381c02e2c..9ee16b66ba4e 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -754,6 +754,10 @@ static int ve_start_container(struct ve_struct *ve)
> if (err)
> goto err_mark_ve;
>
> + err = ve_release_agent_setup(ve);
> + if (err)
> + goto err_release_agent_setup;
> +
> ve->is_running = 1;
>
> printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
> @@ -762,6 +766,8 @@ static int ve_start_container(struct ve_struct *ve)
>
> return 0;
>
> +err_release_agent_setup:
> + cgroup_unmark_ve_roots(ve);
> err_mark_ve:
> ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> err_iterate:
> @@ -823,26 +829,28 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
> struct ve_struct *ve = current->task_ve;
> struct nsproxy *ve_ns;
>
> - down_write(&ve->op_sem);
> - ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
> /*
> - * current->cgroups already switched to init_css_set in cgroup_exit(),
> - * but current->task_ve still points to our exec ve.
> + * Check that root container pidns dies and we are here to stop VE
> */
> - if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
> - goto unlock;
> -
> - cgroup_unmark_ve_roots(ve);
> + rcu_read_lock();
> + ve_ns = rcu_dereference(ve->ve_ns);
> + if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) {
> + rcu_read_unlock();
> + return;
> + }
> + rcu_read_unlock();
>
> /*
> * At this point all userspace tasks in container are dead.
> */
> + ve_release_agent_teardown(ve);
> + down_write(&ve->op_sem);
> + cgroup_unmark_ve_roots(ve);
> ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> ve_list_del(ve);
> ve_drop_context(ve);
> printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
> put_ve(ve); /* from ve_start_container() */
> -unlock:
> up_write(&ve->op_sem);
> }
>
More information about the Devel
mailing list