[Devel] [PATCH RH9] ve/cgroup: fix cgroup file active count vs cgroup_mutex deadlock

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jul 28 14:22:47 MSK 2022


A kind reminder. :)

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 07.07.2022 18:12, Konstantin Khorenko wrote:
> Sasha, can you please review the patch after you finish the current activity?
> 
> Thank you!
> 
> --
> Best regards,
> 
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
> 
> On 04.07.2022 13:35, Pavel Tikhomirov wrote:
>> Any lock B taken in the write callback of a kernfs file (for instance,
>> the ve.state cgroup file) gets a lockdep dependency on that kernfs
>> file's active refcount A; the same holds for any lock C which is in its
>> turn taken under B in other code paths.
>>
>> This means that if we wait under B (or C) for refcount A to be
>> released, while another thread holds refcount A and tries to take B
>> (which is held by yet another thread trying to take C), we get an ABBA
>> (or ABBCCA) deadlock.
>>
>> In cgroup_rmdir() we wait for the kernfs refcount to be released while
>> holding cgroup_mutex, so cgroup_mutex must never end up in position B
>> or C.
>>
>> However, there are two places where the above rule is violated:
>>
>> First, cgroup_mutex is taken in ve_start_container(), which runs with
>> the "ve.state" file's kernfs refcount held; this puts the lock in
>> position B.
>>
>> Second, cgroup_mutex is taken in cgroup_unmark_ve_roots(), which runs
>> under the ve->op_sem mutex, and ve->op_sem is in its turn taken in
>> ve_state_write(), also with the "ve.state" file's kernfs refcount held;
>> this puts the lock in position C.
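>>
>> Spelled out, the offending chains look roughly as follows (illustrative
>> only, not part of the patch; intermediate calls are omitted):
>>
>>   Path 1 (cgroup_mutex ends up in position B):
>>     write("ve.state")                  kernfs takes active ref A
>>       ve_state_write()
>>         ve_start_container()
>>           cgroup_mark_ve_roots()
>>             mutex_lock(&cgroup_mutex)  A -> cgroup_mutex
>>
>>   Path 2 (cgroup_mutex ends up in position C):
>>     write("ve.state")                  A held
>>       ve_state_write()
>>         down_write(&ve->op_sem)        A -> ve->op_sem
>>     ve_exit_ns()
>>       down_write(&ve->op_sem)
>>         cgroup_unmark_ve_roots()
>>           mutex_lock(&cgroup_mutex)    ve->op_sem -> cgroup_mutex
>>
>>   Other side of the cycle:
>>     cgroup_rmdir()
>>       mutex_lock(&cgroup_mutex)
>>       wait for active ref A to drain   cgroup_mutex -> A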
>>
>> So we need to move cgroup_mutex out from under both the refcount and
>> ve->op_sem. We do this by splitting the actual cgroup file structure
>> modification code (which requires cgroup_mutex) into separate
>> functions: on container start they are called through task_work to
>> break the dependency, and on container stop they are called before
>> taking the ve->op_sem lock.
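>>
>> A minimal sketch of the task_work pattern used on the start path,
>> condensed from ve_release_agent_setup() below (the my_* names are
>> hypothetical; this is not the patch code itself):
>>
>>   static void my_setup_work(struct callback_head *head)
>>   {
>>           struct ve_struct *ve = container_of(head, struct ve_struct,
>>                                               ve_release_agent_setup_head);
>>
>>           /*
>>            * Runs when the "ve.state" writer returns to userspace, i.e.
>>            * after kernfs has dropped the file's active ref (A) and
>>            * ve->op_sem has been released, so taking cgroup_mutex here
>>            * adds no A -> ... -> cgroup_mutex dependency.
>>            */
>>           mutex_lock(&cgroup_mutex);
>>           /* add the release_agent files to the ve root cgroups */
>>           mutex_unlock(&cgroup_mutex);
>>   }
>>
>>   static int my_setup(struct ve_struct *ve)
>>   {
>>           init_task_work(&ve->ve_release_agent_setup_head, my_setup_work);
>>           return task_work_add(current, &ve->ve_release_agent_setup_head,
>>                                TWA_RESUME);
>>   }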
>>
>> https://jira.sw.ru/browse/PSBM-140700
>> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
>>
>> Feature: cgroup: per-CT cgroup release_agent
>> ---
>>    include/linux/cgroup.h |   2 +
>>    include/linux/ve.h     |   1 +
>>    kernel/cgroup/cgroup.c | 157 ++++++++++++++++++++++++++++-------------
>>    kernel/ve/ve.c         |  26 ++++---
>>    4 files changed, 129 insertions(+), 57 deletions(-)
>>
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index d2586a8397f3..9b4d11c97aec 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -891,6 +891,8 @@ void cgroup1_release_agent(struct work_struct *work);
>>    #ifdef CONFIG_VE
>>    extern int cgroup_mark_ve_roots(struct ve_struct *ve);
>>    void cgroup_unmark_ve_roots(struct ve_struct *ve);
>> +int ve_release_agent_setup(struct ve_struct *ve);
>> +void ve_release_agent_teardown(struct ve_struct *ve);
>>    struct ve_struct *cgroup_ve_owner(struct cgroup *cgrp);
>>    #endif
>>    
>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>> index d64ed3a7d3a4..18b0e30366b7 100644
>> --- a/include/linux/ve.h
>> +++ b/include/linux/ve.h
>> @@ -113,6 +113,7 @@ struct ve_struct {
>>    	/* Should take rcu_read_lock and check ve->is_running before queue */
>>    	struct workqueue_struct	*wq;
>>    	struct work_struct	release_agent_work;
>> +	struct callback_head	ve_release_agent_setup_head;
>>    
>>    	/*
>>    	 * List of data, private for each root cgroup in
>> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
>> index 30f10afaecd4..46b4ac62e897 100644
>> --- a/kernel/cgroup/cgroup.c
>> +++ b/kernel/cgroup/cgroup.c
>> @@ -2094,43 +2094,101 @@ static inline bool ve_check_root_cgroups(struct css_set *cset)
>>    	return false;
>>    }
>>    
>> -static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
>> -			   struct cftype *cft, bool activate);
>> -
>>    int cgroup_mark_ve_roots(struct ve_struct *ve)
>>    {
>>    	struct cgrp_cset_link *link;
>>    	struct css_set *cset;
>>    	struct cgroup *cgrp;
>> -	struct cftype *cft;
>> -	int err = 0;
>> -
>> -	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
>> -	BUG_ON(!cft || cft->file_offset);
>> -
>> -	mutex_lock(&cgroup_mutex);
>> -	spin_lock_irq(&css_set_lock);
>>    
>>    	/*
>> -	 * We can safely use ve->ve_ns without rcu_read_lock here, as we are
>> -	 * always called _after_ ve_grab_context under ve->op_sem, so we've
>> -	 * just set ve_ns and nobody else can modify it under us.
>> +	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
>> +	 * locking as we do it from container init at container start after
>> +	 * ve_grab_context and only container init can tear those down.
>>    	 */
>> -	cset = rcu_dereference_protected(ve->ve_ns,
>> -			lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
>> +	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
>> +	BUG_ON(!cset);
>>    
>> +	spin_lock_irq(&css_set_lock);
>>    	if (ve_check_root_cgroups(cset)) {
>>    		spin_unlock_irq(&css_set_lock);
>> -		mutex_unlock(&cgroup_mutex);
>>    		return -EINVAL;
>>    	}
>>    
>> +	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
>> +		cgrp = link->cgrp;
>> +
>> +		if (!is_virtualized_cgroup(cgrp))
>> +			continue;
>> +
>> +		rcu_assign_pointer(cgrp->ve_owner, ve);
>> +		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>> +	}
>> +	spin_unlock_irq(&css_set_lock);
>> +
>> +	/* FIXME: implies cpu cgroup in cset, which is not always right */
>> +	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
>> +	synchronize_rcu();
>> +	return 0;
>> +}
>> +
>> +void cgroup_unmark_ve_roots(struct ve_struct *ve)
>> +{
>> +	struct cgrp_cset_link *link;
>> +	struct css_set *cset;
>> +	struct cgroup *cgrp;
>> +
>> +	/*
>> +	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
>> +	 * locking as we do it from container init at container start after
>> +	 * ve_grab_context and only container init can tear those down.
>> +	 */
>> +	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
>> +	BUG_ON(!cset);
>> +
>> +	spin_lock_irq(&css_set_lock);
>> +	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
>> +		cgrp = link->cgrp;
>> +
>> +		if (!is_virtualized_cgroup(cgrp))
>> +			continue;
>> +
>> +		/* Set by cgroup_mark_ve_roots */
>> +		if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
>> +			continue;
>> +
>> +		rcu_assign_pointer(cgrp->ve_owner, NULL);
>> +		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>> +	}
>> +	spin_unlock_irq(&css_set_lock);
>> +	/* ve_owner == NULL will be visible */
>> +	synchronize_rcu();
>> +}
>> +
>> +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
>> +			   struct cftype *cft, bool activate);
>> +
>> +void ve_release_agent_setup_work(struct callback_head *head)
>> +{
>> +	struct cgrp_cset_link *link;
>> +	struct ve_struct *ve;
>> +	struct css_set *cset;
>> +	struct cgroup *cgrp;
>> +	struct cftype *cft;
>> +	int err;
>> +
>> +	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
>> +	BUG_ON(!cft || cft->file_offset);
>> +
>> +	ve = container_of(head, struct ve_struct,
>> +			  ve_release_agent_setup_head);
>> +	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
>> +
>>    	/*
>>    	 * We want to traverse the cset->cgrp_links list and
>>    	 * call cgroup_add_file() function which will call
>>    	 * memory allocation functions with GFP_KERNEL flag.
>> -	 * We can't continue to hold css_set_lock, but it's
>> -	 * safe to hold cgroup_mutex.
>> +	 * We can't lock css_set_lock, instead it's safe to
>> +	 * hold cgroup_mutex.
>>    	 *
>>    	 * It's safe to hold only cgroup_mutex and traverse
>>    	 * the cset list just because in all scenarious where
>> @@ -2144,18 +2202,17 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
>>    	 * check which prevents any list modifications if someone is still
>>    	 * holding the refcnt to cset.
>>    	 * See copy_cgroup_ns() function it's taking refcnt's by get_css_set().
>> -	 *
>>    	 */
>> -	spin_unlock_irq(&css_set_lock);
>> -
>> +	mutex_lock(&cgroup_mutex);
>>    	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
>>    		cgrp = link->cgrp;
>>    
>>    		if (!is_virtualized_cgroup(cgrp))
>>    			continue;
>>    
>> -		rcu_assign_pointer(cgrp->ve_owner, ve);
>> -		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>> +		/* Set by cgroup_mark_ve_roots */
>> +		if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
>> +			continue;
>>    
>>    		/*
>>    		 * The cgroupns can hold reference on cgroups which are already
>> @@ -2171,17 +2228,29 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
>>    			}
>>    		}
>>    	}
>> -
>> -	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
>>    	mutex_unlock(&cgroup_mutex);
>> -	synchronize_rcu();
>> +}
>>    
>> -	if (err)
>> -		cgroup_unmark_ve_roots(ve);
>> -	return err;
>> +int ve_release_agent_setup(struct ve_struct *ve)
>> +{
>> +	/*
>> +	 * Add release_agent files to ve root cgroups via task_work.
>> +	 * This is to resolve deadlock between cgroup file refcount and
>> +	 * cgroup_mutex, we can't take cgroup_mutex under the refcount
>> +	 * or under locks, which are under the refcount (ve->op_sem).
>> +	 *
>> +	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset inside task_work
>> +	 * without extra locking as we do it from container init before exiting
>> +	 * to userspace just after container start and only container init can
>> +	 * tear those down.
>> +	 */
>> +	init_task_work(&ve->ve_release_agent_setup_head,
>> +		       ve_release_agent_setup_work);
>> +	return task_work_add(current, &ve->ve_release_agent_setup_head,
>> +			     TWA_RESUME);
>>    }
>>    
>> -void cgroup_unmark_ve_roots(struct ve_struct *ve)
>> +void ve_release_agent_teardown(struct ve_struct *ve)
>>    {
>>    	struct cgrp_cset_link *link;
>>    	struct css_set *cset;
>> @@ -2191,36 +2260,28 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
>>    	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
>>    	BUG_ON(!cft || cft->file_offset);
>>    
>> -	mutex_lock(&cgroup_mutex);
>> -
>>    	/*
>> -	 * We can safely use ve->ve_ns without rcu_read_lock here, as we are
>> -	 * always called _before_ ve_drop_context under ve->op_sem, so we
>> -	 * did not change ve_ns yet and nobody else can modify it under us.
>> +	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
>> +	 * locking as we do it from container init at container stop before
>> +	 * ve_drop_context and only container init can tear those down.
>>    	 */
>> -	cset = rcu_dereference_protected(ve->ve_ns,
>> -			lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
>> -	BUG_ON(!cset);
>> +	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
>>    
>> -	/*
>> -	 * Traversing @cgrp_links without @css_set_lock is safe here for
>> -	 * the same reasons as in cgroup_mark_ve_roots().
>> -	 */
>> +	mutex_lock(&cgroup_mutex);
>>    	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
>>    		cgrp = link->cgrp;
>>    
>>    		if (!is_virtualized_cgroup(cgrp))
>>    			continue;
>>    
>> +		/* Set by cgroup_mark_ve_roots */
>> +		if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
>> +			continue;
>> +
>>    		if (!cgroup_is_dead(cgrp))
>>    			cgroup_rm_file(cgrp, cft);
>> -		rcu_assign_pointer(cgrp->ve_owner, NULL);
>> -		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>>    	}
>> -
>>    	mutex_unlock(&cgroup_mutex);
>> -	/* ve_owner == NULL will be visible */
>> -	synchronize_rcu();
>>    }
>>    
>>    struct cgroup_subsys_state *css_ve_root1(struct cgroup_subsys_state *css)
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index d8c381c02e2c..9ee16b66ba4e 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -754,6 +754,10 @@ static int ve_start_container(struct ve_struct *ve)
>>    	if (err)
>>    		goto err_mark_ve;
>>    
>> +	err = ve_release_agent_setup(ve);
>> +	if (err)
>> +		goto err_release_agent_setup;
>> +
>>    	ve->is_running = 1;
>>    
>>    	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
>> @@ -762,6 +766,8 @@ static int ve_start_container(struct ve_struct *ve)
>>    
>>    	return 0;
>>    
>> +err_release_agent_setup:
>> +	cgroup_unmark_ve_roots(ve);
>>    err_mark_ve:
>>    	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>>    err_iterate:
>> @@ -823,26 +829,28 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>>    	struct ve_struct *ve = current->task_ve;
>>    	struct nsproxy *ve_ns;
>>    
>> -	down_write(&ve->op_sem);
>> -	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
>>    	/*
>> -	 * current->cgroups already switched to init_css_set in cgroup_exit(),
>> -	 * but current->task_ve still points to our exec ve.
>> +	 * Check that root container pidns dies and we are here to stop VE
>>    	 */
>> -	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
>> -		goto unlock;
>> -
>> -	cgroup_unmark_ve_roots(ve);
>> +	rcu_read_lock();
>> +	ve_ns = rcu_dereference(ve->ve_ns);
>> +	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) {
>> +		rcu_read_unlock();
>> +		return;
>> +	}
>> +	rcu_read_unlock();
>>    
>>    	/*
>>    	 * At this point all userspace tasks in container are dead.
>>    	 */
>> +	ve_release_agent_teardown(ve);
>> +	down_write(&ve->op_sem);
>> +	cgroup_unmark_ve_roots(ve);
>>    	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>>    	ve_list_del(ve);
>>    	ve_drop_context(ve);
>>    	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
>>    	put_ve(ve); /* from ve_start_container() */
>> -unlock:
>>    	up_write(&ve->op_sem);
>>    }
>>    

