[Devel] [PATCH RHEL7 v21 12/14] ve/cgroup: added release_agent to each container root cgroup.
Valeriy Vdovin
valeriy.vdovin at virtuozzo.com
Fri Jul 31 12:38:30 MSK 2020
On 31.07.2020 11:24, Kirill Tkhai wrote:
> On 28.07.2020 20:53, Valeriy Vdovin wrote:
>> Each container will now have access to its own cgroup release_agent
>> file.
>> Creation:
>> Normally all cgroup files are created during a call to cgroup_create
>> by the cgroup_populate_dir function. It creates (or skips) each cgroup
>> file once, and the created files immediately become visible to
>> userspace as filesystem objects.
>> Due to the specifics of the container creation process, it is not
>> possible to use the same code for 'release_agent' file creation. For a
>> VE to start operating, first a list of ordinary cgroups is created for
>> each subsystem, then the set of newly created cgroups is converted to
>> "virtual roots", so at the time cgroup_create is executed it is not
>> yet known whether the "release_agent" file should be created. This
>> information only becomes available at the "conversion" step, which is
>> the 'cgroup_mark_ve_roots' function. As the file is created dynamically
>> in a live cgroup, a rather delicate locking sequence is present in
>> the new code:
>> - each new "virtual root" cgroup has to gain a "release_agent" file,
>> thus each cgroup's directory needs to be locked during the insertion
>> via cgroup->dentry->d_inode->i_mutex.
>> - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>> (see cgroup_mount/cgroup_remount). They cannot be locked in the order
>> {lock(cgroup_mutex), lock(inode->i_mutex)}.
>> - to collect the list of cgroups that need to become virtual roots,
>> we need the cgroup_mutex lock to iterate active roots.
>> - to overcome the above conflict we first collect the list of all
>> virtual root cgroups under cgroup_mutex, then release it, and only
>> after that insert "release_agent" into each root under the
>> inode->i_mutex lock.
>> - to collect the list of cgroups on the stack we utilize
>> cgroup->cft_q_node, made specially for that purpose, under its own
>> cgroup_cft_mutex.
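
As a sketch of the insertion step described above (using a hypothetical
helper name; the authoritative code is cgroup_mark_ve_roots() in the
hunk below):

	/*
	 * Sketch: adding "release_agent" to one live virtual root.
	 * The directory's i_mutex must be taken before cgroup_mutex,
	 * never the other way around.
	 */
	static int ve_add_release_agent_file(struct cgroup *cgrp,
					     struct cftype *cft)
	{
		struct inode *inode = cgrp->dentry->d_inode;
		int err = 0;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		if (!cgroup_is_removed(cgrp))
			err = cgroup_add_file(cgrp, NULL, cft);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);

		return err;
	}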
>>
>> Destruction:
>> Destruction is done in reverse order within cgroup_unmark_ve_roots.
>> After the file is removed we must prevent further write operations to
>> it in case someone opened the file prior to VE and cgroup destruction.
>> This is achieved by checking whether the cgroup passed down the
>> cgroup_file_write path is still a host or virtual root.
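
As a hypothetical userspace illustration of that guard (the cgroup
mount path and agent path below are assumptions, not taken from the
patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *agent = "/sbin/my_agent";
		/* assumed container-side path to a virtual root's release_agent */
		int fd = open("/sys/fs/cgroup/memory/release_agent", O_WRONLY);

		if (fd < 0)
			return 1;
		/*
		 * ... imagine the VE is stopped here, so cgroup_unmark_ve_roots()
		 * removes the file and clears CGRP_VE_ROOT ...
		 */
		if (write(fd, agent, strlen(agent)) < 0)
			perror("write");	/* expected to fail (-EPERM) after unmark */
		close(fd);
		return 0;
	}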
>>
>> https://jira.sw.ru/browse/PSBM-83887
>>
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
>> ---
>> include/linux/cgroup.h | 2 +-
>> include/linux/ve.h | 2 +-
>> kernel/cgroup.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
>> kernel/ve/ve.c | 6 +++-
>> 4 files changed, 87 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index fc138c0..645c9fd 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>> void cgroup_release_agent(struct work_struct *work);
>>
>> #ifdef CONFIG_VE
>> -void cgroup_mark_ve_roots(struct ve_struct *ve);
>> +int cgroup_mark_ve_roots(struct ve_struct *ve);
>> void cgroup_unmark_ve_roots(struct ve_struct *ve);
>> struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>> #endif
>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>> index b6662637..5bf275f 100644
>> --- a/include/linux/ve.h
>> +++ b/include/linux/ve.h
>> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
>> void ve_add_to_release_list(struct cgroup *cgrp);
>> void ve_rm_from_release_list(struct cgroup *cgrp);
>>
>> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
>> +int ve_set_release_agent_path(struct cgroup *cgroot,
>> const char *release_agent);
>>
>> const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 1d9c889..bb77804 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>> if (!cgroup_lock_live_group(cgrp))
>> return -ENODEV;
>>
>> + /*
>> + * Call to cgroup_get_local_root is a way to make sure
>> + * that cgrp in the argument is valid "virtual root"
>> + * cgroup. If it's not - this means that cgroup_unmark_ve_roots
>> + * has already cleared CGRP_VE_ROOT flag from this cgroup while
>> + * the file was still opened by other process and
>> + * that we shouldn't allow further writings via that file.
>> + */
>> root_cgrp = cgroup_get_local_root(cgrp);
>> BUG_ON(!root_cgrp);
>> +
>> + if (root_cgrp != cgrp) {
>> + ret = -EPERM;
>> + goto out;
>> + }
>> +
>> if (root_cgrp->ve_owner)
>> ret = ve_set_release_agent_path(root_cgrp, buffer);
>> else
>> return -ENODEV;
>>
>> +out:
>> mutex_unlock(&cgroup_mutex);
>> return ret;
>> }
>> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
>> },
>> {
>> .name = "release_agent",
>> - .flags = CFTYPE_ONLY_ON_ROOT,
>> + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>> .read_seq_string = cgroup_release_agent_show,
>> .write_string = cgroup_release_agent_write,
>> .max_write_len = PATH_MAX,
>> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
>> return cgrps_count;
>> }
>>
>> +static struct cftype *get_cftype_by_name(const char *name)
>> +{
>> + struct cftype *cft;
>> + for (cft = files; cft->name[0] != '\0'; cft++) {
>> + if (!strcmp(cft->name, name))
>> + return cft;
>> + }
>> + return NULL;
>> +}
>> +
>> #ifdef CONFIG_VE
>> -void cgroup_mark_ve_roots(struct ve_struct *ve)
>> +int cgroup_mark_ve_roots(struct ve_struct *ve)
>> {
>> - struct cgroup *cgrp;
>> + struct cgroup *cgrp, *tmp;
>> struct cgroupfs_root *root;
>> + int err = 0;
>> + struct cftype *cft;
>> + LIST_HEAD(pending);
>>
>> + cft = get_cftype_by_name("release_agent");
>> + BUG_ON(!cft);
>> +
>> + mutex_lock(&cgroup_cft_mutex);
>> mutex_lock(&cgroup_mutex);
>> for_each_active_root(root) {
>> - cgrp = task_cgroup_from_root(ve->init_task, root);
>> + cgrp = css_cgroup_from_root(ve->root_css_set, root);
>> rcu_assign_pointer(cgrp->ve_owner, ve);
>> set_bit(CGRP_VE_ROOT, &cgrp->flags);
>>
>> + dget(cgrp->dentry);
>> + list_add_tail(&cgrp->cft_q_node, &pending);
>> if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>> link_ve_root_cpu_cgroup(cgrp);
>> }
>> mutex_unlock(&cgroup_mutex);
>> synchronize_rcu();
>> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
>> + struct inode *inode = cgrp->dentry->d_inode;
>> +
>> + if (err) {
>> + list_del_init(&cgrp->cft_q_node);
>> + dput(cgrp->dentry);
>> + continue;
>> + }
> How does this work?
>
> If cgroup_add_file() sets an error on a previous iteration, then here we do a double dput()?
Here is how it works. Before marking VE_ROOT and creating the
release_agent files, we need to know the list of cgroups we will act
upon. We can only build that list under the cgroup_mutex lock. But we
have a restriction: to create a file we need to take inode->i_mutex,
and it has to be taken before cgroup_mutex, not after:
1. lock(inode->i_mutex), 2. lock(cgroup_mutex)
Because of that we split the whole function into two loops. First we
find all the cgroups under cgroup_mutex and put them into a list on the
stack (struct list_head pending). Once the list is ready we release
cgroup_mutex and move to the second step. In the second step we iterate
the list of cgroups and call cgroup_add_file for each item with the
correct lock ordering. The consistency of the pending list is enforced
by cgroup_cft_mutex, which is held across both loops.
struct cgroup has a list node field made specially for this kind of
situation, called "cft_q_node"; see cgroup_cfts_commit for more details.
In the first loop we collect the list on the stack and call dget() once
per item. In the second loop we have to call dput() the same number of
times, i.e. exactly once per item.
In the second loop the block "if (err) {...dput...continue;}" means: if
any cgroup_add_file call has returned an error, skip further file
creation, but still clean up cgroup->cft_q_node and call dput for each
remaining item.
So I don't see a double dput per item here.
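
To make the reference counting explicit, here is the pairing condensed
into a sketch; the VE-root marking and the per-cgroup locking are folded
into the hypothetical ve_add_release_agent_file() helper sketched
earlier:

	/* Loop 1, under cgroup_mutex: exactly one dget() per collected cgroup. */
	for_each_active_root(root) {
		cgrp = css_cgroup_from_root(ve->root_css_set, root);
		dget(cgrp->dentry);
		list_add_tail(&cgrp->cft_q_node, &pending);
	}

	/*
	 * Loop 2, after cgroup_mutex is dropped: exactly one dput() per item,
	 * whichever branch is taken.
	 */
	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
		if (err) {
			/* a previous cgroup_add_file() failed: skip file
			 * creation for this item, but still drop its ref */
			list_del_init(&cgrp->cft_q_node);
			dput(cgrp->dentry);
			continue;
		}

		err = ve_add_release_agent_file(cgrp, cft);
		list_del_init(&cgrp->cft_q_node);
		dput(cgrp->dentry);
	}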
>> +
>> + mutex_lock(&inode->i_mutex);
>> + mutex_lock(&cgroup_mutex);
>> + if (!cgroup_is_removed(cgrp))
>> + err = cgroup_add_file(cgrp, NULL, cft);
>> + mutex_unlock(&cgroup_mutex);
>> + mutex_unlock(&inode->i_mutex);
>> +
>> + list_del_init(&cgrp->cft_q_node);
>> + dput(cgrp->dentry);
>> + }
>> + mutex_unlock(&cgroup_cft_mutex);
>> +
>> + if (err)
>> + cgroup_unmark_ve_roots(ve);
>> +
>> + return err;
>> }
>>
>> void cgroup_unmark_ve_roots(struct ve_struct *ve)
>> {
>> - struct cgroup *cgrp;
>> + struct cgroup *cgrp, *tmp;
>> struct cgroupfs_root *root;
>> + struct cftype *cft;
>> + LIST_HEAD(pending);
>> +
>> + cft = get_cftype_by_name("release_agent");
>>
>> + mutex_lock(&cgroup_cft_mutex);
>> mutex_lock(&cgroup_mutex);
>> for_each_active_root(root) {
>> cgrp = css_cgroup_from_root(ve->root_css_set, root);
>> + dget(cgrp->dentry);
>> + list_add_tail(&cgrp->cft_q_node, &pending);
>> + }
>> + mutex_unlock(&cgroup_mutex);
>> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
>> + struct inode *inode = cgrp->dentry->d_inode;
>> + mutex_lock(&inode->i_mutex);
>> + mutex_lock(&cgroup_mutex);
>> + cgroup_rm_file(cgrp, cft);
>> BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
>> lockdep_is_held(&cgroup_mutex)));
>> rcu_assign_pointer(cgrp->ve_owner, NULL);
>> clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>> + mutex_unlock(&cgroup_mutex);
>> + mutex_unlock(&inode->i_mutex);
>> }
>> - mutex_unlock(&cgroup_mutex);
>> + mutex_unlock(&cgroup_cft_mutex);
>> /* ve_owner == NULL will be visible */
>> synchronize_rcu();
>>
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index f03f665..8d78270 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
>> if (err < 0)
>> goto err_iterate;
>>
>> - cgroup_mark_ve_roots(ve);
>> + err = cgroup_mark_ve_roots(ve);
>> + if (err)
>> + goto err_mark_ve;
>>
>> ve->is_running = 1;
>>
>> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>>
>> return 0;
>>
>> +err_mark_ve:
>> + ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>> err_iterate:
>> ve_workqueue_stop(ve);
>> err_workqueue:
>>