[Devel] [PATCH RHEL7 v21 12/14] ve/cgroup: added release_agent to each container root cgroup.
Valeriy Vdovin
valeriy.vdovin at virtuozzo.com
Fri Jul 31 12:38:30 MSK 2020
On 31.07.2020 11:24, Kirill Tkhai wrote:
> On 28.07.2020 20:53, Valeriy Vdovin wrote:
>> Each container will now have access to its own cgroup release_agent
>> file.
>> Creation:
>> Normally all cgroup files are created during a call to cgroup_create
>> by the cgroup_populate_dir function. It creates (or skips) each cgroup
>> file once, and the created files immediately become visible to
>> userspace as filesystem objects.
>> Due to the specifics of the container creation process, it is not
>> possible to use the same code for 'release_agent' file creation. For a
>> VE to start operating, first a list of ordinary cgroups is created for
>> each subsystem, then the set of newly created cgroups is converted to
>> "virtual roots", so at the time cgroup_create is executed it is not
>> yet known whether the "release_agent" file should be created. This
>> information only becomes available at the "conversion" step, which is
>> the 'cgroup_mark_ve_roots' function. As the file is created dynamically
>> in a live cgroup, a rather delicate locking sequence is present in
>> the new code:
>> - each new "virtual root" cgroup has to gain a "release_agent" file,
>> thus each cgroup's directory needs to be locked during the insertion
>> via cgroup->dentry->d_inode->i_mutex.
>> - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>> (see cgroup_mount/cgroup_remount). They cannot be locked in the order
>> {lock(cgroup_mutex), lock(inode->i_mutex)}.
>> - to collect the list of cgroups that need to become virtual roots,
>> we need the cgroup_mutex lock to iterate active roots.
>> - to overcome the above conflict we first collect the list of all
>> virtual root cgroups under cgroup_mutex, then release it, and only
>> after that insert "release_agent" into each root under the
>> inode->i_mutex lock.
>> - to collect the list of cgroups on the stack we utilize
>> cgroup->cft_q_node, made specially for that purpose, under its own
>> cgroup_cft_mutex.
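
As a sketch of the insertion step described above (using a hypothetical
helper name; the authoritative code is cgroup_mark_ve_roots() in the
hunk below):

	/*
	 * Sketch: adding "release_agent" to one live virtual root.
	 * The directory's i_mutex must be taken before cgroup_mutex,
	 * never the other way around.
	 */
	static int ve_add_release_agent_file(struct cgroup *cgrp,
					     struct cftype *cft)
	{
		struct inode *inode = cgrp->dentry->d_inode;
		int err = 0;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		if (!cgroup_is_removed(cgrp))
			err = cgroup_add_file(cgrp, NULL, cft);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);

		return err;
	}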
>>
>> Destruction:
>> Destruction is done in reverse order within cgroup_unmark_ve_roots.
>> After the file is removed we must prevent further write operations to
>> it in case someone opened the file prior to VE and cgroup destruction.
>> This is achieved by checking whether the cgroup passed down the
>> cgroup_file_write path is still a host or virtual root.
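
As a hypothetical userspace illustration of that guard (the cgroup
mount path and agent path below are assumptions, not taken from the
patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *agent = "/sbin/my_agent";
		/* assumed container-side path to a virtual root's release_agent */
		int fd = open("/sys/fs/cgroup/memory/release_agent", O_WRONLY);

		if (fd < 0)
			return 1;
		/*
		 * ... imagine the VE is stopped here, so cgroup_unmark_ve_roots()
		 * removes the file and clears CGRP_VE_ROOT ...
		 */
		if (write(fd, agent, strlen(agent)) < 0)
			perror("write");	/* expected to fail (-EPERM) after unmark */
		close(fd);
		return 0;
	}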
>>
>> https://jira.sw.ru/browse/PSBM-83887
>>
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
>> ---
>> include/linux/cgroup.h | 2 +-
>> include/linux/ve.h | 2 +-
>> kernel/cgroup.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
>> kernel/ve/ve.c | 6 +++-
>> 4 files changed, 87 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index fc138c0..645c9fd 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>> void cgroup_release_agent(struct work_struct *work);
>>
>> #ifdef CONFIG_VE
>> -void cgroup_mark_ve_roots(struct ve_struct *ve);
>> +int cgroup_mark_ve_roots(struct ve_struct *ve);
>> void cgroup_unmark_ve_roots(struct ve_struct *ve);
>> struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>> #endif
>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>> index b6662637..5bf275f 100644
>> --- a/include/linux/ve.h
>> +++ b/include/linux/ve.h
>> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
>> void ve_add_to_release_list(struct cgroup *cgrp);
>> void ve_rm_from_release_list(struct cgroup *cgrp);
>>
>> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
>> +int ve_set_release_agent_path(struct cgroup *cgroot,
>> const char *release_agent);
>>
>> const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 1d9c889..bb77804 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>> if (!cgroup_lock_live_group(cgrp))
>> return -ENODEV;
>>
>> + /*
>> + * Call to cgroup_get_local_root is a way to make sure
>> + * that cgrp in the argument is valid "virtual root"
>> + * cgroup. If it's not - this means that cgroup_unmark_ve_roots
>> + * has already cleared CGRP_VE_ROOT flag from this cgroup while
>> + * the file was still opened by other process and
>> + * that we shouldn't allow further writings via that file.
>> + */
>> root_cgrp = cgroup_get_local_root(cgrp);
>> BUG_ON(!root_cgrp);
>> +
>> + if (root_cgrp != cgrp) {
>> + ret = -EPERM;
>> + goto out;
>> + }
>> +
>> if (root_cgrp->ve_owner)
>> ret = ve_set_release_agent_path(root_cgrp, buffer);
>> else
>> return -ENODEV;
>>
>> +out:
>> mutex_unlock(&cgroup_mutex);
>> return ret;
>> }
>> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
>> },
>> {
>> .name = "release_agent",
>> - .flags = CFTYPE_ONLY_ON_ROOT,
>> + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>> .read_seq_string = cgroup_release_agent_show,
>> .write_string = cgroup_release_agent_write,
>> .max_write_len = PATH_MAX,
>> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
>> return cgrps_count;
>> }
>>
>> +static struct cftype *get_cftype_by_name(const char *name)
>> +{
>> + struct cftype *cft;
>> + for (cft = files; cft->name[0] != '\0'; cft++) {
>> + if (!strcmp(cft->name, name))
>> + return cft;
>> + }
>> + return NULL;
>> +}
>> +
>> #ifdef CONFIG_VE
>> -void cgroup_mark_ve_roots(struct ve_struct *ve)
>> +int cgroup_mark_ve_roots(struct ve_struct *ve)
>> {
>> - struct cgroup *cgrp;
>> + struct cgroup *cgrp, *tmp;
>> struct cgroupfs_root *root;
>> + int err = 0;
>> + struct cftype *cft;
>> + LIST_HEAD(pending);
>>
>> + cft = get_cftype_by_name("release_agent");
>> + BUG_ON(!cft);
>> +
>> + mutex_lock(&cgroup_cft_mutex);
>> mutex_lock(&cgroup_mutex);
>> for_each_active_root(root) {
>> - cgrp = task_cgroup_from_root(ve->init_task, root);
>> + cgrp = css_cgroup_from_root(ve->root_css_set, root);
>> rcu_assign_pointer(cgrp->ve_owner, ve);
>> set_bit(CGRP_VE_ROOT, &cgrp->flags);
>>
>> + dget(cgrp->dentry);
>> + list_add_tail(&cgrp->cft_q_node, &pending);
>> if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>> link_ve_root_cpu_cgroup(cgrp);
>> }
>> mutex_unlock(&cgroup_mutex);
>> synchronize_rcu();
>> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
>> + struct inode *inode = cgrp->dentry->d_inode;
>> +
>> + if (err) {
>> + list_del_init(&cgrp->cft_q_node);
>> + dput(cgrp->dentry);
>> + continue;
>> + }
> How does this work?
>
> If cgroup_add_file() sets an error on a previous iteration, then here we do a double dput()?
Here is how it works. Before marking VE_ROOT and creating the
release_agent files, we need to know the list of cgroups we will act
upon. We can only build that list under the cgroup_mutex lock. But we
have a restriction: to create a file we need to take inode->i_mutex,
and it has to be taken before cgroup_mutex, not after:
1. lock(inode->i_mutex), 2. lock(cgroup_mutex)
Because of that we split the whole function into two loops. First we
find all the cgroups under cgroup_mutex and put them into a list on the
stack (struct list_head pending). Once the list is ready we release
cgroup_mutex and move to the second step. In the second step we iterate
the list of cgroups and call cgroup_add_file for each item with the
correct lock ordering. The consistency of the pending list is enforced
by cgroup_cft_mutex, which is held across both loops.
struct cgroup has a list node field made specially for this kind of
situation, called "cft_q_node"; see cgroup_cfts_commit for more details.
In the first loop we collect the list on the stack and call dget() once
per item. In the second loop we have to call dput() the same number of
times, i.e. exactly once per item.
In the second loop the block "if (err) {...dput...continue;}" means: if
any cgroup_add_file call has returned an error, skip further file
creation, but still clean up cgroup->cft_q_node and call dput for each
remaining item.
So I don't see a double dput per item here.
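
To make the reference counting explicit, here is the pairing condensed
into a sketch; the VE-root marking and the per-cgroup locking are folded
into the hypothetical ve_add_release_agent_file() helper sketched
earlier:

	/* Loop 1, under cgroup_mutex: exactly one dget() per collected cgroup. */
	for_each_active_root(root) {
		cgrp = css_cgroup_from_root(ve->root_css_set, root);
		dget(cgrp->dentry);
		list_add_tail(&cgrp->cft_q_node, &pending);
	}

	/*
	 * Loop 2, after cgroup_mutex is dropped: exactly one dput() per item,
	 * whichever branch is taken.
	 */
	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
		if (err) {
			/* a previous cgroup_add_file() failed: skip file
			 * creation for this item, but still drop its ref */
			list_del_init(&cgrp->cft_q_node);
			dput(cgrp->dentry);
			continue;
		}

		err = ve_add_release_agent_file(cgrp, cft);
		list_del_init(&cgrp->cft_q_node);
		dput(cgrp->dentry);
	}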
>> +
>> + mutex_lock(&inode->i_mutex);
>> + mutex_lock(&cgroup_mutex);
>> + if (!cgroup_is_removed(cgrp))
>> + err = cgroup_add_file(cgrp, NULL, cft);
>> + mutex_unlock(&cgroup_mutex);
>> + mutex_unlock(&inode->i_mutex);
>> +
>> + list_del_init(&cgrp->cft_q_node);
>> + dput(cgrp->dentry);
>> + }
>> + mutex_unlock(&cgroup_cft_mutex);
>> +
>> + if (err)
>> + cgroup_unmark_ve_roots(ve);
>> +
>> + return err;
>> }
>>
>> void cgroup_unmark_ve_roots(struct ve_struct *ve)
>> {
>> - struct cgroup *cgrp;
>> + struct cgroup *cgrp, *tmp;
>> struct cgroupfs_root *root;
>> + struct cftype *cft;
>> + LIST_HEAD(pending);
>> +
>> + cft = get_cftype_by_name("release_agent");
>>
>> + mutex_lock(&cgroup_cft_mutex);
>> mutex_lock(&cgroup_mutex);
>> for_each_active_root(root) {
>> cgrp = css_cgroup_from_root(ve->root_css_set, root);
>> + dget(cgrp->dentry);
>> + list_add_tail(&cgrp->cft_q_node, &pending);
>> + }
>> + mutex_unlock(&cgroup_mutex);
>> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
>> + struct inode *inode = cgrp->dentry->d_inode;
>> + mutex_lock(&inode->i_mutex);
>> + mutex_lock(&cgroup_mutex);
>> + cgroup_rm_file(cgrp, cft);
>> BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
>> lockdep_is_held(&cgroup_mutex)));
>> rcu_assign_pointer(cgrp->ve_owner, NULL);
>> clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>> + mutex_unlock(&cgroup_mutex);
>> + mutex_unlock(&inode->i_mutex);
>> }
>> - mutex_unlock(&cgroup_mutex);
>> + mutex_unlock(&cgroup_cft_mutex);
>> /* ve_owner == NULL will be visible */
>> synchronize_rcu();
>>
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index f03f665..8d78270 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
>> if (err < 0)
>> goto err_iterate;
>>
>> - cgroup_mark_ve_roots(ve);
>> + err = cgroup_mark_ve_roots(ve);
>> + if (err)
>> + goto err_mark_ve;
>>
>> ve->is_running = 1;
>>
>> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>>
>> return 0;
>>
>> +err_mark_ve:
>> + ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>> err_iterate:
>> ve_workqueue_stop(ve);
>> err_workqueue:
>>