[Devel] [PATCH VZ8 v1 12/14] ve/cgroup: added release_agent to each container root cgroup.
Kirill Tkhai
ktkhai at virtuozzo.com
Fri Jan 29 18:01:09 MSK 2021
On 20.01.2021 12:56, Valeriy Vdovin wrote:
> Each container will now have access to its own cgroup release_agent
> file.
> Creation:
> Normally, all cgroup files are created during a call to cgroup_create
> by the cgroup_populate_dir function. It decides once which cgroup
> files to create, and they immediately become visible to userspace as
> filesystem objects.
> Due to the specifics of the container creation process, it is not
> possible to use the same code for 'release_agent' file creation. For
> a VE to start operating, a list of ordinary cgroups is first created
> for each subsystem, and then the set of newly created cgroups is
> converted to "virtual roots", so at the time cgroup_create is
> executed, it is not yet known whether or not the "release_agent" file
> should be created. This information only arrives at the "conversion"
> step, which is the cgroup_mark_ve_roots function. As the file is
> created dynamically in a live cgroup, a rather delicate locking
> sequence is present in the new code:
> - each new "virtual root" cgroup has to get a "release_agent" file,
>   so each cgroup's directory needs to be locked during the insertion
>   via cgroup->dentry->d_inode->i_mutex.
> - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>   (see cgroup_mount/cgroup_remount). They can not be locked in the
>   order {lock(cgroup_mutex), lock(inode->i_mutex)}.
> - to collect the list of cgroups that need to become virtual roots,
>   we need the cgroup_mutex lock to iterate the active roots.
> - to overcome the above conflict, we first collect the list of all
>   virtual root cgroups under cgroup_mutex, then release it, and only
>   after that insert "release_agent" into each root under its
>   inode->i_mutex (see the sketch after this list).
> - to collect the list of cgroups on the stack, we use
>   cgroup->cft_q_node, added specially for that purpose and protected
>   by its own cgroup_cft_mutex.
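> An illustrative userspace sketch of this two-phase pattern (an analogy
> only, with pthread mutexes standing in for cgroup_mutex and i_mutex;
> the real implementation is cgroup_mark_ve_roots in the diff below):
>
> #include <pthread.h>
> #include <stdio.h>
>
> struct node {
> 	const char *name;
> 	pthread_mutex_t own_lock;	/* plays the role of i_mutex */
> 	struct node *q_next;		/* plays the role of cft_q_node */
> };
>
> static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* ~cgroup_mutex */
> static struct node items[2] = {
> 	{ "cpu",    PTHREAD_MUTEX_INITIALIZER, NULL },
> 	{ "memory", PTHREAD_MUTEX_INITIALIZER, NULL },
> };
>
> int main(void)
> {
> 	struct node *pending = NULL, *n;
> 	int i;
>
> 	/* Phase 1: only collect the targets while the "outer" lock is held. */
> 	pthread_mutex_lock(&list_lock);
> 	for (i = 0; i < 2; i++) {
> 		items[i].q_next = pending;
> 		pending = &items[i];
> 	}
> 	pthread_mutex_unlock(&list_lock);
>
> 	/*
> 	 * Phase 2: do the per-object work under each object's own lock,
> 	 * with the outer lock already dropped, so the ordering conflict
> 	 * never arises.
> 	 */
> 	for (n = pending; n; n = n->q_next) {
> 		pthread_mutex_lock(&n->own_lock);
> 		printf("inserting release_agent under %s\n", n->name);
> 		pthread_mutex_unlock(&n->own_lock);
> 	}
> 	return 0;
> }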
>
> Destruction:
> Destruction is done in reverse order to the above, within
> cgroup_unmark_ve_roots.
> After the file is destroyed, we must prevent further write operations
> to it in case someone opened the file before the VE and the cgroup
> were destroyed. This is achieved by checking whether the cgroup passed
> to the cgroup_file_write function is a host or a virtual root (see the
> sketch below).
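> A minimal userspace sketch of that "stale open file" guard (an analogy
> only; the real check is in cgroup_release_agent_write in the diff
> below, using cgroup_get_local_root and CGRP_VE_ROOT):
>
> #include <errno.h>
> #include <stdio.h>
>
> struct fake_cgroup {
> 	int is_ve_root;			/* stands in for CGRP_VE_ROOT */
> 	struct fake_cgroup *local_root;	/* result of a root lookup */
> };
>
> static int release_agent_write(struct fake_cgroup *cgrp, const char *path)
> {
> 	/*
> 	 * The writer raced with unmark: cgrp is no longer a virtual
> 	 * root, so refuse the write instead of touching retired state.
> 	 */
> 	if (cgrp->local_root != cgrp || !cgrp->is_ve_root)
> 		return -EPERM;
>
> 	printf("release_agent set to %s\n", path);
> 	return 0;
> }
>
> int main(void)
> {
> 	struct fake_cgroup root = { 1, &root };
> 	struct fake_cgroup demoted = { 0, &root };
>
> 	/* A file opened while the cgroup was a root keeps working ... */
> 	printf("live root:  %d\n", release_agent_write(&root, "/sbin/agent"));
> 	/* ... while a write through a stale handle is rejected. */
> 	printf("stale open: %d\n", release_agent_write(&demoted, "/sbin/agent"));
> 	return 0;
> }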
>
> https://jira.sw.ru/browse/PSBM-83887
>
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> +++
> cgroup: add missing dput() in cgroup_unmark_ve_roots()
>
> cgroup_unmark_ve_roots() calls dget() on the cgroup's dentry but does
> not have the corresponding dput() call. This leads to leaking cgroups.
>
> Add missing dput() to fix this.
>
> https://jira.sw.ru/browse/PSBM-107328
> Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.")
>
> (Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4)
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> include/linux/cgroup-defs.h | 3 +
> include/linux/cgroup.h | 2 +-
> include/linux/ve.h | 4 +-
> kernel/cgroup/cgroup-internal.h | 1 +
> kernel/cgroup/cgroup-v1.c | 37 ++++++++-
> kernel/cgroup/cgroup.c | 128 ++++++++++++++++++++++++++------
> kernel/ve/ve.c | 42 ++++++++---
> 7 files changed, 179 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 4404862c1eaf..c7ce5e84f610 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -453,6 +453,9 @@ struct cgroup {
> */
> struct list_head cset_links;
>
> + /* Used for cgroup_mark/umark ve */
> + struct list_head cft_q_node;
> +
> /*
> * Linked list running through all cgroups that can
> * potentially be reaped by the release agent. Protected by
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 0920b2ffb15b..b846617b0a53 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
> void cgroup1_release_agent(struct work_struct *work);
>
> #ifdef CONFIG_VE
> -extern void cgroup_mark_ve_root(struct ve_struct *ve);
> +int cgroup_mark_ve_roots(struct ve_struct *ve);
> void cgroup_unmark_ve_roots(struct ve_struct *ve);
> struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
> #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 65c19f2b9b98..7cef4b39847e 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -145,11 +145,13 @@ extern int nr_ve;
> void ve_add_to_release_list(struct cgroup *cgrp);
> void ve_rm_from_release_list(struct cgroup *cgrp);
>
> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> +int ve_set_release_agent_path(struct cgroup *cgroot,
> const char *release_agent);
>
> const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
>
> +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp);
> +
> extern struct ve_struct *get_ve(struct ve_struct *ve);
> extern void put_ve(struct ve_struct *ve);
>
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index be0cd157d4dc..112bd917e99d 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -227,6 +227,7 @@ extern const struct proc_ns_operations cgroupns_operations;
> */
> extern struct cftype cgroup1_base_files[];
> extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
> +struct cftype *get_cftype_by_name(const char *name);
>
> int proc_cgroupstats_show(struct seq_file *m, void *v);
> bool cgroup1_ssid_disabled(int ssid);
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 31585ab907a3..46be2f688503 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -557,13 +557,34 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
> struct cgroup *root_cgrp;
>
> cgrp = cgroup_kn_lock_live(of->kn, false);
> + if (!cgrp)
> + return -ENODEV;
> +
> + /*
> + * The call to cgroup_get_local_root is a way to make sure
> + * that the cgrp passed in is a valid "virtual root" cgroup.
> + * If it is not, this means that cgroup_unmark_ve_roots has
> + * already cleared the CGRP_VE_ROOT flag from this cgroup while
> + * the file was still open in another process, and that we
> + * should not allow further writes via that file.
> + */
> root_cgrp = cgroup_get_local_root(cgrp);
> BUG_ON(!root_cgrp);
> +
> + if (root_cgrp != cgrp) {
> + ret = -EPERM;
> + goto out;
> + }
> +
> if (root_cgrp->ve_owner)
> - ret = ve_set_release_agent_path(root_cgrp, buffer);
> + ret = ve_set_release_agent_path(root_cgrp, strstrip(buf));
> else
> ret = -ENODEV;
>
> + if (!ret)
> + ret = nbytes;
> +
> +out:
> cgroup_kn_unlock(of->kn);
> return ret;
> }
> @@ -680,7 +701,7 @@ struct cftype cgroup1_base_files[] = {
> },
> {
> .name = "release_agent",
> - .flags = CFTYPE_ONLY_ON_ROOT,
> + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
> .seq_show = cgroup_release_agent_show,
> .write = cgroup_release_agent_write,
> .max_write_len = PATH_MAX - 1,
> @@ -693,6 +714,18 @@ struct cftype cgroup1_base_files[] = {
> { } /* terminate */
> };
>
> +struct cftype *get_cftype_by_name(const char *name)
> +{
> + struct cftype *cft;
> +
> + for (cft = cgroup1_base_files; cft->name[0] != '\0'; cft++) {
> + if (!strcmp(cft->name, name))
> + return cft;
> + }
> + return NULL;
> +}
> +
> +
> #define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
>
> /* Display information about each subsystem and each hierarchy */
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index a14e6feabc54..285e84d1150f 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -70,6 +70,8 @@
> /* let's not notify more than 100 times per second */
> #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
>
> +#define CGROUP_FILENAME_RELEASE_AGENT "release_agent"
> +
> /*
> * cgroup_mutex is the master lock. Any modification to cgroup or its
> * hierarchy must be performed while holding it.
> @@ -1921,56 +1923,137 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
> return 0;
> }
>
> +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> + struct cftype *cft, bool activate);
> +
> #ifdef CONFIG_VE
> -void cgroup_mark_ve_root(struct ve_struct *ve)
> +int cgroup_mark_ve_roots(struct ve_struct *ve)
> {
> + int err = 0;
> struct cgrp_cset_link *link;
> struct css_set *cset;
> - struct cgroup *cgrp;
> + struct cgroup *cgrp, *tmp;
> + struct cftype *cft;
> + struct cgroup_subsys_state *cpu_css;
> + LIST_HEAD(pending);
>
> - spin_lock_irq(&css_set_lock);
> + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
> + BUG_ON(!cft);
>
> + /*
> + * locking scheme:
> + * collect the right cgroups in a list under the locks below and
> + * take actions later without these locks:
> + * css_set_lock - to iterate cset->cgrp_links
> + *
> + * cgroup_add_file can not be called under css_set_lock spinlock,
> + * because eventually it calls __kernfs_new_node, which does some
> + * allocations.
> + */
> +
> + spin_lock_irq(&css_set_lock);
> rcu_read_lock();
> +
> cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
> - if (WARN_ON(!cset))
> - goto unlock;
> + if (WARN_ON(!cset)) {
> + rcu_read_unlock();
> + return -ENODEV;
> + }
>
> list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> cgrp = link->cgrp;
> - rcu_assign_pointer(cgrp->ve_owner, ve);
> - set_bit(CGRP_VE_ROOT, &cgrp->flags);
> +
> + /*
> + * At container start, vzctl creates special cgroups to serve
> + * as virtualized cgroup roots. They are bind-mounted on top
> + * of the original cgroup mount points in the container
> + * namespace. But not all cgroup mounts undergo this procedure.
> + * We should skip cgroup mounts that are not virtualized.
> + */
> + if (!is_virtualized_cgroup(cgrp))
> + continue;
> +
> + list_add_tail(&cgrp->cft_q_node, &pending);
> }
> - link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
> -unlock:
> + cpu_css = cset->subsys[cpu_cgrp_id];
> rcu_read_unlock();
> -
> spin_unlock_irq(&css_set_lock);
> +
> + /*
> + * Reads of cgrp->ve_owner are protected by rcu.
> + * Writes to cgrp->ve_owner are protected by ve->op_sem, taken
> + * by the caller. The possible race that would require cgroup_mutex
> + * is when two separate ve's have the same cgroups in their css
> + * sets, but this should be protected elsewhere.
> + */
> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> + rcu_assign_pointer(cgrp->ve_owner, ve);
> + set_bit(CGRP_VE_ROOT, &cgrp->flags);
> +
> + if (!cgroup_is_removed(cgrp)) {
> + err = cgroup_add_file(NULL, cgrp, cft, true);
> + if (err) {
> + pr_warn("failed to add file to VE_ROOT cgroup,"
> + " err:%d\n", err);
> + break;
> + }
> + }
> + list_del_init(&cgrp->cft_q_node);
> + }
> + link_ve_root_cpu_cgroup(cpu_css);
> synchronize_rcu();
> +
> + if (err) {
> + pr_warn("failed to mark cgroups as VE_ROOT,"
> + " err:%d, falling back\n", err);
> + cgroup_unmark_ve_roots(ve);
> + }
> + return err;
> }
>
> void cgroup_unmark_ve_roots(struct ve_struct *ve)
> {
> struct cgrp_cset_link *link;
> struct css_set *cset;
> - struct cgroup *cgrp;
> + struct cgroup *cgrp, *tmp;
> + struct cftype *cft;
> + LIST_HEAD(pending);
> +
> + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
>
> spin_lock_irq(&css_set_lock);
>
> rcu_read_lock();
> cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
> - if (WARN_ON(!cset))
> - goto unlock;
> + if (WARN_ON(!cset)) {
> + rcu_read_unlock();
> + spin_unlock_irq(&css_set_lock);
> + return;
> + }
>
> list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> cgrp = link->cgrp;
> - rcu_assign_pointer(cgrp->ve_owner, NULL);
> - clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +
> + /*
> + * For this line see comments in
> + * cgroup_mark_ve_roots
> + */
> + if (!is_virtualized_cgroup(cgrp))
> + continue;
> +
> + BUG_ON(cgrp->ve_owner != ve);
> +
> + list_add_tail(&cgrp->cft_q_node, &pending);
> }
> - link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
> -unlock:
> rcu_read_unlock();
> -
> spin_unlock_irq(&css_set_lock);
> +
> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> + kernfs_remove_by_name(cgrp->kn, CGROUP_FILENAME_RELEASE_AGENT);
> + rcu_assign_pointer(cgrp->ve_owner, NULL);
> + clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> + list_del_init(&cgrp->cft_q_node);
> + }
> /* ve_owner == NULL will be visible */
> synchronize_rcu();
>
> @@ -2023,6 +2106,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->self.children);
> INIT_LIST_HEAD(&cgrp->cset_links);
> INIT_LIST_HEAD(&cgrp->pidlists);
> + INIT_LIST_HEAD(&cgrp->cft_q_node);
> + INIT_LIST_HEAD(&cgrp->release_list);
> mutex_init(&cgrp->pidlist_mutex);
> cgrp->self.cgroup = cgrp;
> cgrp->self.flags |= CSS_ONLINE;
> @@ -2049,10 +2134,8 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
> idr_init(&root->cgroup_idr);
>
> root->flags = opts->flags;
> - if (opts.release_agent) {
> - ret = ve_set_release_agent_path(root_cgrp,
> - opts.release_agent);
> - }
> + if (opts->release_agent)
> + ve_set_release_agent_path(cgrp, opts->release_agent);
>
> if (opts->name)
> strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
> @@ -2260,6 +2343,7 @@ static void cgroup_kill_sb(struct super_block *sb)
> percpu_ref_kill(&root->cgrp.self.refcnt);
> cgroup_put(&root->cgrp);
> kernfs_kill_sb(sb);
> + ve_cleanup_per_cgroot_data(NULL, &root->cgrp);
> }
>
> struct file_system_type cgroup_fs_type = {
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index da6bd1671afd..176990a4a5a1 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -311,12 +311,13 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot)
> struct per_cgroot_data *data;
> struct cgroup_rcu_string *str;
> struct ve_struct *ve;
> + unsigned long flags;
>
> ve = rcu_dereference(cgroot->ve_owner);
> if (!ve)
> return NULL;
>
> - raw_spin_lock(&ve->per_cgroot_list_lock);
> + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
>
> data = per_cgroot_data_find_locked(&ve->per_cgroot_list, cgroot);
> if (data) {
> @@ -324,7 +325,7 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot)
> if (str)
> result = str->val;
> }
> - raw_spin_unlock(&ve->per_cgroot_list_lock);
> + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
> return result;
> }
>
> @@ -638,7 +639,9 @@ static int ve_start_container(struct ve_struct *ve)
> if (err < 0)
> goto err_iterate;
>
> - cgroup_mark_ve_root(ve);
> + err = cgroup_mark_ve_roots(ve);
> + if (err)
> + goto err_mark_ve;
>
> ve->is_running = 1;
>
> @@ -648,6 +651,8 @@ static int ve_start_container(struct ve_struct *ve)
>
> return 0;
>
> +err_mark_ve:
> + ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> err_iterate:
> ve_workqueue_stop(ve);
> err_workqueue:
> @@ -662,22 +667,35 @@ static int ve_start_container(struct ve_struct *ve)
> return err;
> }
>
> -static void ve_per_cgroot_free(struct ve_struct *ve)
> +static inline void per_cgroot_data_free(struct per_cgroot_data *data)
> +{
> + struct cgroup_rcu_string *release_agent = data->release_agent_path;
> +
> + RCU_INIT_POINTER(data->release_agent_path, NULL);
> + if (release_agent)
> + kfree_rcu(release_agent, rcu_head);
> + kfree(data);
> +}
> +
> +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp)
> {
> struct per_cgroot_data *data, *saved;
> unsigned long flags;
> - struct cgroup_rcu_string *release_agent;
> +
> + BUG_ON(!ve && !cgrp);
> + rcu_read_lock();
> + if (!ve)
> + ve = cgroup_get_ve_owner(cgrp);
>
> spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
> list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) {
> - release_agent = data->release_agent_path;
> - RCU_INIT_POINTER(data->release_agent_path, NULL);
> - if (release_agent)
> - kfree_rcu(release_agent, rcu_head);
> - list_del_init(&data->list);
> - kfree(data);
> + if (!cgrp || data->cgroot == cgrp) {
> + list_del_init(&data->list);
> + per_cgroot_data_free(data);
> + }
> }
> spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
> + rcu_read_unlock();
> }
>
> void ve_stop_ns(struct pid_namespace *pid_ns)
> @@ -736,7 +754,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>
> ve_workqueue_stop(ve);
>
> - ve_per_cgroot_free(ve);
> + ve_cleanup_per_cgroot_data(ve, NULL);
>
> /*
> * At this point all userspace tasks in container are dead.
>