[Devel] [PATCH VZ8 v1 12/14] ve/cgroup: added release_agent to each container root cgroup.
Kirill Tkhai
ktkhai at virtuozzo.com
Fri Jan 29 18:01:09 MSK 2021
On 20.01.2021 12:56, Valeriy Vdovin wrote:
> Each container will now have access to its own cgroup release_agent
> file.
> Creation:
> Normally, all cgroup files are created during a call to cgroup_create
> by the cgroup_populate_dir function. It decides once which cgroup
> files to create, and they immediately become visible to userspace as
> filesystem objects.
> Due to the specifics of the container creation process, it is not
> possible to use the same code for 'release_agent' file creation. For
> a VE to start operating, a list of ordinary cgroups is first created
> for each subsystem, and then the set of newly created cgroups is
> converted to "virtual roots", so at the time cgroup_create is
> executed, it is not yet known whether or not the "release_agent" file
> should be created. This information only arrives at the "conversion"
> step, which is the cgroup_mark_ve_roots function. As the file is
> created dynamically in a live cgroup, a rather delicate locking
> sequence is present in the new code:
> - each new "virtual root" cgroup has to get a "release_agent" file,
>   so each cgroup's directory needs to be locked during the insertion
>   via cgroup->dentry->d_inode->i_mutex.
> - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>   (see cgroup_mount/cgroup_remount). They can not be locked in the
>   order {lock(cgroup_mutex), lock(inode->i_mutex)}.
> - to collect the list of cgroups that need to become virtual roots,
>   we need the cgroup_mutex lock to iterate the active roots.
> - to overcome the above conflict, we first collect the list of all
>   virtual root cgroups under cgroup_mutex, then release it, and only
>   after that insert "release_agent" into each root under its
>   inode->i_mutex (see the sketch after this list).
> - to collect the list of cgroups on the stack, we use
>   cgroup->cft_q_node, added specially for that purpose and protected
>   by its own cgroup_cft_mutex.
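> An illustrative userspace sketch of this two-phase pattern (an analogy
> only, with pthread mutexes standing in for cgroup_mutex and i_mutex;
> the real implementation is cgroup_mark_ve_roots in the diff below):
>
> #include <pthread.h>
> #include <stdio.h>
>
> struct node {
> 	const char *name;
> 	pthread_mutex_t own_lock;	/* plays the role of i_mutex */
> 	struct node *q_next;		/* plays the role of cft_q_node */
> };
>
> static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* ~cgroup_mutex */
> static struct node items[2] = {
> 	{ "cpu",    PTHREAD_MUTEX_INITIALIZER, NULL },
> 	{ "memory", PTHREAD_MUTEX_INITIALIZER, NULL },
> };
>
> int main(void)
> {
> 	struct node *pending = NULL, *n;
> 	int i;
>
> 	/* Phase 1: only collect the targets while the "outer" lock is held. */
> 	pthread_mutex_lock(&list_lock);
> 	for (i = 0; i < 2; i++) {
> 		items[i].q_next = pending;
> 		pending = &items[i];
> 	}
> 	pthread_mutex_unlock(&list_lock);
>
> 	/*
> 	 * Phase 2: do the per-object work under each object's own lock,
> 	 * with the outer lock already dropped, so the ordering conflict
> 	 * never arises.
> 	 */
> 	for (n = pending; n; n = n->q_next) {
> 		pthread_mutex_lock(&n->own_lock);
> 		printf("inserting release_agent under %s\n", n->name);
> 		pthread_mutex_unlock(&n->own_lock);
> 	}
> 	return 0;
> }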
>
> Destruction:
> Destruction is done in reverse order to the above, within
> cgroup_unmark_ve_roots.
> After the file is destroyed, we must prevent further write operations
> to it in case someone opened the file before the VE and the cgroup
> were destroyed. This is achieved by checking whether the cgroup passed
> to the cgroup_file_write function is a host or a virtual root (see the
> sketch below).
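> A minimal userspace sketch of that "stale open file" guard (an analogy
> only; the real check is in cgroup_release_agent_write in the diff
> below, using cgroup_get_local_root and CGRP_VE_ROOT):
>
> #include <errno.h>
> #include <stdio.h>
>
> struct fake_cgroup {
> 	int is_ve_root;			/* stands in for CGRP_VE_ROOT */
> 	struct fake_cgroup *local_root;	/* result of a root lookup */
> };
>
> static int release_agent_write(struct fake_cgroup *cgrp, const char *path)
> {
> 	/*
> 	 * The writer raced with unmark: cgrp is no longer a virtual
> 	 * root, so refuse the write instead of touching retired state.
> 	 */
> 	if (cgrp->local_root != cgrp || !cgrp->is_ve_root)
> 		return -EPERM;
>
> 	printf("release_agent set to %s\n", path);
> 	return 0;
> }
>
> int main(void)
> {
> 	struct fake_cgroup root = { 1, &root };
> 	struct fake_cgroup demoted = { 0, &root };
>
> 	/* A file opened while the cgroup was a root keeps working ... */
> 	printf("live root:  %d\n", release_agent_write(&root, "/sbin/agent"));
> 	/* ... while a write through a stale handle is rejected. */
> 	printf("stale open: %d\n", release_agent_write(&demoted, "/sbin/agent"));
> 	return 0;
> }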
>
> https://jira.sw.ru/browse/PSBM-83887
>
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> +++
> cgroup: add missing dput() in cgroup_unmark_ve_roots()
>
> cgroup_unmark_ve_roots() calls dget() on the cgroup's dentry but does
> not have the corresponding dput() call. This leads to leaking cgroups.
>
> Add missing dput() to fix this.
>
> https://jira.sw.ru/browse/PSBM-107328
> Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.")
>
> (Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4)
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> include/linux/cgroup-defs.h | 3 +
> include/linux/cgroup.h | 2 +-
> include/linux/ve.h | 4 +-
> kernel/cgroup/cgroup-internal.h | 1 +
> kernel/cgroup/cgroup-v1.c | 37 ++++++++-
> kernel/cgroup/cgroup.c | 128 ++++++++++++++++++++++++++------
> kernel/ve/ve.c | 42 ++++++++---
> 7 files changed, 179 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 4404862c1eaf..c7ce5e84f610 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -453,6 +453,9 @@ struct cgroup {
> */
> struct list_head cset_links;
>
> + /* Used for cgroup_mark/umark ve */
> + struct list_head cft_q_node;
> +
> /*
> * Linked list running through all cgroups that can
> * potentially be reaped by the release agent. Protected by
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 0920b2ffb15b..b846617b0a53 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
> void cgroup1_release_agent(struct work_struct *work);
>
> #ifdef CONFIG_VE
> -extern void cgroup_mark_ve_root(struct ve_struct *ve);
> +int cgroup_mark_ve_roots(struct ve_struct *ve);
> void cgroup_unmark_ve_roots(struct ve_struct *ve);
> struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
> #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 65c19f2b9b98..7cef4b39847e 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -145,11 +145,13 @@ extern int nr_ve;
> void ve_add_to_release_list(struct cgroup *cgrp);
> void ve_rm_from_release_list(struct cgroup *cgrp);
>
> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> +int ve_set_release_agent_path(struct cgroup *cgroot,
> const char *release_agent);
>
> const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
>
> +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp);
> +
> extern struct ve_struct *get_ve(struct ve_struct *ve);
> extern void put_ve(struct ve_struct *ve);
>
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index be0cd157d4dc..112bd917e99d 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -227,6 +227,7 @@ extern const struct proc_ns_operations cgroupns_operations;
> */
> extern struct cftype cgroup1_base_files[];
> extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
> +struct cftype *get_cftype_by_name(const char *name);
>
> int proc_cgroupstats_show(struct seq_file *m, void *v);
> bool cgroup1_ssid_disabled(int ssid);
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 31585ab907a3..46be2f688503 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -557,13 +557,34 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
> struct cgroup *root_cgrp;
>
> cgrp = cgroup_kn_lock_live(of->kn, false);
> + if (!cgrp)
> + return -ENODEV;
> +
> + /*
> + * The call to cgroup_get_local_root is a way to make sure
> + * that the cgrp passed in is a valid "virtual root" cgroup.
> + * If it is not, this means that cgroup_unmark_ve_roots has
> + * already cleared the CGRP_VE_ROOT flag from this cgroup while
> + * the file was still open in another process, and that we
> + * should not allow further writes via that file.
> + */
> root_cgrp = cgroup_get_local_root(cgrp);
> BUG_ON(!root_cgrp);
> +
> + if (root_cgrp != cgrp) {
> + ret = -EPERM;
> + goto out;
> + }
> +
> if (root_cgrp->ve_owner)
> - ret = ve_set_release_agent_path(root_cgrp, buffer);
> + ret = ve_set_release_agent_path(root_cgrp, strstrip(buf));
> else
> ret = -ENODEV;
>
> + if (!ret)
> + ret = nbytes;
> +
> +out:
> cgroup_kn_unlock(of->kn);
> return ret;
> }
> @@ -680,7 +701,7 @@ struct cftype cgroup1_base_files[] = {
> },
> {
> .name = "release_agent",
> - .flags = CFTYPE_ONLY_ON_ROOT,
> + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
> .seq_show = cgroup_release_agent_show,
> .write = cgroup_release_agent_write,
> .max_write_len = PATH_MAX - 1,
> @@ -693,6 +714,18 @@ struct cftype cgroup1_base_files[] = {
> { } /* terminate */
> };
>
> +struct cftype *get_cftype_by_name(const char *name)
> +{
> + struct cftype *cft;
> +
> + for (cft = cgroup1_base_files; cft->name[0] != '\0'; cft++) {
> + if (!strcmp(cft->name, name))
> + return cft;
> + }
> + return NULL;
> +}
> +
> +
> #define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
>
> /* Display information about each subsystem and each hierarchy */
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index a14e6feabc54..285e84d1150f 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -70,6 +70,8 @@
> /* let's not notify more than 100 times per second */
> #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
>
> +#define CGROUP_FILENAME_RELEASE_AGENT "release_agent"
> +
> /*
> * cgroup_mutex is the master lock. Any modification to cgroup or its
> * hierarchy must be performed while holding it.
> @@ -1921,56 +1923,137 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
> return 0;
> }
>
> +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> + struct cftype *cft, bool activate);
> +
> #ifdef CONFIG_VE
> -void cgroup_mark_ve_root(struct ve_struct *ve)
> +int cgroup_mark_ve_roots(struct ve_struct *ve)
> {
> + int err = 0;
> struct cgrp_cset_link *link;
> struct css_set *cset;
> - struct cgroup *cgrp;
> + struct cgroup *cgrp, *tmp;
> + struct cftype *cft;
> + struct cgroup_subsys_state *cpu_css;
> + LIST_HEAD(pending);
>
> - spin_lock_irq(&css_set_lock);
> + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
> + BUG_ON(!cft);
>
> + /*
> + * locking scheme:
> + * collect the right cgroups in a list under the locks below and
> + * take actions later without these locks:
> + * css_set_lock - to iterate cset->cgrp_links
> + *
> + * cgroup_add_file can not be called under css_set_lock spinlock,
> + * because eventually it calls __kernfs_new_node, which does some
> + * allocations.
> + */
> +
> + spin_lock_irq(&css_set_lock);
> rcu_read_lock();
> +
> cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
> - if (WARN_ON(!cset))
> - goto unlock;
> + if (WARN_ON(!cset)) {
> + rcu_read_unlock();
> + return -ENODEV;
> + }
>
> list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> cgrp = link->cgrp;
> - rcu_assign_pointer(cgrp->ve_owner, ve);
> - set_bit(CGRP_VE_ROOT, &cgrp->flags);
> +
> + /*
> + * At container start, vzctl creates special cgroups to serve
> + * as virtualized cgroup roots. They are bind-mounted on top
> + * of the original cgroup mount points in the container
> + * namespace. But not all cgroup mounts undergo this procedure.
> + * We should skip cgroup mounts that are not virtualized.
> + */
> + if (!is_virtualized_cgroup(cgrp))
> + continue;
> +
> + list_add_tail(&cgrp->cft_q_node, &pending);
> }
> - link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
> -unlock:
> + cpu_css = cset->subsys[cpu_cgrp_id];
> rcu_read_unlock();
> -
> spin_unlock_irq(&css_set_lock);
> +
> + /*
> + * Reads of cgrp->ve_owner are protected by rcu.
> + * Writes to cgrp->ve_owner are protected by ve->op_sem, taken
> + * by the caller. The possible race that would require cgroup_mutex
> + * is when two separate ve's have the same cgroups in their css
> + * sets, but this should be protected elsewhere.
> + */
> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> + rcu_assign_pointer(cgrp->ve_owner, ve);
> + set_bit(CGRP_VE_ROOT, &cgrp->flags);
> +
> + if (!cgroup_is_removed(cgrp)) {
> + err = cgroup_add_file(NULL, cgrp, cft, true);
> + if (err) {
> + pr_warn("failed to add file to VE_ROOT cgroup,"
> + " err:%d\n", err);
> + break;
> + }
> + }
> + list_del_init(&cgrp->cft_q_node);
> + }
> + link_ve_root_cpu_cgroup(cpu_css);
> synchronize_rcu();
> +
> + if (err) {
> + pr_warn("failed to mark cgroups as VE_ROOT,"
> + " err:%d, falling back\n", err);
> + cgroup_unmark_ve_roots(ve);
> + }
> + return err;
> }
>
> void cgroup_unmark_ve_roots(struct ve_struct *ve)
> {
> struct cgrp_cset_link *link;
> struct css_set *cset;
> - struct cgroup *cgrp;
> + struct cgroup *cgrp, *tmp;
> + struct cftype *cft;
> + LIST_HEAD(pending);
> +
> + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
>
> spin_lock_irq(&css_set_lock);
>
> rcu_read_lock();
> cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
> - if (WARN_ON(!cset))
> - goto unlock;
> + if (WARN_ON(!cset)) {
> + rcu_read_unlock();
> + spin_unlock_irq(&css_set_lock);
> + return;
> + }
>
> list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> cgrp = link->cgrp;
> - rcu_assign_pointer(cgrp->ve_owner, NULL);
> - clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +
> + /*
> + * For this line see comments in
> + * cgroup_mark_ve_roots
> + */
> + if (!is_virtualized_cgroup(cgrp))
> + continue;
> +
> + BUG_ON(cgrp->ve_owner != ve);
> +
> + list_add_tail(&cgrp->cft_q_node, &pending);
> }
> - link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
> -unlock:
> rcu_read_unlock();
> -
> spin_unlock_irq(&css_set_lock);
> +
> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> + kernfs_remove_by_name(cgrp->kn, CGROUP_FILENAME_RELEASE_AGENT);
> + rcu_assign_pointer(cgrp->ve_owner, NULL);
> + clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> + list_del_init(&cgrp->cft_q_node);
> + }
> /* ve_owner == NULL will be visible */
> synchronize_rcu();
>
> @@ -2023,6 +2106,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->self.children);
> INIT_LIST_HEAD(&cgrp->cset_links);
> INIT_LIST_HEAD(&cgrp->pidlists);
> + INIT_LIST_HEAD(&cgrp->cft_q_node);
> + INIT_LIST_HEAD(&cgrp->release_list);
> mutex_init(&cgrp->pidlist_mutex);
> cgrp->self.cgroup = cgrp;
> cgrp->self.flags |= CSS_ONLINE;
> @@ -2049,10 +2134,8 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
> idr_init(&root->cgroup_idr);
>
> root->flags = opts->flags;
> - if (opts.release_agent) {
> - ret = ve_set_release_agent_path(root_cgrp,
> - opts.release_agent);
> - }
> + if (opts->release_agent)
> + ve_set_release_agent_path(cgrp, opts->release_agent);
>
> if (opts->name)
> strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
> @@ -2260,6 +2343,7 @@ static void cgroup_kill_sb(struct super_block *sb)
> percpu_ref_kill(&root->cgrp.self.refcnt);
> cgroup_put(&root->cgrp);
> kernfs_kill_sb(sb);
> + ve_cleanup_per_cgroot_data(NULL, &root->cgrp);
> }
>
> struct file_system_type cgroup_fs_type = {
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index da6bd1671afd..176990a4a5a1 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -311,12 +311,13 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot)
> struct per_cgroot_data *data;
> struct cgroup_rcu_string *str;
> struct ve_struct *ve;
> + unsigned long flags;
>
> ve = rcu_dereference(cgroot->ve_owner);
> if (!ve)
> return NULL;
>
> - raw_spin_lock(&ve->per_cgroot_list_lock);
> + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
>
> data = per_cgroot_data_find_locked(&ve->per_cgroot_list, cgroot);
> if (data) {
> @@ -324,7 +325,7 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot)
> if (str)
> result = str->val;
> }
> - raw_spin_unlock(&ve->per_cgroot_list_lock);
> + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
> return result;
> }
>
> @@ -638,7 +639,9 @@ static int ve_start_container(struct ve_struct *ve)
> if (err < 0)
> goto err_iterate;
>
> - cgroup_mark_ve_root(ve);
> + err = cgroup_mark_ve_roots(ve);
> + if (err)
> + goto err_mark_ve;
>
> ve->is_running = 1;
>
> @@ -648,6 +651,8 @@ static int ve_start_container(struct ve_struct *ve)
>
> return 0;
>
> +err_mark_ve:
> + ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> err_iterate:
> ve_workqueue_stop(ve);
> err_workqueue:
> @@ -662,22 +667,35 @@ static int ve_start_container(struct ve_struct *ve)
> return err;
> }
>
> -static void ve_per_cgroot_free(struct ve_struct *ve)
> +static inline void per_cgroot_data_free(struct per_cgroot_data *data)
> +{
> + struct cgroup_rcu_string *release_agent = data->release_agent_path;
> +
> + RCU_INIT_POINTER(data->release_agent_path, NULL);
> + if (release_agent)
> + kfree_rcu(release_agent, rcu_head);
> + kfree(data);
> +}
> +
> +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp)
> {
> struct per_cgroot_data *data, *saved;
> unsigned long flags;
> - struct cgroup_rcu_string *release_agent;
> +
> + BUG_ON(!ve && !cgrp);
> + rcu_read_lock();
> + if (!ve)
> + ve = cgroup_get_ve_owner(cgrp);
>
> spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
> list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) {
> - release_agent = data->release_agent_path;
> - RCU_INIT_POINTER(data->release_agent_path, NULL);
> - if (release_agent)
> - kfree_rcu(release_agent, rcu_head);
> - list_del_init(&data->list);
> - kfree(data);
> + if (!cgrp || data->cgroot == cgrp) {
> + list_del_init(&data->list);
> + per_cgroot_data_free(data);
> + }
> }
> spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
> + rcu_read_unlock();
> }
>
> void ve_stop_ns(struct pid_namespace *pid_ns)
> @@ -736,7 +754,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>
> ve_workqueue_stop(ve);
>
> - ve_per_cgroot_free(ve);
> + ve_cleanup_per_cgroot_data(ve, NULL);
>
> /*
> * At this point all userspace tasks in container are dead.
>