[Devel] [PATCH VZ8 v1 11/14] ve/cgroup: set release_agent_path for root cgroups separately
Kirill Tkhai
ktkhai at virtuozzo.com
Mon Jan 25 15:53:53 MSK 2021
On 20.01.2021 12:56, Valeriy Vdovin wrote:
> This is done so that each container could set its own release agent.
> Release agent information is now stored in per-cgroup-root data
> structure in ve.
>
> https://jira.sw.ru/browse/PSBM-83887
>
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> +++
> ve/cgroup: change resource release order in ve_drop_context
>
> This fixes 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c
> In the mentioned patch in cgroup_show_options ve->ve_ns is checked to
> ensure that ve->root_css_set is usable. But in ve_drop_context
> root_css_set is being released before ve_ns, which is a bug.
> root_css_set will now be set to NULL after ve_ns is released.
> This reordering only affects the described piece of code in
> cgroup_show_options.
>
> https://jira.sw.ru/browse/PSBM-121438
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>
> +++
> cgroup: do not use cgroup_mutex in cgroup_show_options
>
> In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options
> started to lock cgroup_mutex, which introduced new deadlock possibility,
> described below:
>
> Thread A:
> m_start() --> down_read(&namespace_sem);
> cgroup_show_options() --> mutex_lock(&cgroup_mutex);
>
> Thread B:
> attach_task_by_pid()
> cgroup_lock_live_group --> mutex_lock(&cgroup_mutex);
> threadgroup_lock() --> down_write(&tsk->signal->group_rwsem);
>
> Thread C:
> copy_process
> threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem);
> copy_namespaces
> create_new_namespaces
> copy_mnt_ns
> namespace_lock() --> down_write(&namespace_sem)
>
> Clearly cgroup_mutex cannot be locked right after locking
> namespace_sem, because the opposite locking order is also present in
> the code, so the cgroup_mutex lock should be removed from
> cgroup_show_options.
> After reviewing cgroup_show_options, it was established that
> cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp.
> It was used in combination with a call to task_cgroup_from_root to
> ensure that root_cgrp lived long enough to access its value of
> release_agent path.
> But in this function we know that root_cgrp is part of ve->root_css_set,
> which holds reference to it. In turn root_css_set is referenced while
> ve->ve_ns is not NULL, the check of which we already have in the code.
> This means that root_cgrp is valid until ve->ve_ns is valid.
> ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context,
> that's why rcu_read_lock should be maintained all the time when root_cgrp
> is being accessed.
> The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls
> to this function pass ve->root_css_set as an argument and the above logic
> applies.
>
> https://jira.sw.ru/browse/PSBM-121438
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>
> +++
> ve: cleanup in function ve_get_release_agent_path
>
> (Cherry-picked from f1199bd9589b7c0914343dcc72f49ddaa9b98496)
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>
> ---
> include/linux/cgroup-defs.h | 3 --
> include/linux/ve.h | 6 +++
> kernel/cgroup/cgroup-internal.h | 4 +-
> kernel/cgroup/cgroup-v1.c | 86 ++++++++++++++++++++++-----------
> kernel/cgroup/cgroup.c | 9 ++--
> kernel/ve/ve.c | 76 +++++++++++++++++++++++++++++
> 6 files changed, 150 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index d6fe95320819..4404862c1eaf 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -571,9 +571,6 @@ struct cgroup_root {
> /* IDs for cgroups in this hierarchy */
> struct idr cgroup_idr;
>
> - /* The path to use for release notifications. */
> - char release_agent_path[PATH_MAX];
> -
> /* The name for this hierarchy - may be empty */
> char name[MAX_CGROUP_ROOT_NAMELEN];
> };
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 44369dddeb24..65c19f2b9b98 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -144,6 +144,12 @@ extern int nr_ve;
> #ifdef CONFIG_VE
> void ve_add_to_release_list(struct cgroup *cgrp);
> void ve_rm_from_release_list(struct cgroup *cgrp);
> +
> +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> + const char *release_agent);
> +
> +const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
> +
> extern struct ve_struct *get_ve(struct ve_struct *ve);
> extern void put_ve(struct ve_struct *ve);
>
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 4de66630d456..be0cd157d4dc 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -160,6 +160,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
> return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
> }
>
> +struct cgroup *cset_cgroup_from_root(struct css_set *cset,
> + struct cgroup_root *root);
> +
> bool cgroup_ssid_enabled(int ssid);
> bool cgroup_on_dfl(const struct cgroup *cgrp);
> bool cgroup_is_thread_root(struct cgroup *cgrp);
> @@ -180,7 +183,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
> struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
> struct cgroup_root *root, unsigned long magic,
> struct cgroup_namespace *ns);
> -
> int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
> void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
> void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index c1891317ae3a..31585ab907a3 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -37,12 +37,6 @@ static bool cgroup_no_v1_named;
> */
> static struct workqueue_struct *cgroup_pidlist_destroy_wq;
>
> -/*
> - * Protects cgroup_subsys->release_agent_path. Modifying it also requires
> - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
> - */
> -static DEFINE_SPINLOCK(release_agent_path_lock);
> -
> bool cgroup1_ssid_disabled(int ssid)
> {
> return cgroup_no_v1_mask & (1 << ssid);
> @@ -559,27 +553,36 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
> char *buf, size_t nbytes, loff_t off)
> {
> struct cgroup *cgrp;
> -
> - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
> + int ret = 0;
> + struct cgroup *root_cgrp;
>
> cgrp = cgroup_kn_lock_live(of->kn, false);
> - if (!cgrp)
> - return -ENODEV;
> - spin_lock(&release_agent_path_lock);
> - strlcpy(cgrp->root->release_agent_path, strstrip(buf),
> - sizeof(cgrp->root->release_agent_path));
> - spin_unlock(&release_agent_path_lock);
> + root_cgrp = cgroup_get_local_root(cgrp);
> + BUG_ON(!root_cgrp);
> + if (root_cgrp->ve_owner)
> + ret = ve_set_release_agent_path(root_cgrp, buffer);
> + else
> + ret = -ENODEV;
> +
> cgroup_kn_unlock(of->kn);
> - return nbytes;
> + return ret;
> }
>
> static int cgroup_release_agent_show(struct seq_file *seq, void *v)
> {
> + const char *release_agent;
> + struct cgroup *root_cgrp;
> struct cgroup *cgrp = seq_css(seq)->cgroup;
>
> - spin_lock(&release_agent_path_lock);
> - seq_puts(seq, cgrp->root->release_agent_path);
> - spin_unlock(&release_agent_path_lock);
> + root_cgrp = cgroup_get_local_root(cgrp);
> + if (root_cgrp->ve_owner) {
> + rcu_read_lock();
> + release_agent = ve_get_release_agent_path(root_cgrp);
> +
> + if (release_agent)
> + seq_puts(seq, release_agent);
> + rcu_read_unlock();
> + }
> seq_putc(seq, '\n');
> return 0;
> }
> @@ -950,8 +953,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
>
> static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
> {
> + const char *release_agent;
> struct cgroup_root *root = cgroup_root_from_kf(kf_root);
> struct cgroup_subsys *ss;
> + struct cgroup *root_cgrp = &root->cgrp;
> int ssid;
>
> for_each_subsys(ss, ssid)
> @@ -964,12 +969,36 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
> if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
> seq_puts(seq, ",cpuset_v2_mode");
>
> - spin_lock(&release_agent_path_lock);
> - if (strlen(root->release_agent_path))
> - seq_show_option(seq, "release_agent",
> - root->release_agent_path);
> - spin_unlock(&release_agent_path_lock);
> -
> + rcu_read_lock();
> +#ifdef CONFIG_VE
> + {
> + struct ve_struct *ve = get_exec_env();
> + struct css_set *cset;
> + struct nsproxy *ve_ns;
> +
> + if (!ve_is_super(ve)) {
> + /*
> + * ve->init_task is NULL in case when cgroup is accessed
> + * before ve_start_container has been called.
> + *
> + * ve->init_task is synchronized via ve->ve_ns rcu, see
> + * ve_grab_context/drop_context.
> + */
> + ve_ns = rcu_dereference(ve->ve_ns);
> + if (ve_ns) {
> + spin_lock_irq(&css_set_lock);
> + cset = ve_ns->cgroup_ns->root_cset;
> + BUG_ON(!cset);
> + root_cgrp = cset_cgroup_from_root(cset, root);
> + spin_unlock_irq(&css_set_lock);
> + }
> + }
> + }
> +#endif
> + release_agent = ve_get_release_agent_path(root_cgrp);
> + if (release_agent && release_agent[0])
> + seq_show_option(seq, "release_agent", release_agent);
> + rcu_read_unlock();
> if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
> seq_puts(seq, ",clone_children");
> if (strlen(root->name))
> @@ -1160,9 +1189,12 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
> WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
>
> if (opts.release_agent) {
> - spin_lock(&release_agent_path_lock);
> - strcpy(root->release_agent_path, opts.release_agent);
> - spin_unlock(&release_agent_path_lock);
> + struct cgroup *root_cgrp;
> +
> + root_cgrp = cgroup_get_local_root(&root->cgrp);
> + if (root_cgrp->ve_owner)
> + ret = ve_set_release_agent_path(root_cgrp,
> + opts.release_agent);
> }
>
> trace_cgroup_remount(root);
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index e1d4663780d7..a14e6feabc54 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1434,7 +1434,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
> }
>
> /* look up cgroup associated with given css_set on the specified hierarchy */
> -static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
> +struct cgroup *cset_cgroup_from_root(struct css_set *cset,
> struct cgroup_root *root)
> {
> struct cgroup *res = NULL;
> @@ -2049,8 +2049,11 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
> idr_init(&root->cgroup_idr);
>
> root->flags = opts->flags;
> - if (opts->release_agent)
> - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
> + if (opts.release_agent) {
> + ret = ve_set_release_agent_path(root_cgrp,
> + opts.release_agent);
> + }
> +
> if (opts->name)
> strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
> if (opts->cpuset_clone_children)
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 738f8b6465d4..da6bd1671afd 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -36,6 +36,12 @@ struct per_cgroot_data {
> * data is related to this cgroup
> */
> struct cgroup *cgroot;
> +
> + /*
> + * path to release agent binary, that should
> + * be spawned for all cgroups under this cgroup root
> + */
> + struct cgroup_rcu_string __rcu *release_agent_path;
> };
>
> extern struct kmapset_set sysfs_ve_perms_set;
> @@ -257,6 +263,71 @@ static inline struct per_cgroot_data *per_cgroot_get_or_create(
> return data;
> }
>
> +int ve_set_release_agent_path(struct cgroup *cgroot,
> + const char *release_agent)
> +{
> + struct ve_struct *ve;
> + unsigned long flags;
> + struct per_cgroot_data *data;
> + struct cgroup_rcu_string *new_path, *old_path;
> + int nbytes;
> +
> + /*
> + * caller should grab cgroup_mutex to safely use
> + * ve_owner field
> + */
> + ve = cgroot->ve_owner;
> + BUG_ON(!ve);
> +
> + nbytes = strlen(release_agent);
> + new_path = cgroup_rcu_strdup(release_agent, nbytes);
> + if (IS_ERR(new_path))
> + return PTR_ERR(new_path);
> +
> + data = per_cgroot_get_or_create(ve, cgroot);
> + if (IS_ERR(data)) {
> + kfree(new_path);
> + return PTR_ERR(data);
> + }
> +
> + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
> +
> + old_path = rcu_dereference_protected(data->release_agent_path,
> + lockdep_is_held(&ve->per_cgroot_list_lock));
> +
> + rcu_assign_pointer(data->release_agent_path, new_path);
> + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
> +
> + if (old_path)
> + kfree_rcu(old_path, rcu_head);
> +
> + return 0;
> +}
> +
> +const char *ve_get_release_agent_path(struct cgroup *cgroot)
> +{
> + /* caller must grab rcu_read_lock */
> + const char *result = NULL;
> + struct per_cgroot_data *data;
> + struct cgroup_rcu_string *str;
> + struct ve_struct *ve;
> +
> + ve = rcu_dereference(cgroot->ve_owner);
> + if (!ve)
> + return NULL;
> +
> + raw_spin_lock(&ve->per_cgroot_list_lock);
> +
> + data = per_cgroot_data_find_locked(&ve->per_cgroot_list, cgroot);
> + if (data) {
> + str = rcu_dereference(data->release_agent_path);
> + if (str)
> + result = str->val;
> + }
> + raw_spin_unlock(&ve->per_cgroot_list_lock);
> + return result;
> +}
> +
> struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id)
> {
> struct cgroup_subsys_state *css;
> @@ -595,9 +666,14 @@ static void ve_per_cgroot_free(struct ve_struct *ve)
> {
> struct per_cgroot_data *data, *saved;
> unsigned long flags;
> + struct cgroup_rcu_string *release_agent;
>
> spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
> list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) {
> + release_agent = data->release_agent_path;
> + RCU_INIT_POINTER(data->release_agent_path, NULL);
> + if (release_agent)
> + kfree_rcu(release_agent, rcu_head);
> list_del_init(&data->list);
> kfree(data);
> }
>
More information about the Devel
mailing list