[Devel] [PATCH 2/2 v0] ve/cgroup: Added cross links between cgroups and owning ve

Kirill Tkhai ktkhai at virtuozzo.com
Fri Mar 13 14:08:20 MSK 2020


On 13.03.2020 13:09, Valeriy Vdovin wrote:
> Follow-up patch to the per-cgroup release_agent property. Currently
> there is a problem with release_agent notification semantics: all
> notification processes are spawned under ve0 (the host). Because of
> that, any process inside a container that expects a notification will
> never get it, because the notification never reaches the container's
> namespaces. To address this, the notification process should be
> launched in the namespaces of the ve that owns the cgroup, so we need
> to know the right 've' while spawning a notification process. Here we
> cannot use current->task_ve, because notifications are spawned from a
> dedicated kthread which knows which cgroup needs to be released, but
> knows nothing about which ve 'owns' that cgroup. Naturally, this can
> be solved by adding a 've_owner' field to 'struct cgroup'.
> 
> That alone is not enough, because a cgroup's lifespan is not bound to
> the ve's: the ve might be destroyed earlier than the cgroup, and the
> opposite is also true. At any time the link from a cgroup to its ve
> might become invalid without proper cleanup.
> 
> To manage this we add a list of cgroups to the ve structure. All
> cgroups that need a reference to the owning ve are added to this
> list. At destruction the ve will clear all references to itself from
> the cgroups on this list.
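> 
> In short, the cross links added by this patch look like this (a
> sketch; the fields are the ones introduced below):
> 
> 	struct cgroup {
> 		/* node in the owning ve's referring_cgroups list */
> 		struct list_head ve_reference;
> 		/* backpointer to the owning ve, may become NULL */
> 		struct ve_struct *ve_owner;
> 		/* protects ve_owner */
> 		struct mutex ve_owner_mutex;
> 	};
> 
> 	struct ve_struct {
> 		/* cgroups holding a ve_owner pointer to this ve */
> 		struct list_head referring_cgroups;
> 		/* protects the referring_cgroups list */
> 		spinlock_t referring_cgroups_lock;
> 	};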
> 
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>  include/linux/cgroup.h | 11 ++++++++
>  include/linux/ve.h     | 19 +++++++++++++
>  kernel/cgroup.c        | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/ve/ve.c         | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 178 insertions(+)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index cad5b4f..833f249 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -287,6 +287,17 @@ struct cgroup {
>  	u64 subgroups_limit;
>  
>  	/*
> +	 * Cgroups that have a non-empty release_agent_path get on
> +	 * this list. At ve destruction, the ve will walk this list
> +	 * to unlink itself from each cgroup.
> +	 */
> +	struct list_head ve_reference;
> +
> +	/* ve_owner, responsible for running release agent. */
> +	struct ve_struct *ve_owner;
> +	struct mutex ve_owner_mutex;
> +
> +	/*
>  	 * Per-cgroup path to release agent binary for release
>  	 * notifications.
>  	 */
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 9d60838..49f772c 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -91,6 +91,17 @@ struct ve_struct {
>  
>  	unsigned long		down_at;
>  	struct list_head	cleanup_list;
> +	/*
> +	 * cgroups whose references to this ve must be cleared
> +	 * at this ve's destruction
> +	 */
> +	struct list_head	referring_cgroups;
> +
> +	/*
> +	 * operations on referring_cgroups are performed under this
> +	 * lock
> +	 */
> +	spinlock_t referring_cgroups_lock;
>  	unsigned long		meminfo_val;
>  	int _randomize_va_space;
>  
> @@ -268,6 +279,14 @@ static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>  struct seq_file;
>  struct kernel_cpustat;
>  
> +/*
> + * A cgroup needs to know its owning ve for some operations, but a
> + * cgroup's lifetime is independent of the ve's: in theory the ve can
> + * be destroyed earlier than some of its cgroups.
> + */
> +void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp);
> +void ve_remove_referring_cgroups(struct ve_struct *ve);
> +
>  #if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
>  int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
>  int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 0b64d88..3ecdb5b 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1442,10 +1442,12 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
>  	INIT_LIST_HEAD(&cgrp->allcg_node);
>  	INIT_LIST_HEAD(&cgrp->release_list);
>  	INIT_LIST_HEAD(&cgrp->pidlists);
> +	INIT_LIST_HEAD(&cgrp->ve_reference);
>  	mutex_init(&cgrp->pidlist_mutex);
>  	INIT_LIST_HEAD(&cgrp->event_list);
>  	spin_lock_init(&cgrp->event_list_lock);
>  	simple_xattrs_init(&cgrp->xattrs);
> +	mutex_init(&cgrp->ve_owner_mutex);
>  }
>  
>  static void init_cgroup_root(struct cgroupfs_root *root)
> @@ -2325,6 +2327,14 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>  	if (!cgroup_lock_live_group(cgrp))
>  		return -ENODEV;
>  
> +#ifdef CONFIG_VE
> +	/*
> +	 * This is the only place where we can bind ve information to
> +	 * the cgroup, assuming that the writer does it from inside
> +	 * the right VE.
> +	 */
> +	ve_add_referring_cgroup(get_exec_env(), cgrp);
> +#endif
>  	ret = cgroup_set_release_agent_locked(cgrp, buffer);
>  	mutex_unlock(&cgroup_mutex);
>  	return ret;
> @@ -4534,6 +4544,36 @@ static void css_ref_killed_fn(struct percpu_ref *ref)
>  	cgroup_css_killed(css->cgroup);
>  }
>  
> +/*
> + * cgroup_unreference_ve removes the cgroup's reference to its ve.
> + * ve_owner_mutex ensures the ve pointer stays valid while we check
> + * whether the cgroup is still on the ve's list. If it is not, the
> + * ve is already removing the cgroup itself, so we don't need to.
> + */
> +void cgroup_unreference_ve(struct cgroup *cgrp)
> +{
> +	struct list_head *pos;
> +	/*
> +	 * It is only safe to query ve_owner under its lock, because
> +	 * the ve could zero it out during its destruction step.
> +	 */
> +	mutex_lock(&cgrp->ve_owner_mutex);
> +	if (cgrp->ve_owner) {
> +		struct ve_struct *ve = cgrp->ve_owner;
> +		spin_lock(&ve->referring_cgroups_lock);
> +		list_for_each(pos, &ve->referring_cgroups) {
> +			if (pos == &cgrp->ve_reference) {
> +				list_del(&cgrp->ve_reference);
> +				break;
> +			}
> +		}
> +		spin_unlock(&ve->referring_cgroups_lock);
> +		cgrp->ve_owner = NULL;
> +	}
> +	mutex_unlock(&cgrp->ve_owner_mutex);
> +}
> +
> +
>  /**
>   * cgroup_destroy_locked - the first stage of cgroup destruction
>   * @cgrp: cgroup to be destroyed
> @@ -4645,6 +4685,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>  	}
>  	spin_unlock(&cgrp->event_list_lock);
>  
> +	cgroup_unreference_ve(cgrp);
> +
>  	kfree(cgrp->release_agent_path);
>  
>  	return 0;
> @@ -5455,6 +5497,7 @@ static void cgroup_release_agent(struct work_struct *work)
>  	raw_spin_lock(&release_list_lock);
>  	while (!list_empty(&release_list)) {
>  		char *argv[3], *envp[3];
> +		struct ve_struct *ve;
>  		int i, err;
>  		char *pathbuf = NULL, *agentbuf = NULL;
>  		struct cgroup *root_cgrp;
> @@ -5468,7 +5511,33 @@ static void cgroup_release_agent(struct work_struct *work)
>  			goto continue_free;
>  		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
>  			goto continue_free;
> +
> +		/*
> +		 * root_cgrp is the relative root for cgrp: for host
> +		 * cgroups root_cgrp is root->top_cgroup; for container
> +		 * cgroups it is the nearest ancestor up the parent
> +		 * chain from cgrp that is marked as VE_ROOT.
> +		 */
>  		root_cgrp = cgroup_get_local_root(cgrp);
> +
> +		/*
> +		 * Under the lock we are safe to check that ve_owner is
> +		 * not NULL. While the cgroup holds a non-NULL pointer to
> +		 * the VE, we are safe to take a reference on it, and
> +		 * while the reference is held, we are safe to access its
> +		 * memory. If at this point the VE undergoes destruction
> +		 * steps, at worst call_usermodehelper_fns_ve will
> +		 * gracefully fail, but nobody will care during
> +		 * destruction.
> +		 */
> +		ve = NULL;
> +		mutex_lock(&root_cgrp->ve_owner_mutex);
> +		if (root_cgrp->ve_owner) {
> +			ve = root_cgrp->ve_owner;
> +			get_ve(ve);
> +		}
> +		mutex_unlock(&root_cgrp->ve_owner_mutex);
> +		if (!ve)
> +			goto continue_free;
>  		if (root_cgrp->release_agent_path)
>  			agentbuf = kstrdup(root_cgrp->release_agent_path,
>  				GFP_KERNEL);
> @@ -5490,7 +5559,13 @@ static void cgroup_release_agent(struct work_struct *work)
>  		 * since the exec could involve hitting disk and hence
>  		 * be a slow process */
>  		mutex_unlock(&cgroup_mutex);
> +#ifdef CONFIG_VE
> +		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
> +			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
> +		put_ve(ve);
> +#else
>  		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> +#endif
>  		if (err < 0)
>  			pr_warn_ratelimited("cgroup release_agent "
>  					    "%s %s failed: %d\n",
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index a64b4a7..4ead5cc 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -575,6 +575,13 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
>  	ve->is_running = 0;
>  
>  	/*
> +	 * After is_running is cleared, it is safe to clean up the
> +	 * referring cgroups list: any cgroup that tries to add itself
> +	 * afterwards will fail the is_running check.
> +	 */
> +	ve_remove_referring_cgroups(ve);
> +
> +	/*
>  	 * Neither it can be in pseudosuper state
>  	 * anymore, setup it again if needed.
>  	 */
> @@ -704,6 +711,7 @@ do_init:
>  	INIT_LIST_HEAD(&ve->devices);
>  	INIT_LIST_HEAD(&ve->ve_list);
>  	INIT_LIST_HEAD(&ve->devmnt_list);
> +	INIT_LIST_HEAD(&ve->referring_cgroups);
>  	mutex_init(&ve->devmnt_mutex);
>  
>  #ifdef CONFIG_AIO
> @@ -1713,3 +1721,68 @@ int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat)
>  }
>  EXPORT_SYMBOL(ve_get_cpu_stat);
>  #endif /* CONFIG_CGROUP_SCHED */
> +
> +/*
> + * Add the cgroup to the list of cgroups that hold a pointer to this
> + * ve. The list is protected by referring_cgroups_lock, the ve
> + * pointer by ve_owner_mutex.
> + */
> +void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> +	/*
> +	 * The only case when we could be adding a cgroup while the
> +	 * ve is not running is during or after ve_stop_ns, and by
> +	 * then it is already too late to add.
> +	 */
> +	if (!ve->is_running)
> +		return;
> +
> +	/*
> +	 * ve_owner is protected by the lock, so that the cgroup side
> +	 * can safely call get_ve() when ve_owner is not NULL.
> +	 */
> +	mutex_lock(&cgrp->ve_owner_mutex);
> +	cgrp->ve_owner = ve;
> +	mutex_unlock(&cgrp->ve_owner_mutex);
> +
> +	spin_lock(&ve->referring_cgroups_lock);
> +	list_add_tail(&ve->referring_cgroups, &cgrp->ve_reference);
> +	spin_unlock(&ve->referring_cgroups_lock);

How does this work?

ve is the entry here, and you link it into the ve_reference list.
If this function is called for another cgroup from the same ve,
the ve will be linked into another list. How can it be linked into
two lists at the same time?
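
For reference, list_add_tail(new, head) takes the entry to add first
and the list head second, so presumably the intended call was:

	list_add_tail(&cgrp->ve_reference, &ve->referring_cgroups);

i.e. the cgroup's ve_reference node linked into the ve's
referring_cgroups list, not the other way around.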

> +}
> +
> +/*
> + * ve_remove_referring_cgroups is called at destruction of the VE to
> + * eliminate the back-references that cgroups on its list still hold.
> + */
> +void ve_remove_referring_cgroups(struct ve_struct *ve)
> +{
> +	LIST_HEAD(list);
> +	struct list_head *pos, *next;
> +	BUG_ON(ve->is_running);
> +
> +	/*
> +	 * Splice the ve's list onto the stack to avoid nesting
> +	 * ve_owner_mutex inside referring_cgroups_lock.
> +	 */
> +	spin_lock(&ve->referring_cgroups_lock);
> +	list_splice_init(&ve->referring_cgroups, &list);
> +	spin_unlock(&ve->referring_cgroups_lock);
> +
> +	list_for_each_safe(pos, next, &list) {
> +		struct cgroup *cgrp;
> +		cgrp = list_entry(pos, struct cgroup, ve_reference);
> +		list_del(&cgrp->ve_reference);
> +		/*
> +		 * This lock protects use of the ve's list on the
> +		 * cgroup side.
> +		 * 1. A cgroup removing itself from the ve's list
> +		 * must first get a valid ve_owner pointer; without
> +		 * the lock it could read a seemingly valid pointer
> +		 * and then start working with a freed ve structure.
> +		 * 2. A cgroup may want to get_ve() at some point;
> +		 * the same logic as in 1. applies here.
> +		 */
> +		mutex_lock(&cgrp->ve_owner_mutex);
> +		cgrp->ve_owner = NULL;
> +		mutex_unlock(&cgrp->ve_owner_mutex);
> +	}
> +}
> 


