[Devel] [PATCH rh7] ve: rework veid handling
Kirill Tkhai
ktkhai at odin.com
Thu Sep 24 02:45:46 PDT 2015
On 23.09.2015 17:50, Vladimir Davydov wrote:
> Currently, we reserve veid right on write to ve.veid, returning EEXIST
> if the id is already in use, and free it when the ve cgroup is removed
> (ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split cgroup
> destruction into two steps") ve_offline() was called synchronously on
> rmdir and everything worked fine, but the above mentioned commit changed
> this so that ve_offline() is now called from a workqueue. As a result,
> setting the same ve.veid right after an rmdir-mkdir sequence is likely to
> fail with EEXIST, because the work function removing the id might still
> be in progress. This leads to `vzctl start` failures if the container
> stopped by itself (e.g. called halt), because vzctl recreates the ve
> cgroup directory on start if it already exists.
>
> To fix this issue, this patch reworks veid handling as follows. Now it's
> OK to write any allowed id to ve.veid as long as the container is
> stopped (if it's running EBUSY is returned), and no error will be
> returned even if there is a collision. Actual veid reservation happens
> only on container start (i.e. when START is written to ve.state). If
> ve_start_container() finds that the preferred id is already in use, it
> returns ENOSPC. Correspondingly, veid is freed on container stop
> (ve_stop_ns), not on cgroup removal. This ensures that once the
> container has stopped, it's OK to reuse its id, no matter if its ve
> cgroup is being destroyed or not.
>
> https://jira.sw.ru/browse/PSBM-39338
>
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
> ---
> kernel/ve/ve.c | 71 +++++++++++++++++++++++++---------------------------------
> 1 file changed, 31 insertions(+), 40 deletions(-)
>
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index c67ff3f8d19e..0d1913f14867 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -118,29 +118,36 @@ void put_ve(struct ve_struct *ve)
> }
> EXPORT_SYMBOL(put_ve);
>
> +static int ve_alloc_id(struct ve_struct *ve)
> +{
> + int err;
> +
> + mutex_lock(&ve_list_lock);
> + err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
> + mutex_unlock(&ve_list_lock);
> + return err < 0 ? err : 0;
> +}
> +
> +static void ve_free_id(struct ve_struct *ve)
> +{
> + mutex_lock(&ve_list_lock);
> + idr_remove(&ve_idr, ve->veid);
> + mutex_unlock(&ve_list_lock);
> +}
> +
> static void ve_list_add(struct ve_struct *ve)
> {
> mutex_lock(&ve_list_lock);
> - if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
> - WARN_ON(1);
> list_add(&ve->ve_list, &ve_list_head);
> nr_ve++;
> mutex_unlock(&ve_list_lock);
> }
>
> -static void ve_list_del(struct ve_struct *ve, bool free_id)
> +static void ve_list_del(struct ve_struct *ve)
> {
> mutex_lock(&ve_list_lock);
> - /* Check whether ve linked in list of ve's and unlink ve from list if so */
> - if (!list_empty(&ve->ve_list)) {
> - /* Hide ve from finding by veid */
> - if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
> - WARN_ON(1);
> - list_del_init(&ve->ve_list);
> - nr_ve--;
> - }
> - if (free_id && ve->veid)
> - idr_remove(&ve_idr, ve->veid);
> + list_del_init(&ve->ve_list);
> + nr_ve--;
> mutex_unlock(&ve_list_lock);
> }
>
> @@ -438,6 +445,10 @@ int ve_start_container(struct ve_struct *ve)
> ve_grab_context(ve);
> ve_list_add(ve);
>
> + err = ve_alloc_id(ve);
> + if (err)
> + goto err_id;
I'm confused by the fact that we add the ve to &ve_list_head before we know
the ve's id is unique; for_each_ve() users are able to pick it up at that moment.
Furthermore, it looks confusing that we add the ve to the list before it's
completely initialized.
Can't we move ve_list_add() down to the end of the ve_start_container() function?
> +
> err = ve_start_kthread(ve);
> if (err)
> goto err_kthread;
> @@ -475,7 +486,9 @@ err_legacy_pty:
> err_umh:
> ve_stop_kthread(ve);
> err_kthread:
> - ve_list_del(ve, false);
> + ve_free_id(ve);
> +err_id:
> + ve_list_del(ve);
> ve_drop_context(ve);
> return err;
> }
> @@ -508,6 +521,7 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
> * Stop kernel thread, or zap_pid_ns_processes() would wait it forever.
> */
> ve_stop_kthread(ve);
> + ve_free_id(ve);
> up_write(&ve->op_sem);
> }
>
> @@ -538,7 +552,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
> down_write(&ve->op_sem);
> ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>
> - ve_list_del(ve, false);
> + ve_list_del(ve);
> ve_drop_context(ve);
> up_write(&ve->op_sem);
>
> @@ -644,13 +658,6 @@ err_ve:
> return ERR_PTR(err);
> }
>
> -static void ve_offline(struct cgroup *cg)
> -{
> - struct ve_struct *ve = cgroup_ve(cg);
> -
> - ve_list_del(ve, true);
> -}
> -
> static void ve_devmnt_free(struct ve_devmnt *devmnt)
> {
> if (!devmnt)
> @@ -839,32 +846,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype *cft)
> static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
> {
> struct ve_struct *ve = cgroup_ve(cg);
> - int veid;
> int err = 0;
>
> if (value <= 0 || value > INT_MAX)
> return -EINVAL;
>
> down_write(&ve->op_sem);
> - if (ve->veid) {
> + if (ve->is_running) {
> if (ve->veid != value)
> err = -EBUSY;
> - goto out;
> - }
> -
> - mutex_lock(&ve_list_lock);
> - /* we forbid to start a container without veid (see ve_start_container)
> - * so the ve cannot be on the list */
> - BUG_ON(!list_empty(&ve->ve_list));
> - veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
> - if (veid < 0) {
> - err = veid;
> - if (err == -ENOSPC)
> - err = -EEXIST;
> } else
> - ve->veid = veid;
> - mutex_unlock(&ve_list_lock);
> -out:
> + ve->veid = value;
> up_write(&ve->op_sem);
> return err;
> }
> @@ -1207,7 +1199,6 @@ struct cgroup_subsys ve_subsys = {
> .name = "ve",
> .subsys_id = ve_subsys_id,
> .css_alloc = ve_create,
> - .css_offline = ve_offline,
> .css_free = ve_destroy,
> .can_attach = ve_can_attach,
> .attach = ve_attach,
>
More information about the Devel
mailing list