[Devel] [PATCH rh7] ve: rework veid handling
Kirill Tkhai
ktkhai at odin.com
Thu Sep 24 02:45:46 PDT 2015
On 23.09.2015 17:50, Vladimir Davydov wrote:
> Currently, we reserve veid right on write to ve.veid, returning EEXIST
> if the id is already in use, and free it when the ve cgroup is removed
> (ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split cgroup
> destruction into two steps") ve_offline() was called synchronously on
> rmdir and everything worked fine, but the above mentioned commit changed
> this so that ve_offline() is now called from a workqueue. As a result,
> setting the same ve.veid right after an rmdir-mkdir sequence is likely to
> fail with EEXIST, because the work function removing the id might still
> be in progress. This leads to `vzctl start` failures if the container
> stopped by itself (e.g. called halt), because vzctl recreates the ve
> cgroup directory on start if it already exists.
>
> To fix this issue, this patch reworks veid handling as follows. Now it's
> OK to write any allowed id to ve.veid as long as the container is
> stopped (if it's running EBUSY is returned), and no error will be
> returned even if there is a collision. Actual veid reservation happens
> only on container start (i.e. when START is written to ve.state). If
> ve_start_container() finds that the preferred id is already in use, it
> returns ENOSPC. Correspondingly, veid is freed on container stop
> (ve_stop_ns), not on cgroup removal. This ensures that once the
> container has stopped, it's OK to reuse its id, no matter if its ve
> cgroup is being destroyed or not.
>
> https://jira.sw.ru/browse/PSBM-39338
>
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
> ---
> kernel/ve/ve.c | 71 +++++++++++++++++++++++++---------------------------------
> 1 file changed, 31 insertions(+), 40 deletions(-)
>
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index c67ff3f8d19e..0d1913f14867 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -118,29 +118,36 @@ void put_ve(struct ve_struct *ve)
> }
> EXPORT_SYMBOL(put_ve);
>
> +static int ve_alloc_id(struct ve_struct *ve)
> +{
> + int err;
> +
> + mutex_lock(&ve_list_lock);
> + err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
> + mutex_unlock(&ve_list_lock);
> + return err < 0 ? err : 0;
> +}
> +
> +static void ve_free_id(struct ve_struct *ve)
> +{
> + mutex_lock(&ve_list_lock);
> + idr_remove(&ve_idr, ve->veid);
> + mutex_unlock(&ve_list_lock);
> +}
> +
> static void ve_list_add(struct ve_struct *ve)
> {
> mutex_lock(&ve_list_lock);
> - if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
> - WARN_ON(1);
> list_add(&ve->ve_list, &ve_list_head);
> nr_ve++;
> mutex_unlock(&ve_list_lock);
> }
>
> -static void ve_list_del(struct ve_struct *ve, bool free_id)
> +static void ve_list_del(struct ve_struct *ve)
> {
> mutex_lock(&ve_list_lock);
> - /* Check whether ve linked in list of ve's and unlink ve from list if so */
> - if (!list_empty(&ve->ve_list)) {
> - /* Hide ve from finding by veid */
> - if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
> - WARN_ON(1);
> - list_del_init(&ve->ve_list);
> - nr_ve--;
> - }
> - if (free_id && ve->veid)
> - idr_remove(&ve_idr, ve->veid);
> + list_del_init(&ve->ve_list);
> + nr_ve--;
> mutex_unlock(&ve_list_lock);
> }
>
> @@ -438,6 +445,10 @@ int ve_start_container(struct ve_struct *ve)
> ve_grab_context(ve);
> ve_list_add(ve);
>
> + err = ve_alloc_id(ve);
> + if (err)
> + goto err_id;
I'm confused by the fact that we add the ve to &ve_list_head before we know
the ve's id is unique; for_each_ve() users are able to pick it up at that moment.
Furthermore, it looks confusing that we add the ve to the list before it's
completely initialized.
Can't we move ve_list_add() down to the end of the ve_start_container() function?
> +
> err = ve_start_kthread(ve);
> if (err)
> goto err_kthread;
> @@ -475,7 +486,9 @@ err_legacy_pty:
> err_umh:
> ve_stop_kthread(ve);
> err_kthread:
> - ve_list_del(ve, false);
> + ve_free_id(ve);
> +err_id:
> + ve_list_del(ve);
> ve_drop_context(ve);
> return err;
> }
> @@ -508,6 +521,7 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
> * Stop kernel thread, or zap_pid_ns_processes() would wait it forever.
> */
> ve_stop_kthread(ve);
> + ve_free_id(ve);
> up_write(&ve->op_sem);
> }
>
> @@ -538,7 +552,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
> down_write(&ve->op_sem);
> ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>
> - ve_list_del(ve, false);
> + ve_list_del(ve);
> ve_drop_context(ve);
> up_write(&ve->op_sem);
>
> @@ -644,13 +658,6 @@ err_ve:
> return ERR_PTR(err);
> }
>
> -static void ve_offline(struct cgroup *cg)
> -{
> - struct ve_struct *ve = cgroup_ve(cg);
> -
> - ve_list_del(ve, true);
> -}
> -
> static void ve_devmnt_free(struct ve_devmnt *devmnt)
> {
> if (!devmnt)
> @@ -839,32 +846,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype *cft)
> static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
> {
> struct ve_struct *ve = cgroup_ve(cg);
> - int veid;
> int err = 0;
>
> if (value <= 0 || value > INT_MAX)
> return -EINVAL;
>
> down_write(&ve->op_sem);
> - if (ve->veid) {
> + if (ve->is_running) {
> if (ve->veid != value)
> err = -EBUSY;
> - goto out;
> - }
> -
> - mutex_lock(&ve_list_lock);
> - /* we forbid to start a container without veid (see ve_start_container)
> - * so the ve cannot be on the list */
> - BUG_ON(!list_empty(&ve->ve_list));
> - veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
> - if (veid < 0) {
> - err = veid;
> - if (err == -ENOSPC)
> - err = -EEXIST;
> } else
> - ve->veid = veid;
> - mutex_unlock(&ve_list_lock);
> -out:
> + ve->veid = value;
> up_write(&ve->op_sem);
> return err;
> }
> @@ -1207,7 +1199,6 @@ struct cgroup_subsys ve_subsys = {
> .name = "ve",
> .subsys_id = ve_subsys_id,
> .css_alloc = ve_create,
> - .css_offline = ve_offline,
> .css_free = ve_destroy,
> .can_attach = ve_can_attach,
> .attach = ve_attach,
>
More information about the Devel
mailing list