[Devel] [PATCH rh7] ve: rework veid handling

Vladimir Davydov vdavydov at parallels.com
Wed Sep 23 07:50:29 PDT 2015


Currently, we reserve the veid right on write to ve.veid, returning
EEXIST if the id is already in use, and free it when the ve cgroup is
removed (ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split
cgroup destruction into two steps"), ve_offline() was called
synchronously on rmdir and everything worked fine, but that commit
changed this so that ve_offline() is now called from a workqueue. As a
result, setting the same ve.veid right after an rmdir-mkdir sequence is
likely to fail with EEXIST, because the work function removing the id
might still be in progress. This leads to `vzctl start` failures if the
container stopped by itself (e.g. called halt), because vzctl recreates
the ve cgroup directory on start if it already exists.
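
For reference, the failing sequence is roughly what vzctl does on
start. Below is a hedged userspace reproduction sketch (not part of the
patch); the cgroup mount point and container id are assumptions made
for illustration only:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		/* Assumed path: the ve cgroup of a container with id 101. */
		const char *dir = "/sys/fs/cgroup/ve/101";
		char path[128];
		int fd;

		/* The container halted itself; its ve cgroup still exists,
		 * so it gets removed and recreated on start. */
		rmdir(dir);		/* queues ve_offline() on a workqueue */
		mkdir(dir, 0755);

		snprintf(path, sizeof(path), "%s/ve.veid", dir);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return 1;

		/* Without this patch, the write may fail with EEXIST if the
		 * workqueue has not freed the old id yet. */
		if (write(fd, "101", 3) < 0)
			perror("write ve.veid");
		close(fd);
		return 0;
	}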

To fix this issue, this patch reworks veid handling as follows. It is
now OK to write any allowed id to ve.veid as long as the container is
stopped (writing a different id to a running container returns EBUSY),
and no error is returned even if the id collides with another ve's. The
actual veid reservation happens only on container start (i.e. when
START is written to ve.state): if ve_start_container() finds that the
preferred id is already in use, it fails with ENOSPC. Correspondingly,
the veid is freed on container stop (ve_stop_ns), not on cgroup
removal. This ensures that once the container has stopped, its id can
be reused regardless of whether its ve cgroup has been destroyed yet.
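
For clarity, the new ve_alloc_id() leans on idr_alloc()'s range
semantics: asking for [veid, veid + 1) reserves exactly the preferred
id and fails with -ENOSPC when that id is already taken. A minimal
sketch of those semantics (the demo_* names are assumptions for the
example, only the idr calls are real kernel API):

	#include <linux/idr.h>
	#include <linux/module.h>

	static DEFINE_IDR(demo_idr);

	/* Reserve exactly 'id'; returns 'id' on success, -ENOSPC if busy. */
	static int demo_reserve(int id, void *owner)
	{
		return idr_alloc(&demo_idr, owner, id, id + 1, GFP_KERNEL);
	}

	static int __init demo_init(void)
	{
		int first  = demo_reserve(101, (void *)1);  /* -> 101     */
		int second = demo_reserve(101, (void *)2);  /* -> -ENOSPC */

		pr_info("first=%d second=%d\n", first, second);
		idr_remove(&demo_idr, 101);	/* like ve_free_id() on stop */
		return 0;
	}
	module_init(demo_init);
	MODULE_LICENSE("GPL");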

https://jira.sw.ru/browse/PSBM-39338

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 kernel/ve/ve.c | 71 +++++++++++++++++++++++++---------------------------------
 1 file changed, 31 insertions(+), 40 deletions(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index c67ff3f8d19e..0d1913f14867 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -118,29 +118,36 @@ void put_ve(struct ve_struct *ve)
 }
 EXPORT_SYMBOL(put_ve);
 
+static int ve_alloc_id(struct ve_struct *ve)
+{
+	int err;
+
+	mutex_lock(&ve_list_lock);
+	err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
+	mutex_unlock(&ve_list_lock);
+	return err < 0 ? err : 0;
+}
+
+static void ve_free_id(struct ve_struct *ve)
+{
+	mutex_lock(&ve_list_lock);
+	idr_remove(&ve_idr, ve->veid);
+	mutex_unlock(&ve_list_lock);
+}
+
 static void ve_list_add(struct ve_struct *ve)
 {
 	mutex_lock(&ve_list_lock);
-	if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
-		WARN_ON(1);
 	list_add(&ve->ve_list, &ve_list_head);
 	nr_ve++;
 	mutex_unlock(&ve_list_lock);
 }
 
-static void ve_list_del(struct ve_struct *ve, bool free_id)
+static void ve_list_del(struct ve_struct *ve)
 {
 	mutex_lock(&ve_list_lock);
-	/* Check whether ve linked in list of ve's and unlink ve from list if so */
-	if (!list_empty(&ve->ve_list)) {
-		/* Hide ve from finding by veid */
-		if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
-			WARN_ON(1);
-		list_del_init(&ve->ve_list);
-		nr_ve--;
-	}
-	if (free_id && ve->veid)
-		idr_remove(&ve_idr, ve->veid);
+	list_del_init(&ve->ve_list);
+	nr_ve--;
 	mutex_unlock(&ve_list_lock);
 }
 
@@ -438,6 +445,10 @@ int ve_start_container(struct ve_struct *ve)
 	ve_grab_context(ve);
 	ve_list_add(ve);
 
+	err = ve_alloc_id(ve);
+	if (err)
+		goto err_id;
+
 	err = ve_start_kthread(ve);
 	if (err)
 		goto err_kthread;
@@ -475,7 +486,9 @@ err_legacy_pty:
 err_umh:
 	ve_stop_kthread(ve);
 err_kthread:
-	ve_list_del(ve, false);
+	ve_free_id(ve);
+err_id:
+	ve_list_del(ve);
 	ve_drop_context(ve);
 	return err;
 }
@@ -508,6 +521,7 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
 	 * Stop kernel thread, or zap_pid_ns_processes() would wait it forever.
 	 */
 	ve_stop_kthread(ve);
+	ve_free_id(ve);
 	up_write(&ve->op_sem);
 }
 
@@ -538,7 +552,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	down_write(&ve->op_sem);
 	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 
-	ve_list_del(ve, false);
+	ve_list_del(ve);
 	ve_drop_context(ve);
 	up_write(&ve->op_sem);
 
@@ -644,13 +658,6 @@ err_ve:
 	return ERR_PTR(err);
 }
 
-static void ve_offline(struct cgroup *cg)
-{
-	struct ve_struct *ve = cgroup_ve(cg);
-
-	ve_list_del(ve, true);
-}
-
 static void ve_devmnt_free(struct ve_devmnt *devmnt)
 {
 	if (!devmnt)
@@ -839,32 +846,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype *cft)
 static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
 {
 	struct ve_struct *ve = cgroup_ve(cg);
-	int veid;
 	int err = 0;
 
 	if (value <= 0 || value > INT_MAX)
 		return -EINVAL;
 
 	down_write(&ve->op_sem);
-	if (ve->veid) {
+	if (ve->is_running) {
 		if (ve->veid != value)
 			err = -EBUSY;
-		goto out;
-	}
-
-	mutex_lock(&ve_list_lock);
-	/* we forbid to start a container without veid (see ve_start_container)
-	 * so the ve cannot be on the list */
-	BUG_ON(!list_empty(&ve->ve_list));
-	veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
-	if (veid < 0) {
-		err = veid;
-		if (err == -ENOSPC)
-			err = -EEXIST;
 	} else
-		ve->veid = veid;
-	mutex_unlock(&ve_list_lock);
-out:
+		ve->veid = value;
 	up_write(&ve->op_sem);
 	return err;
 }
@@ -1207,7 +1199,6 @@ struct cgroup_subsys ve_subsys = {
 	.name		= "ve",
 	.subsys_id	= ve_subsys_id,
 	.css_alloc	= ve_create,
-	.css_offline	= ve_offline,
 	.css_free	= ve_destroy,
 	.can_attach	= ve_can_attach,
 	.attach		= ve_attach,
-- 
2.1.4