[Devel] [PATCH RHEL7 COMMIT] ve: rework veid handling

Vladimir Davydov vdavydov at virtuozzo.com
Thu Sep 24 03:55:57 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.6.9
------>
commit b56bb530160090c88ded36e05099655a0e799e19
Author: Vladimir Davydov <vdavydov at parallels.com>
Date:   Thu Sep 24 14:55:57 2015 +0400

    ve: rework veid handling
    
    Currently, we reserve veid right on write to ve.veid, returning EEXIST
    if the id is already in use, and free it when the ve cgroup is removed
    (ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split cgroup
    destruction into two steps") ve_offline() was called synchronously on
    rmdir and everything worked fine, but the above mentioned commit changed
    this so that ve_offline() is now called from a workqueue. As a result,
    Setting the same ve.veid right after rmdir-mkdir sequence is likely to
    fail with EEXIST, because the work function removing the id might still
    be in progress. This leads to `vzctl start` failures if the container
    stopped by itself (e.g. called halt), because vzctl recreates the ve
    cgroup directory on start if it already exists.
    
    To fix this issue, this patch reworks veid handling as follows. Now it's
    OK to write any allowed id to ve.veid as long as the container is
    stopped (if it's running EBUSY is returned), and no error will be
    returned even if there is a collision. Actual veid reservation happens
    only on container start (i.e. when START is written to ve.state). If
    ve_start_container() finds that the preferred id is already in use, it
    returns EEXIST. Correspondingly, veid is freed on container stop
    (ve_exit_ns), not on cgroup removal. This ensures that once the
    container has stopped, it's OK to reuse its id, no matter if its ve
    cgroup is being destroyed or not.
    
    https://jira.sw.ru/browse/PSBM-39338
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    Acked-by: Kirill Tkhai <ktkhai at odin.com>
---
 kernel/ve/ve.c | 67 ++++++++++++++++++++++------------------------------------
 1 file changed, 25 insertions(+), 42 deletions(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index c67ff3f8d19e..aff3b03fcece 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -118,29 +118,31 @@ void put_ve(struct ve_struct *ve)
 }
 EXPORT_SYMBOL(put_ve);
 
-static void ve_list_add(struct ve_struct *ve)
+static int ve_list_add(struct ve_struct *ve)
 {
+	int err;
+
 	mutex_lock(&ve_list_lock);
-	if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
-		WARN_ON(1);
+	err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
+	if (err < 0) {
+		if (err == -ENOSPC)
+			err = -EEXIST;
+		goto out;
+	}
 	list_add(&ve->ve_list, &ve_list_head);
 	nr_ve++;
+	err = 0;
+out:
 	mutex_unlock(&ve_list_lock);
+	return err;
 }
 
-static void ve_list_del(struct ve_struct *ve, bool free_id)
+static void ve_list_del(struct ve_struct *ve)
 {
 	mutex_lock(&ve_list_lock);
-	/* Check whether ve linked in list of ve's and unlink ve from list if so */
-	if (!list_empty(&ve->ve_list)) {
-		/* Hide ve from finding by veid */
-		if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
-			WARN_ON(1);
-		list_del_init(&ve->ve_list);
-		nr_ve--;
-	}
-	if (free_id && ve->veid)
-		idr_remove(&ve_idr, ve->veid);
+	idr_remove(&ve_idr, ve->veid);
+	list_del_init(&ve->ve_list);
+	nr_ve--;
 	mutex_unlock(&ve_list_lock);
 }
 
@@ -436,7 +438,10 @@ int ve_start_container(struct ve_struct *ve)
 	ve->start_jiffies = get_jiffies_64();
 
 	ve_grab_context(ve);
-	ve_list_add(ve);
+
+	err = ve_list_add(ve);
+	if (err)
+		goto err_list;
 
 	err = ve_start_kthread(ve);
 	if (err)
@@ -475,7 +480,8 @@ err_legacy_pty:
 err_umh:
 	ve_stop_kthread(ve);
 err_kthread:
-	ve_list_del(ve, false);
+	ve_list_del(ve);
+err_list:
 	ve_drop_context(ve);
 	return err;
 }
@@ -538,7 +544,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	down_write(&ve->op_sem);
 	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 
-	ve_list_del(ve, false);
+	ve_list_del(ve);
 	ve_drop_context(ve);
 	up_write(&ve->op_sem);
 
@@ -644,13 +650,6 @@ err_ve:
 	return ERR_PTR(err);
 }
 
-static void ve_offline(struct cgroup *cg)
-{
-	struct ve_struct *ve = cgroup_ve(cg);
-
-	ve_list_del(ve, true);
-}
-
 static void ve_devmnt_free(struct ve_devmnt *devmnt)
 {
 	if (!devmnt)
@@ -839,32 +838,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype *cft)
 static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
 {
 	struct ve_struct *ve = cgroup_ve(cg);
-	int veid;
 	int err = 0;
 
 	if (value <= 0 || value > INT_MAX)
 		return -EINVAL;
 
 	down_write(&ve->op_sem);
-	if (ve->veid) {
+	if (ve->is_running || ve->ve_ns) {
 		if (ve->veid != value)
 			err = -EBUSY;
-		goto out;
-	}
-
-	mutex_lock(&ve_list_lock);
-	/* we forbid to start a container without veid (see ve_start_container)
-	 * so the ve cannot be on the list */
-	BUG_ON(!list_empty(&ve->ve_list));
-	veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
-	if (veid < 0) {
-		err = veid;
-		if (err == -ENOSPC)
-			err = -EEXIST;
 	} else
-		ve->veid = veid;
-	mutex_unlock(&ve_list_lock);
-out:
+		ve->veid = value;
 	up_write(&ve->op_sem);
 	return err;
 }
@@ -1207,7 +1191,6 @@ struct cgroup_subsys ve_subsys = {
 	.name		= "ve",
 	.subsys_id	= ve_subsys_id,
 	.css_alloc	= ve_create,
-	.css_offline	= ve_offline,
 	.css_free	= ve_destroy,
 	.can_attach	= ve_can_attach,
 	.attach		= ve_attach,



More information about the Devel mailing list