[Devel] [PATCH RHEL7 COMMIT] ve/cgroup: Add pseudosuper state for restore sake

Konstantin Khorenko khorenko at virtuozzo.com
Mon Jan 18 02:43:52 PST 2016


The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.9.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.9.22
------>
commit 91857f316240a0d5d7b0bb25a7979b41c1290d4b
Author: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Date:   Mon Jan 18 14:43:52 2016 +0400

    ve/cgroup: Add pseudosuper state for restore sake
    
    Currently we allow mounting cgroups from inside a VE's context for
    the sake of restore. But this will be a problem in the future: every
    new mount from inside a VE actually degrades kernel performance.
    
    To address this we introduce a so-called "pseudosuper" state of a
    container. This cgroup member can only be set from the ve0 context but
    can be dropped from any context (including veX). This allows us to restore
    a container and then forbid mounting cgroups once the restore is done.
    
    In fact there are three players: the kernel itself, which checks the
    pseudosuper status; libvzctl, which sets up this status when it
    starts or restores a container; and criu, which drops this status once
    it completes restoring cgroups (by calling a libvzctl script upon
    namespace creation).
    
    https://jira.sw.ru/browse/PSBM-34299
    https://jira.sw.ru/browse/PSBM-43169
    https://jira.sw.ru/browse/PSBM-42573
    
    Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
    CC: Vladimir Davydov <vdavydov at virtuozzo.com>
    CC: Konstantin Khorenko <khorenko at virtuozzo.com>
    CC: Andrey Vagin <avagin at virtuozzo.com>
    CC: Igor Sukhih <igor at parallels.com>
    CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
 include/linux/ve.h |  1 +
 kernel/cgroup.c    | 11 ++---------
 kernel/ve/ve.c     | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index b9374a1..b206526 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -49,6 +49,7 @@ struct ve_struct {
 	struct rw_semaphore	op_sem;
 	int			is_running;
 	int			is_locked;
+	int			is_pseudosuper;
 	atomic_t		suspend;
 	/* see vzcalluser.h for VE_FEATURE_XXX definitions */
 	__u64			features;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a80b8bf..8b74c39 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1604,15 +1604,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 #ifdef CONFIG_VE
 	if (!ve_is_super(get_exec_env()) && !(flags & MS_KERNMOUNT)) {
-		/*
-		 * We should allow mounting cgroups from inside of
-		 * VE only when VE inside a special "restoring" state.
-		 * At moment we don't have yet this state implemented
-		 * but to not block the container from the restore
-		 * lets allow this temporarily.
-		 */
-		/* return ERR_PTR(-EACCES); */
-		pr_warn_once("FIXME: Mounting cgroups from inside of VE, restore?");
+		if (!get_exec_env()->is_pseudosuper)
+			return ERR_PTR(-EACCES);
 	}
 #endif
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 1f8e5da..40ce827 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -68,6 +68,7 @@ struct ve_struct ve0 = {
 	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
 	.ve_netns		= &init_net,
 	.is_running		= 1,
+	.is_pseudosuper		= 1,
 #ifdef CONFIG_VE_IPTABLES
 	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
 #endif
@@ -532,6 +533,12 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
 	 */
 	ve->is_running = 0;
 
+	/*
+	 * It can no longer be in pseudosuper state
+	 * either; set it up again if needed.
+	 */
+	ve->is_pseudosuper = 0;
+
 	ve_tty_console_fini(ve);
 	ve_legacy_pty_fini(ve);
 
@@ -1146,6 +1153,7 @@ enum {
 	VE_CF_STATE,
 	VE_CF_FEATURES,
 	VE_CF_IPTABLES_MASK,
+	VE_CF_PSEUDOSUPER,
 };
 
 static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
@@ -1156,6 +1164,37 @@ static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
 	else if (cft->private == VE_CF_IPTABLES_MASK)
 		return cgroup_ve(cg)->ipt_mask;
 #endif
+	else if (cft->private == VE_CF_PSEUDOSUPER)
+		return cgroup_ve(cg)->is_pseudosuper;
+	return 0;
+}
+
+/*
+ * Put a VE into (or take it out of) the "pseudosuper" state, in which
+ * some privileged operations, such as mounting cgroups from inside
+ * the VE context, are allowed, e.g. for the sake of container restore.
+ *
+ * Any context may clear the flag, but setting it requires the ve0
+ * (super) context (-EPERM otherwise) and a VE that is neither
+ * running nor has ve_ns set (-EBUSY otherwise).
+ */
+static int ve_write_pseudosuper(struct cgroup *cg,
+				struct cftype *cft,
+				u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (!ve_is_super(get_exec_env()) && value)
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (value && (ve->is_running || ve->ve_ns)) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+	ve->is_pseudosuper = value;
+	up_write(&ve->op_sem);
+
 	return 0;
 }
 
@@ -1225,6 +1264,13 @@ static struct cftype ve_cftypes[] = {
 		.write_u64		= ve_write_u64,
 		.private		= VE_CF_IPTABLES_MASK,
 	},
+	{
+		.name			= "pseudosuper",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_pseudosuper,
+		.private		= VE_CF_PSEUDOSUPER,
+	},
 	{ }
 };
 


More information about the Devel mailing list