[Devel] [RFC rhel7] Disabling mounting cgroups from inside of container

Cyrill Gorcunov gorcunov at virtuozzo.com
Sat Jan 16 12:13:15 PST 2016


Guys, we've found a problem in cgorups management code: currently we
allow to mount cgroups from inside of veX context which have a few
problems:

 - performance issue (as Vladimir always pointed)
 - security issue (as been fixed by Stas in commit
   1867565c8c6df8c2a18e391d9e6d721cf29e251e)

I propose to being pseudosuper state which we gonna use
on restore procedure and disable mounting cgroups from
inside of veX context.

All cgroups needed should be prepared upon containers
starup procedure and nothing else allowed.

Please see changelogs for the patches attached.

	Cyrill
-------------- next part --------------
From: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Subject: [RFC rh7] ve/cgroup: Add pseudosuper state for restore sake

Currently we allow to mount cgroups from inside of VEs context for
restore sake. But this will be a problem in future: every new mount
from inside of VE is actually degradates kernel performance.

For this we introduce that named "pseudosuper" state of a container.
This cgroup member can be only set up from ve0 context but dropped
off from any context (including veX). Which allows us to restore
container and bring inability to mount cgroups once restore is done.

In fact there are three players: the kernel itself which check for
pseudosuper status, the libvzctl which setup this status when
start and restore container, and criu which drops this status once
it complete restoring cgroups (calling libvzctl script upon namespace
creation).

https://jira.sw.ru/browse/PSBM-34299
https://jira.sw.ru/browse/PSBM-43169
https://jira.sw.ru/browse/PSBM-42573

Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
CC: Andrey Vagin <avagin at virtuozzo.com>
CC: Igor Sukhih <igor at parallels.com>
CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
 include/linux/ve.h |    1 +
 kernel/cgroup.c    |   11 ++---------
 kernel/ve/ve.c     |   46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 9 deletions(-)

Index: linux-pcs7.git/include/linux/ve.h
===================================================================
--- linux-pcs7.git.orig/include/linux/ve.h
+++ linux-pcs7.git/include/linux/ve.h
@@ -49,6 +49,7 @@ struct ve_struct {
 	struct rw_semaphore	op_sem;
 	int			is_running;
 	int			is_locked;
+	int			is_pseudosuper;
 	atomic_t		suspend;
 	/* see vzcalluser.h for VE_FEATURE_XXX definitions */
 	__u64			features;
Index: linux-pcs7.git/kernel/cgroup.c
===================================================================
--- linux-pcs7.git.orig/kernel/cgroup.c
+++ linux-pcs7.git/kernel/cgroup.c
@@ -1604,15 +1604,8 @@ static struct dentry *cgroup_mount(struc
 
 #ifdef CONFIG_VE
 	if (!ve_is_super(get_exec_env()) && !(flags & MS_KERNMOUNT)) {
-		/*
-		 * We should allow mounting cgroups from inside of
-		 * VE only when VE inside a special "restoring" state.
-		 * At moment we don't have yet this state implemented
-		 * but to not block the container from the restore
-		 * lets allow this temporarily.
-		 */
-		/* return ERR_PTR(-EACCES); */
-		pr_warn_once("FIXME: Mounting cgroups from inside of VE, restore?");
+		if (!get_exec_env()->is_pseudosuper)
+			return ERR_PTR(-EACCES);
 	}
 #endif
 
Index: linux-pcs7.git/kernel/ve/ve.c
===================================================================
--- linux-pcs7.git.orig/kernel/ve/ve.c
+++ linux-pcs7.git/kernel/ve/ve.c
@@ -68,6 +68,7 @@ struct ve_struct ve0 = {
 	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
 	.ve_netns		= &init_net,
 	.is_running		= 1,
+	.is_pseudosuper		= 1,
 #ifdef CONFIG_VE_IPTABLES
 	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
 #endif
@@ -532,6 +533,12 @@ void ve_stop_ns(struct pid_namespace *pi
 	 */
 	ve->is_running = 0;
 
+	/*
+	 * Neither it can be in pseudosuper state
+	 * anymore, setup it again if needed.
+	 */
+	ve->is_pseudosuper = 0;
+
 	ve_tty_console_fini(ve);
 	ve_legacy_pty_fini(ve);
 
@@ -1146,6 +1153,7 @@ enum {
 	VE_CF_STATE,
 	VE_CF_FEATURES,
 	VE_CF_IPTABLES_MASK,
+	VE_CF_PSEUDOSUPER,
 };
 
 static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
@@ -1156,6 +1164,37 @@ static u64 ve_read_u64(struct cgroup *cg
 	else if (cft->private == VE_CF_IPTABLES_MASK)
 		return cgroup_ve(cg)->ipt_mask;
 #endif
+	else if (cft->private == VE_CF_PSEUDOSUPER)
+		return cgroup_ve(cg)->is_pseudosuper;
+	return 0;
+}
+
+/*
+ * Move VE into pseudosuper state where some of privilegued
+ * operations such as mounting cgroups from inside of VE context
+ * is allowed in a sake of container restore for example.
+ *
+ * While dropping pseudosuper privilegues is allowed from
+ * any context to set this value up one have to be a real
+ * node's owner.
+ */
+static int ve_write_pseudosuper(struct cgroup *cg,
+				struct cftype *cft,
+				u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (!ve_is_super(get_exec_env()) && value)
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (value && (ve->is_running || ve->ve_ns)) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+	ve->is_pseudosuper = value;
+	up_write(&ve->op_sem);
+
 	return 0;
 }
 
@@ -1225,6 +1264,13 @@ static struct cftype ve_cftypes[] = {
 		.write_u64		= ve_write_u64,
 		.private		= VE_CF_IPTABLES_MASK,
 	},
+	{
+		.name			= "pseudosuper",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_pseudosuper,
+		.private		= VE_CF_PSEUDOSUPER,
+	},
 	{ }
 };
 
-------------- next part --------------
From: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Subject: [RFC rh7] ve/cgroup: Revert commit 1867565c8c6df8c2a18e391d9e6d721cf29e251e

In commit 1867565c8c6df8c2a18e391d9e6d721cf29e251e we've disabled
mounting cgroups if its root already present but this is ruines
restore procedure for the container because of how libvzctl + criu
bundle works (moreover it doesn't even check if the mounting
is done on ve0 which is definitely changes vanilla kernel
behaviour).

Now a few works on how current restore procedure is implemented

 - libvzctl prepares cgroups set and mount them in the places
   they should belong passing appropriate parameters to the criu

 - criu in turn has to remount all cgroups again because it
   has to work not only on Virtuozzo envinronment but on
   vanilla kernels too, so in criu we prepare cgroups
   yard from the scratch

But with commit in the subject we start clashing with
names and kernel refuses restore procedure.

https://jira.sw.ru/browse/PSBM-43169
https://jira.sw.ru/browse/PSBM-42573

Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
CC: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
CC: Andrey Vagin <avagin at virtuozzo.com>
CC: Igor Sukhih <igor at parallels.com>
CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
 include/linux/cgroup.h |    7 +------
 kernel/cgroup.c        |   15 +++------------
 2 files changed, 4 insertions(+), 18 deletions(-)

Index: linux-pcs7.git/include/linux/cgroup.h
===================================================================
--- linux-pcs7.git.orig/include/linux/cgroup.h
+++ linux-pcs7.git/include/linux/cgroup.h
@@ -301,9 +301,7 @@ enum {
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
 };
-#ifdef CONFIG_VE
-struct ve_struct;
-#endif
+
 /*
  * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
  * associated with a superblock to form an active hierarchy.  This is
@@ -350,9 +348,6 @@ struct cgroupfs_root {
 
 	/* The name for this hierarchy - may be empty */
 	char name[MAX_CGROUP_ROOT_NAMELEN];
-#ifdef CONFIG_VE
-	struct ve_struct *ve;
-#endif
 };
 
 /*
Index: linux-pcs7.git/kernel/cgroup.c
===================================================================
--- linux-pcs7.git.orig/kernel/cgroup.c
+++ linux-pcs7.git/kernel/cgroup.c
@@ -1479,10 +1479,7 @@ static int cgroup_test_super(struct supe
 {
 	struct cgroup_sb_opts *opts = data;
 	struct cgroupfs_root *root = sb->s_fs_info;
-#ifdef CONFIG_VE
-	if (get_exec_env() != root->ve)
-		return 0;
-#endif
+
 	/* If we asked for a name then it must match */
 	if (opts->name && strcmp(opts->name, root->name))
 		return 0;
@@ -1662,18 +1659,12 @@ static struct dentry *cgroup_mount(struc
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		mutex_lock(&cgroup_root_mutex);
-#ifdef CONFIG_VE
-		root->ve = get_exec_env();
-#endif
+
 		/* Check for name clashes with existing mounts */
 		ret = -EBUSY;
 		if (strlen(root->name))
 			for_each_active_root(existing_root)
-				if (!strcmp(existing_root->name, root->name)
-#ifdef CONFIG_VE
-				    && (root->ve == existing_root->ve)
-#endif
-				    )
+				if (!strcmp(existing_root->name, root->name))
 					goto unlock_drop;
 
 		/*
-------------- next part --------------
>From 480726fa818f7c53503d81894af127eade357b04 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov at openvz.org>
Date: Sat, 16 Jan 2016 21:52:28 +0300
Subject: [PATCH libvzctl] env_nsops: Use @pseudosuper feature on the restore procedure

For a long time we've been allowing mounting cgroups from
inside of containers. The primary reason was that we've
had to create venet devices from inside of already existing
VEs (due to its structure).

But we've reworked venet code so now it's aware of net-namespaces.
Still curent design of restore procedure is that we're creating
cgroups from inside of libvzctl on ve0 and then move self
into veX and proceed restore from there.

CRIU in turn restores cgroups remounting them from veX context
(strictly speaking CRIU works in this way to be able to restore
 not only Virtuozzo based containers but general containers as well).

https://jira.sw.ru/browse/PSBM-34299
https://jira.sw.ru/browse/PSBM-43169
https://jira.sw.ru/browse/PSBM-42573

Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
CC: Andrey Vagin <avagin at virtuozzo.com>
CC: Igor Sukhih <igor at parallels.com>
CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
 lib/cgroup.c             | 5 +++++
 lib/cgroup.h             | 1 +
 lib/env_nsops.c          | 4 ++++
 scripts/vz-rst-action.in | 1 +
 4 files changed, 11 insertions(+)

diff --git a/lib/cgroup.c b/lib/cgroup.c
index 64eee22..bf43955 100644
--- a/lib/cgroup.c
+++ b/lib/cgroup.c
@@ -430,6 +430,11 @@ int cg_destroy_cgroup(const char *ctid)
 	return ret;
 }
 
+int cg_enable_pseudosuper(const char *ctid)
+{
+	return cg_set_ul(ctid, CG_VE, "ve.pseudosuper", 1);
+}
+
 int cg_attach_task(const char *ctid, pid_t pid)
 {
 	int ret, i;
diff --git a/lib/cgroup.h b/lib/cgroup.h
index cd52534..3724d97 100644
--- a/lib/cgroup.h
+++ b/lib/cgroup.h
@@ -46,6 +46,7 @@ int cg_get_path(const char *ctid, const char *subsys, const char *name,
 int write_data(const char *path, const char *data);
 int cg_new_cgroup(const char *ctid);
 int cg_destroy_cgroup(const char *ctid);
+int cg_enable_pseudosuper(const char *ctid);
 int cg_attach_task(const char *ctid, pid_t pid);
 int cg_set_param(const char *ctid, const char *subsys, const char *name, const char *data);
 int cg_get_param(const char *ctid, const char *subsys, const char *name, char *out, int size);
diff --git a/lib/env_nsops.c b/lib/env_nsops.c
index f650e2b..7b96e17 100644
--- a/lib/env_nsops.c
+++ b/lib/env_nsops.c
@@ -60,6 +60,7 @@
 #include "vcmm.h"
 
 int systemd_start_ve_scope(struct vzctl_env_handle *h, pid_t pid);
+static int restore_FN(struct vzctl_env_handle *h, struct start_param *data);
 
 #ifndef HAVE_SETNS
 
@@ -640,6 +641,9 @@ static int do_env_create(struct vzctl_env_handle *h, struct start_param *param)
 	if (ret)
 		goto err;
 
+	if (param->fn == restore_FN)
+		cg_enable_pseudosuper(h->ctid);
+
 	ret = cg_attach_task(h->ctid, getpid());
 	if (ret)
 		goto err;
diff --git a/scripts/vz-rst-action.in b/scripts/vz-rst-action.in
index 531744a..f6eb8ec 100755
--- a/scripts/vz-rst-action.in
+++ b/scripts/vz-rst-action.in
@@ -37,6 +37,7 @@ case "$CRTOOLS_SCRIPT_ACTION" in
 	ln -s /proc/$pid/ns/net $VE_NETNS_FILE
 
 	if [ -n "$VEID" ]; then
+		cgset -r ve.pseudosuper="0" $VEID
 		cgset -r ve.state="START $pid" $VEID || exit
 	fi
 	;;
-- 
2.5.0



More information about the Devel mailing list