[Devel] [RFC rh7] ve/cgroup: Add pseudosuper state for restore sake
Cyrill Gorcunov
gorcunov at virtuozzo.com
Fri Sep 11 12:06:34 PDT 2015
Currently we allow mounting cgroups from inside of a VE context because
otherwise we simply can't restore them any other way: CRIU itself runs
from inside of a freshly created VE context due to limits of the venet module
(well, not limits to be precise, but rather implementation specifics
chosen for code simplicity).
In other words, we need to do some privileged operations from inside of
a VE context. Still, such operations (such as mounting cgroups at the restore
stage) must be limited and granted from VE0 context only.
For this we introduce a so-called "pseudosuper" state of a container.
This cgroup member can be set only from VE0 context but dropped
from any context (including veX). This allows us to restore a
container and removes the ability to mount cgroups once restore is done.
In fact there are three players: the kernel itself, which checks the
pseudosuper status; libvzctl, which sets up this status when
starting and restoring a container; and criu, which drops this status once
it completes restoring cgroups.
https://jira.sw.ru/browse/PSBM-34299
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
CC: Andrey Vagin <avagin at virtuozzo.com>
CC: Igor Sukhih <igor at parallels.com>
CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
Guys, please take a look, this is an early RFC, which I tested with
a modified libvzctl and the latest criu repo. So any ideas are highly
appreciated!
include/linux/ve.h | 1 +
kernel/cgroup.c | 11 ++---------
kernel/ve/ve.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 50 insertions(+), 9 deletions(-)
Index: linux-pcs7.git/include/linux/ve.h
===================================================================
--- linux-pcs7.git.orig/include/linux/ve.h
+++ linux-pcs7.git/include/linux/ve.h
@@ -47,6 +47,7 @@ struct ve_struct {
struct rw_semaphore op_sem;
int is_running;
int is_locked;
+ int is_pseudosuper;
atomic_t suspend;
/* see vzcalluser.h for VE_FEATURE_XXX definitions */
__u64 features;
Index: linux-pcs7.git/kernel/cgroup.c
===================================================================
--- linux-pcs7.git.orig/kernel/cgroup.c
+++ linux-pcs7.git/kernel/cgroup.c
@@ -1573,15 +1573,8 @@ static struct dentry *cgroup_mount(struc
#ifdef CONFIG_VE
if (!ve_is_super(get_exec_env()) && !(flags & MS_KERNMOUNT)) {
- /*
- * We should allow mounting cgroups from inside of
- * VE only when VE inside a special "restoring" state.
- * At moment we don't have yet this state implemented
- * but to not block the container from the restore
- * lets allow this temporarily.
- */
- /* return ERR_PTR(-EACCES); */
- pr_warn_once("FIXME: Mounting cgroups from inside of VE, restore?");
+ if (!get_exec_env()->is_pseudosuper)
+ return ERR_PTR(-EACCES);
}
#endif
Index: linux-pcs7.git/kernel/ve/ve.c
===================================================================
--- linux-pcs7.git.orig/kernel/ve/ve.c
+++ linux-pcs7.git/kernel/ve/ve.c
@@ -68,6 +68,7 @@ struct ve_struct ve0 = {
RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
.ve_netns = &init_net,
.is_running = 1,
+ .is_pseudosuper = 1,
#ifdef CONFIG_VE_IPTABLES
.ipt_mask = VE_IP_ALL, /* everything is allowed */
#endif
@@ -500,6 +501,12 @@ void ve_stop_ns(struct pid_namespace *pi
*/
ve->is_running = 0;
+ /*
+ * Nor can it be in pseudosuper state
+ * anymore; set it up again if needed.
+ */
+ ve->is_pseudosuper = 0;
+
ve_tty_console_fini(ve);
ve_legacy_pty_fini(ve);
@@ -1121,6 +1128,7 @@ enum {
VE_CF_STATE,
VE_CF_FEATURES,
VE_CF_IPTABLES_MASK,
+ VE_CF_PSEUDOSUPER,
};
static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
@@ -1131,6 +1139,37 @@ static u64 ve_read_u64(struct cgroup *cg
else if (cft->private == VE_CF_IPTABLES_MASK)
return cgroup_ve(cg)->ipt_mask;
#endif
+ else if (cft->private == VE_CF_PSEUDOSUPER)
+ return cgroup_ve(cg)->is_pseudosuper;
+ return 0;
+}
+
+/*
+ * Move VE into pseudosuper state where some privileged
+ * operations, such as mounting cgroups from inside of VE context,
+ * are allowed for the sake of container restore, for example.
+ *
+ * While dropping pseudosuper privileges is allowed from
+ * any context, to set this value one has to be the real
+ * node's owner.
+ */
+static int ve_write_pseudosuper(struct cgroup *cg,
+ struct cftype *cft,
+ u64 value)
+{
+ struct ve_struct *ve = cgroup_ve(cg);
+
+ if (!ve_is_super(get_exec_env()) && value)
+ return -EPERM;
+
+ down_write(&ve->op_sem);
+ if (value && (ve->is_running || ve->ve_ns)) {
+ up_write(&ve->op_sem);
+ return -EBUSY;
+ }
+ ve->is_pseudosuper = value;
+ up_write(&ve->op_sem);
+
return 0;
}
@@ -1153,6 +1192,7 @@ static int ve_write_u64(struct cgroup *c
else if (cft->private == VE_CF_IPTABLES_MASK)
ve->ipt_mask = ve_setup_iptables_mask(value);
#endif
+
up_write(&ve->op_sem);
return 0;
}
@@ -1200,6 +1240,13 @@ static struct cftype ve_cftypes[] = {
.write_u64 = ve_write_u64,
.private = VE_CF_IPTABLES_MASK,
},
+ {
+ .name = "pseudosuper",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = ve_read_u64,
+ .write_u64 = ve_write_pseudosuper,
+ .private = VE_CF_PSEUDOSUPER,
+ },
{ }
};
More information about the Devel
mailing list