[Devel] [PATCH RHEL7 COMMIT] ve/cgroup: Add pseudosuper state for restore sake
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Jan 18 02:43:52 PST 2016
The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.9.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.9.22
------>
commit 91857f316240a0d5d7b0bb25a7979b41c1290d4b
Author: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Date: Mon Jan 18 14:43:52 2016 +0400
ve/cgroup: Add pseudosuper state for restore sake
Currently we allow mounting cgroups from inside a VE's context for
the sake of restore. But this will be a problem in the future: every
new mount from inside a VE actually degrades kernel performance.
To address this we introduce a so-called "pseudosuper" state of a
container. This cgroup member can only be set from the ve0 context but
can be dropped from any context (including veX). This allows us to
restore a container and then forbid mounting cgroups once the restore
is done.
In fact there are three players: the kernel itself, which checks the
pseudosuper status; libvzctl, which sets this status when starting or
restoring a container; and criu, which drops this status once it has
finished restoring cgroups (by calling a libvzctl script upon namespace
creation).
https://jira.sw.ru/browse/PSBM-34299
https://jira.sw.ru/browse/PSBM-43169
https://jira.sw.ru/browse/PSBM-42573
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
CC: Andrey Vagin <avagin at virtuozzo.com>
CC: Igor Sukhih <igor at parallels.com>
CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
include/linux/ve.h | 1 +
kernel/cgroup.c | 11 ++---------
kernel/ve/ve.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 49 insertions(+), 9 deletions(-)
diff --git a/include/linux/ve.h b/include/linux/ve.h
index b9374a1..b206526 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -49,6 +49,7 @@ struct ve_struct {
struct rw_semaphore op_sem;
int is_running;
int is_locked;
+ int is_pseudosuper;
atomic_t suspend;
/* see vzcalluser.h for VE_FEATURE_XXX definitions */
__u64 features;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a80b8bf..8b74c39 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1604,15 +1604,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
#ifdef CONFIG_VE
if (!ve_is_super(get_exec_env()) && !(flags & MS_KERNMOUNT)) {
- /*
- * We should allow mounting cgroups from inside of
- * VE only when VE inside a special "restoring" state.
- * At moment we don't have yet this state implemented
- * but to not block the container from the restore
- * lets allow this temporarily.
- */
- /* return ERR_PTR(-EACCES); */
- pr_warn_once("FIXME: Mounting cgroups from inside of VE, restore?");
+ if (!get_exec_env()->is_pseudosuper)
+ return ERR_PTR(-EACCES);
}
#endif
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 1f8e5da..40ce827 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -68,6 +68,7 @@ struct ve_struct ve0 = {
RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
.ve_netns = &init_net,
.is_running = 1,
+ .is_pseudosuper = 1,
#ifdef CONFIG_VE_IPTABLES
.ipt_mask = VE_IP_ALL, /* everything is allowed */
#endif
@@ -532,6 +533,12 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
*/
ve->is_running = 0;
+ /*
+ * Neither it can be in pseudosuper state
+ * anymore, setup it again if needed.
+ */
+ ve->is_pseudosuper = 0;
+
ve_tty_console_fini(ve);
ve_legacy_pty_fini(ve);
@@ -1146,6 +1153,7 @@ enum {
VE_CF_STATE,
VE_CF_FEATURES,
VE_CF_IPTABLES_MASK,
+ VE_CF_PSEUDOSUPER,
};
static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
@@ -1156,6 +1164,37 @@ static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
else if (cft->private == VE_CF_IPTABLES_MASK)
return cgroup_ve(cg)->ipt_mask;
#endif
+ else if (cft->private == VE_CF_PSEUDOSUPER)
+ return cgroup_ve(cg)->is_pseudosuper;
+ return 0;
+}
+
+/*
+ * Move VE into pseudosuper state where some of privilegued
+ * operations such as mounting cgroups from inside of VE context
+ * is allowed in a sake of container restore for example.
+ *
+ * While dropping pseudosuper privilegues is allowed from
+ * any context to set this value up one have to be a real
+ * node's owner.
+ */
+static int ve_write_pseudosuper(struct cgroup *cg,
+ struct cftype *cft,
+ u64 value)
+{
+ struct ve_struct *ve = cgroup_ve(cg);
+
+ if (!ve_is_super(get_exec_env()) && value)
+ return -EPERM;
+
+ down_write(&ve->op_sem);
+ if (value && (ve->is_running || ve->ve_ns)) {
+ up_write(&ve->op_sem);
+ return -EBUSY;
+ }
+ ve->is_pseudosuper = value;
+ up_write(&ve->op_sem);
+
return 0;
}
@@ -1225,6 +1264,13 @@ static struct cftype ve_cftypes[] = {
.write_u64 = ve_write_u64,
.private = VE_CF_IPTABLES_MASK,
},
+ {
+ .name = "pseudosuper",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = ve_read_u64,
+ .write_u64 = ve_write_pseudosuper,
+ .private = VE_CF_PSEUDOSUPER,
+ },
{ }
};
More information about the Devel
mailing list