[Devel] [PATCH RHEL9 COMMIT] ve/cgroup: Skip non-virtualized roots in cgroup_{,un}mark_ve_roots()

Konstantin Khorenko khorenko at virtuozzo.com
Wed Oct 13 14:13:30 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.1
------>
commit 5697a105caf70bf9468025ca7192afebdbe80fbd
Author: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Date:   Wed Oct 13 14:13:29 2021 +0300

    ve/cgroup: Skip non-virtualized roots in cgroup_{,un}mark_ve_roots()
    
    During container start there might be a situation when not all cgroup
    hierarchies get virtualized by the container manager (like vzctl). By
    virtualizing a cgroup hierarchy I mean the creation of a sub-directory
    within a particular mounted cgroup. When a container starts, it looks in
    the css_set of its init process to list all affiliated cgroups and
    performs actions on each. But non-virtualized cgroups will also be
    present in init's css_set, and they should not be touched from inside
    any non-root ve.
    
    Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    vz9 changes: we want to drop release agent virtualization, but let's
    still sanitize ve cgroup roots marking; all related patches are merged
    and reworked here.
    
    https://jira.sw.ru/browse/PSBM-134002
    (cherry-picked from vz8 commit de090f989b240d1004540d2f4d775e66996b57e2)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 include/linux/cgroup.h |  2 +-
 kernel/cgroup/cgroup.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/ve/ve.c         |  6 ++++-
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 99bd069a476d..4dc3f2f007f1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -887,7 +887,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 		   struct cgroup_namespace *ns);
 
 #ifdef CONFIG_VE
-extern void cgroup_mark_ve_root(struct ve_struct *ve);
+extern int cgroup_mark_ve_root(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
 #endif
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a425f06eceea..fbe8483b7035 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1961,7 +1961,58 @@ struct ve_struct *get_curr_ve(void)
 	return ve;
 }
 
-void cgroup_mark_ve_root(struct ve_struct *ve)
+/*
+ * Let's skip optional cgroups in Virtuozzo containers. Admin on host can
+ * do "mount -t cgroup cgroup -onone,name=namedcgroup /mnt", and this should
+ * not break containers.
+ */
+static inline bool is_virtualized_cgroup(struct cgroup *cgrp)
+{
+	/* Cgroup v2 */
+	if (cgrp->root == &cgrp_dfl_root)
+		return false;
+
+#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
+	if (cgrp->subsys[debug_cgrp_id])
+		return false;
+#endif
+
+	if (cgrp->root->subsys_mask)
+		return true;
+
+	if (!strcmp(cgrp->root->name, "systemd"))
+		return true;
+
+	return false;
+}
+
+/*
+ * Iterate all cgroups in a given css_set and for all obligatory Virtuozzo
+ * container cgroups check that container has its own cgroup subdirectory:
+ * non-host and non-intersecting with other container subdirectories.
+ */
+static inline bool ve_check_root_cgroups(struct css_set *cset)
+{
+	struct cgrp_cset_link *link;
+
+	lockdep_assert_held(&css_set_lock);
+
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		if (!is_virtualized_cgroup(link->cgrp))
+			continue;
+
+		/* Host cgroups not allowed */
+		if (!link->cgrp->kn->parent)
+			return true;
+
+		/* Nested CGRP_VE_ROOT not allowed */
+		if (cgroup_get_ve_root1(link->cgrp))
+			return true;
+	}
+	return false;
+}
+
+int cgroup_mark_ve_root(struct ve_struct *ve)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
@@ -1977,13 +2028,23 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
 	cset = rcu_dereference_protected(ve->ve_ns,
 			lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
 
+	if (ve_check_root_cgroups(cset)) {
+		spin_unlock_irq(&css_set_lock);
+		return -EINVAL;
+	}
+
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		cgrp = link->cgrp;
+
+		if (!is_virtualized_cgroup(cgrp))
+			continue;
+
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
 
 	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 	spin_unlock_irq(&css_set_lock);
+	return 0;
 }
 
 void cgroup_unmark_ve_roots(struct ve_struct *ve)
@@ -2004,6 +2065,10 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
 
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		cgrp = link->cgrp;
+
+		if (!is_virtualized_cgroup(cgrp))
+			continue;
+
 		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index f9aaf135f630..5acd2baa2a08 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -505,7 +505,9 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err < 0)
 		goto err_iterate;
 
-	cgroup_mark_ve_root(ve);
+	err = cgroup_mark_ve_root(ve);
+	if (err)
+		goto err_mark_ve;
 
 	ve->is_running = 1;
 
@@ -515,6 +517,8 @@ static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_mark_ve:
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
 	ve_stop_umh(ve);
 err_umh:


More information about the Devel mailing list