[Devel] [PATCH RHEL8 COMMIT] ve/device_cgroup: Show all devices allowed in ct to fool docker

Konstantin Khorenko khorenko at virtuozzo.com
Thu May 13 14:47:19 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.28
------>
commit 849dc660c95a8b71f54e05d4ab0660c251479f62
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Tue Dec 15 12:30:55 2020 +0300

    ve/device_cgroup: Show all devices allowed in ct to fool docker
    
    We've seen that docker 20+ not only writes "a *:* rwm" to privileged
    docker container device-cgroup (as pre-19 version did) but also checks
    the content after write, and docker expects that all devices are allowed
    for privileged docker container.
    
    In our VZCT we obviously can't afford to actually allow all devices
    because root device cgroup of VZCT should restrict which devices are
    allowed to be read/modified/mknod in VZCT and which are not, and all
    nested cgroups inherit this. Before the patch, reading the devices list
    in VZCT one would see a whitelist where each allowed device is present:
    
      CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list
      ...
      c 1:11 rwm
      c 10:200 rwm
      c 10:235 rwm
      c 10:229 rwm
      b 182:177568 rm
      b 182:177569 rm
    
    Docker expects to see "a *:* rwm" as if docker is on bare host and
    nobody touched device cgroup before that.
    
    As a solution we can just show docker what it wants. The idea is to
    detect if the content of the whitelist of the device cgroup to be
    shown is equal to the content of the whitelist of the root device cgroup
    of the VZCT, then always show "a *:* rwm".
    
      CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list
      a *:* rwm
    
    If one changes the whitelist (even just reorders it), this cgroup shows
    the full list of all allowed devices as before.
    
    This change of the output looks consistent enough: when you see
    "a *:* rwm" in your cgroup it means that all devices of your VZCT are
    available for you.
    
    The only difference from mainstream behaviour is that when you prohibit
    some device via devices.deny you get not a blacklist but an inverse
    whitelist.
    
    Related task - a CRIU/vzctl task for devices cgroup migration support:
    https://jira.sw.ru/browse/PSBM-123668
    
    https://jira.sw.ru/browse/PSBM-123630
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    vz8 rebase:
     - introduced css_get_local_root() similar to cgroup_get_local_root()
    
    (cherry picked from vz7 commit a6dba9fbee35 ("ve/device_cgroup: show all
    devices allowed in ct to fool docker"))
    In the scope of
    https://jira.sw.ru/browse/PSBM-123743
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 include/linux/cgroup.h   |  1 +
 kernel/cgroup/cgroup.c   | 27 +++++++++++++++++++++++++
 security/device_cgroup.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 45da7bedc29d..7cf6e1e69242 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -874,6 +874,7 @@ void cgroup1_release_agent(struct work_struct *work);
 #ifdef CONFIG_VE
 int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
+struct cgroup_subsys_state *css_get_local_root(struct cgroup_subsys_state *css);
 struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5ccc3edb7007..1bc15fb9802c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -329,6 +329,33 @@ struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
 	return cgrp;
 }
 
+/*
+ * css_get_local_root - find the css of the nearest "root" cgroup
+ * @css: css to start the upward walk from
+ *
+ * Walks css->parent links and stops at whichever comes first:
+ *  - a css whose cgroup has CGRP_VE_ROOT set (ve cgroup root), or
+ *  - a css without a parent (host cgroup root).
+ *
+ *    <host_root_cgroup>     <- result for starts above any ve root
+ *     \
+ *      <cgroup>
+ *       \
+ *        <ve_root_cgroup>   <- result for starts at or below it
+ *         \
+ *          <cgroup>
+ *
+ * Returns @css itself if it already is such a root.
+ */
+struct cgroup_subsys_state *css_get_local_root(struct cgroup_subsys_state *css)
+{
+	for (; css->parent; css = css->parent) {
+		if (test_bit(CGRP_VE_ROOT, &css->cgroup->flags))
+			break;
+	}
+	return css;
+}
+
 struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp)
 {
 	struct ve_struct *ve;
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 8a017fc2e1b0..98bbd9561cd7 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -268,12 +268,42 @@ static void set_majmin(char *str, unsigned m)
 		sprintf(str, "%u", m);
 }
 
+/* Return the exception item whose list node follows @head (RCU read). */
+static struct dev_exception_item *dev_exeption_next(struct list_head *head)
+{
+	return list_entry_rcu(head->next, struct dev_exception_item, list);
+}
+
+/*
+ * Return true iff both lists hold equal exceptions in the same order.
+ * Uses list_entry_rcu(); the caller is expected to hold rcu_read_lock().
+ */
+static bool dev_exceptions_equal(struct list_head *first, struct list_head *second)
+{
+	struct dev_exception_item *exf, *exs;
+
+	/* Start at each head and advance exactly one node per step. */
+	for (exf = dev_exeption_next(first), exs = dev_exeption_next(second);
+	     &exf->list != first && &exs->list != second;
+	     exf = dev_exeption_next(&exf->list),
+	     exs = dev_exeption_next(&exs->list)) {
+		if (exf->type != exs->type || exf->major != exs->major ||
+		    exf->minor != exs->minor || exf->access != exs->access)
+			return false;
+	}
+
+	/* Equal only if both walks reached their list head together. */
+	return &exf->list == first && &exs->list == second;
+}
+
 static int devcgroup_seq_show(struct seq_file *m, void *v)
 {
 	struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m));
 	struct dev_exception_item *ex;
 	char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
 	short type, mask;
+	struct cgroup_subsys_state *root_css;
+	struct dev_cgroup *root_cgrp;
 
 	type = (short)seq_cft(m)->private;
 	mask = (type == DEVCG_EXTRA_LIST) ?
@@ -293,6 +323,27 @@ static int devcgroup_seq_show(struct seq_file *m, void *v)
 		seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL),
 			   maj, min, acc);
 	} else {
+		/*
+		 * Fooling docker in CT again: if exceptions in ve are the same
+		 * as in ve root cgroup - show as if we allow everything
+		 */
+		if (!ve_is_super(get_exec_env())) {
+			root_css  = css_get_local_root(seq_css(m));
+			root_cgrp = css_to_devcgroup(root_css);
+
+			if (dev_exceptions_equal(&devcgroup->exceptions,
+						 &root_cgrp->exceptions)) {
+				set_access(acc, mask);
+				set_majmin(maj, ~0);
+				set_majmin(min, ~0);
+				seq_printf(m, "%c %s:%s %s\n",
+					   type_to_char(DEVCG_DEV_ALL),
+					   maj, min, acc);
+				rcu_read_unlock();
+				return 0;
+			}
+		}
+
 		list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
 			set_access(acc, ex->access & mask);
 			set_majmin(maj, ex->major);


More information about the Devel mailing list