[Devel] [PATCH RHEL7 COMMIT] ve/device_cgroup: show all devices allowed in ct to fool docker
Vasily Averin
vvs at virtuozzo.com
Tue Dec 15 12:30:55 MSK 2020
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
------>
commit 510bede663a9018c86c88882b45ede7c5e26f6f8
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date: Tue Dec 15 12:30:55 2020 +0300
ve/device_cgroup: show all devices allowed in ct to fool docker
We've seen that docker 20+ not only writes "a *:* rwm" to privileged
docker container device-cgroup (as pre-19 version did) but also checks
the content after write, and docker expects that all devices are allowed
for privileged docker container.
In our VZCT we obviously can't afford to actually allow all devices
because root device cgroup of VZCT should restrict which devices are
allowed to be read/modified/mknod in VZCT and which are not, and all
nested cgroups inherit this. Before the patch, reading the devices list in
VZCT would show a whitelist where each allowed device is listed:
CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list
...
c 1:11 rwm
c 10:200 rwm
c 10:235 rwm
c 10:229 rwm
b 182:177568 rm
b 182:177569 rm
Docker expects to see "a *:* rwm" as if docker is on bare host and
nobody touched device cgroup before that.
As a solution we can just show docker what it wants. The idea is to
detect whether the content of the whitelist of the device cgroup to be
shown is equal to the content of the whitelist of the root device cgroup
of the VZCT and, if so, always show "a *:* rwm".
CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list
a *:* rwm
If one changes the whitelist (even just reorders it), this cgroup would show
the full list of all allowed devices as before.
This change of the output looks consistent enough: when you see
"a *:* rwm" in your cgroup it means that all devices of your VZCT are
available for you.
The only difference from mainstream behaviour is that when you prohibit some
device via devices.deny you get not a blacklist but an inverse whitelist.
FIXME: we have a problem here because this approach does not survive
container migration as devices cgroup c/r looks broken:
https://jira.sw.ru/browse/PSBM-123668
https://jira.sw.ru/browse/PSBM-123630
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
include/linux/cgroup.h | 1 +
security/device_cgroup.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 49 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 645c9fd..ac255e4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -673,6 +673,7 @@ void cgroup_release_agent(struct work_struct *work);
#ifdef CONFIG_VE
int cgroup_mark_ve_roots(struct ve_struct *ve);
void cgroup_unmark_ve_roots(struct ve_struct *ve);
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp);
struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
#endif
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d980020..f9d205f 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -304,12 +304,41 @@ static void set_majmin(char *str, unsigned m)
sprintf(str, "%u", m);
}
+/* Entry after @head. Steps once itself: pass the current node, not ->next. */
+struct dev_exception_item *dev_exeption_next(struct list_head *head)
+{
+	return list_entry_rcu(head->next, struct dev_exception_item, list);
+}
+
+/*
+ * Return true iff both lists hold equal exceptions in the same order.
+ * Walks with RCU list accessors; the visible caller holds rcu_read_lock().
+ */
+static bool dev_exceptions_equal(struct list_head *first, struct list_head *second)
+{
+	struct dev_exception_item *exf = dev_exeption_next(first);
+	struct dev_exception_item *exs = dev_exeption_next(second);
+
+	/* Helper already advances one node: start at the heads, step from ->list */
+	while (&exf->list != first && &exs->list != second) {
+		if (exf->type != exs->type || exf->major != exs->major ||
+		    exf->minor != exs->minor || exf->access != exs->access)
+			return false;
+		exf = dev_exeption_next(&exf->list);
+		exs = dev_exeption_next(&exs->list);
+	}
+
+	/* Equal only if both walks wrapped back to their heads together */
+	return &exf->list == first && &exs->list == second;
+}
+
static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
struct seq_file *m)
{
struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
struct dev_exception_item *ex;
char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
+ struct dev_cgroup *root_cgrp;
rcu_read_lock();
/*
@@ -325,6 +354,25 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
maj, min, acc);
} else {
+ /*
+ * Fooling docker in CT again: if exceptions in ve are the same
+ * as in ve root cgroup - show as if we allow everything
+ */
+ if (!ve_is_super(get_exec_env())) {
+ root_cgrp = cgroup_to_devcgroup(cgroup_get_local_root(cgroup));
+
+ if (dev_exceptions_equal(&devcgroup->exceptions,
+ &root_cgrp->exceptions)) {
+ set_access(acc, ACC_MKNOD | ACC_READ | ACC_WRITE);
+ set_majmin(maj, ~0);
+ set_majmin(min, ~0);
+ seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
+ maj, min, acc);
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+
list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
set_access(acc, ex->access);
set_majmin(maj, ex->major);
More information about the Devel
mailing list