[Devel] [vz7 PATCH 2/2] devcg: Allow wildcard exceptions in DENY child cgroups PSBM-144033

Nikolay Borisov nikolay.borisov at virtuozzo.com
Fri Dec 16 17:38:43 MSK 2022


In containerized environments there arise cases where we might want to
allow wildcard exceptions when the parent cg doesn't have such. This for
example arises when systemd services are being setup in containers. In
order to allow systemd to function we must allow it to write wildcard
(i.e b *:* rwm) rules in the child group. At the same time in order not
to break the fundamental invariant of the device cgroup hierarchy that
children cannot be more permissive than their parents instead of blindly
trusting those rules, simply skip them in the child cgroup and defer to
the parent's exceptions.

For example assume we have A/B, where A has default behavior 'deny' and
B was created afterwards and subsequently also has 'deny' default
behavior. With this patch it's possible to write "b *:* rwm" in B which
would also result in EPERM when trying to access any device that doesn't
contain an exception in A:

    mkdir A
    echo "a" > A/devices.deny
    mkdir A/B
    echo "c *:*" > A/B/devices.allow <-- allows to create the exception
					 but it's essentially a noop

    echo "c 1:3 rw" > A < -- now trying to open /dev/nul (matching 1:3)
			     by a process in B would succeed.

Implementing this doesn't really break the main invariant that children
shouldn't have more access than their ancestors.

Signed-off-by: Nikolay Borisov <nikolay.borisov at virtuozzo.com>
---
 security/device_cgroup.c | 54 +++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index f7948334e318..302159d21d15 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,6 +61,11 @@ struct dev_cgroup {
 	struct list_head propagate_pending;
 };

+static bool is_wildcard_exception(struct dev_exception_item *ex)
+{
+	return ex->minor == ~0 || ex->major == ~0;
+}
+
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
 {
 	return container_of(s, struct dev_cgroup, css);
@@ -224,6 +229,7 @@ static int devcgroup_online(struct cgroup *cgroup)
 		if (!ret)
 			dev_cgroup->behavior = parent_dev_cgroup->behavior;
 	}
+
 	mutex_unlock(&devcgroup_mutex);

 	return ret;
@@ -434,9 +440,9 @@ static bool match_exception_partial(struct list_head *exceptions, short type,

 /**
  * match_exception	- iterates the exception list trying to match a rule
- *			  based on type, major, minor and access type. It is
- *			  considered a match if an exception is found that
- *			  will contain the entire range of provided parameters.
+ * 			  based on type, major, minor and access type. It is
+ * 			  considered a match if an exception is found that
+ * 			  will contain the entire range of provided parameters.
  * @exceptions: list of exceptions
  * @type: device type (DEV_BLOCK or DEV_CHAR)
  * @major: device file major number, ~0 to match all
@@ -446,10 +452,11 @@ static bool match_exception_partial(struct list_head *exceptions, short type,
  * returns: true in case it matches an exception completely
  */
 static bool match_exception(struct dev_cgroup *dev_cgroup, short type,
-			    u32 major, u32 minor, short access)
+			    u32 major, u32 minor, short access, bool check_parent)
 {
 	struct dev_exception_item *ex;
 	struct cgroup *cgrp = dev_cgroup->css.cgroup;
+	bool wildcard_skipped = false;
 	struct list_head *exceptions = &dev_cgroup->exceptions;

 	list_for_each_entry_rcu(ex, exceptions, list) {
@@ -464,6 +471,11 @@ static bool match_exception(struct dev_cgroup *dev_cgroup, short type,
 			continue;
 		if (ex->minor != ~0 && ex->minor != minor)
 			continue;
+		/* skip wildcard rule if we are child cg */
+		if (is_wildcard_exception(ex) && !test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
+			wildcard_skipped = true;
+			continue;
+		}

 		/* provided access cannot have more than the exception rule */
 		mismatched_bits = access & (~ex->access) & ~ACC_MOUNT;
@@ -477,9 +489,26 @@ static bool match_exception(struct dev_cgroup *dev_cgroup, short type,
 		return true;
 	}

+	/* we matched a wildcard rule, so let's check for a
+	 * more specific one in the parent
+	 */
+	if (wildcard_skipped && check_parent) {
+		struct cgroup *parent = cgrp->parent;
+		struct dev_cgroup *devcg_parent = cgroup_to_devcgroup(parent);
+		if (devcg_parent->behavior == DEVCG_DEFAULT_ALLOW)
+			/* Can't match any of the exceptions, even partially */
+			return  !match_exception_partial(&devcg_parent->exceptions,
+						      type, major, minor, access);
+		else
+			/* Need to match completely one exception to be allowed */
+			return match_exception(devcg_parent, type, major,
+					       minor, access, false);
+	}
+
 	return false;
 }

+
 /**
  * verify_new_ex - verifies if a new exception is part of what is allowed
  *		   by a dev cgroup based on the default policy +
@@ -504,14 +533,14 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
 			/*
 			 * new exception in the child doesn't matter, only
 			 * adding extra restrictions
-			 */
+			 */
 			return true;
 		} else {
 			/*
 			 * new exception in the child will add more devices
 			 * that can be acessed, so it can't match any of
 			 * parent's exceptions, even slightly
-			 */
+			 */
 			match = match_exception_partial(&dev_cgroup->exceptions,
 							refex->type,
 							refex->major,
@@ -529,9 +558,8 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
 		 * be contained completely in an parent's exception to be
 		 * allowed
 		 */
-		match = match_exception(&dev_cgroup->exceptions, refex->type,
-					refex->major, refex->minor,
-					refex->access);
+		match = match_exception(dev_cgroup, refex->type, refex->major,
+					refex->minor, refex->access, false);

 		if (match)
 			/* parent has an exception that matches the proposed */
@@ -705,6 +733,7 @@ static inline bool has_children(struct dev_cgroup *devcgroup)
 	return !list_empty(&cgrp->children);
 }

+
 /*
  * Modify the exception list using allow/deny rules.
  * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
@@ -864,8 +893,10 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 			break;
 		}

-		if (!parent_has_perm(devcgroup, &ex))
+		if (!is_wildcard_exception(&ex) &&
+		    !parent_has_perm(devcgroup, &ex))
 			return -EPERM;
+
 		rc = dev_exception_add(devcgroup, &ex);
 		break;
 	case DEVCG_DENY:
@@ -956,8 +987,7 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
 					      type, major, minor, access);
 	else
 		/* Need to match completely one exception to be allowed */
-		rc = match_exception(&dev_cgroup->exceptions, type, major,
-				     minor, access);
+		rc = match_exception(dev_cgroup, type, major, minor, access, true);
 	rcu_read_unlock();

 #ifdef CONFIG_VE
--
2.34.1



More information about the Devel mailing list