[Devel] [vz7 v2 PATCH 2/2] devcg: Allow wildcard exceptions in DENY child cgroups

Nikolay Borisov nikolay.borisov at virtuozzo.com
Thu Jan 19 15:52:55 MSK 2023


In containerized environments there arise cases where we might want to
allow wildcard exceptions when the parent cg doesn't have such. This for
example arises when systemd services are being setup in containers. In
order to allow systemd to function we must allow it to write wildcard
(i.e b *:* rwm) rules in the child group. At the same time in order not
to break the fundamental invariant of the device cgroup hierarchy that
children cannot be more permissive than their parents instead of blindly
trusting those rules, simply skip them in the child cgroup and defer to
the parent's exceptions.

For example assume we have A/B, where A has default behavior 'deny' and
B was created afterwards and subsequently also has 'deny' default
behavior. With this patch it's possible to write "b *:* rwm" in B which
would also result in EPERM when trying to access any device that doesn't
contain an exception in A:

    mkdir A
    echo "a" > A/devices.deny
    mkdir A/B
    echo "c *:*" > A/B/devices.allow <-- allows to create the exception
					 but it's essentially a noop

    echo "c 1:3 rw" > A < -- now trying to open /dev/nul (matching 1:3)
			     by a process in B would succeed.

Implementing this doesn't really break the main invariant that children
shouldn't have more access than their ancestors.

Signed-off-by: Nikolay Borisov <nikolay.borisov at virtuozzo.com>
---

Changes in v2:
 * Patch is now self-contained.

 security/device_cgroup.c | 55 +++++++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 2f6d5e0ffd00..0e5fdcc0cbff 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,6 +61,11 @@ struct dev_cgroup {
 	struct list_head propagate_pending;
 };

+static bool is_wildcard_exception(struct dev_exception_item *ex)
+{
+	return ex->minor == ~0 || ex->major == ~0;
+}
+
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
 {
 	return container_of(s, struct dev_cgroup, css);
@@ -434,10 +439,10 @@ static bool match_exception_partial(struct list_head *exceptions, short type,

 /**
  * match_exception	- iterates the exception list trying to match a rule
- *			  based on type, major, minor and access type. It is
- *			  considered a match if an exception is found that
- *			  will contain the entire range of provided parameters.
- * @exceptions: list of exceptions
+ * 			  based on type, major, minor and access type. It is
+ * 			  considered a match if an exception is found that
+ * 			  will contain the entire range of provided parameters.
+ * @dev_cgroup: cgroup whose exceptions we are checking
  * @type: device type (DEV_BLOCK or DEV_CHAR)
  * @major: device file major number, ~0 to match all
  * @minor: device file minor number, ~0 to match all
@@ -445,10 +450,13 @@ static bool match_exception_partial(struct list_head *exceptions, short type,
  *
  * returns: true in case it matches an exception completely
  */
-static bool match_exception(struct list_head *exceptions, short type,
-			    u32 major, u32 minor, short access)
+static bool match_exception(struct dev_cgroup *dev_cgroup, short type,
+			    u32 major, u32 minor, short access, bool check_parent)
 {
 	struct dev_exception_item *ex;
+	struct cgroup *cgrp = dev_cgroup->css.cgroup;
+	struct list_head *exceptions = &dev_cgroup->exceptions;
+	bool wildcard_skipped = false;

 	list_for_each_entry_rcu(ex, exceptions, list) {
 		short mismatched_bits;
@@ -462,6 +470,11 @@ static bool match_exception(struct list_head *exceptions, short type,
 			continue;
 		if (ex->minor != ~0 && ex->minor != minor)
 			continue;
+		/* skip wildcard rule if we are child cg */
+		if (is_wildcard_exception(ex) && !test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
+			wildcard_skipped = true;
+			continue;
+		}

 		/* provided access cannot have more than the exception rule */
 		mismatched_bits = access & (~ex->access) & ~ACC_MOUNT;
@@ -475,9 +488,26 @@ static bool match_exception(struct list_head *exceptions, short type,
 		return true;
 	}

+	/* we matched a wildcard rule, so let's check for a
+	 * more specific one in the parent
+	 */
+	if (wildcard_skipped && check_parent) {
+		struct cgroup *parent = cgrp->parent;
+		struct dev_cgroup *devcg_parent = cgroup_to_devcgroup(parent);
+		if (devcg_parent->behavior == DEVCG_DEFAULT_ALLOW)
+			/* Can't match any of the exceptions, even partially */
+			return  !match_exception_partial(&devcg_parent->exceptions,
+						      type, major, minor, access);
+		else
+			/* Need to match completely one exception to be allowed */
+			return match_exception(devcg_parent, type, major,
+					       minor, access, false);
+	}
+
 	return false;
 }

+
 /**
  * verify_new_ex - verifies if a new exception is part of what is allowed
  *		   by a dev cgroup based on the default policy +
@@ -527,9 +557,8 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
 		 * be contained completely in an parent's exception to be
 		 * allowed
 		 */
-		match = match_exception(&dev_cgroup->exceptions, refex->type,
-					refex->major, refex->minor,
-					refex->access);
+		match = match_exception(dev_cgroup, refex->type, refex->major,
+					refex->minor, refex->access, false);

 		if (match)
 			/* parent has an exception that matches the proposed */
@@ -703,6 +732,7 @@ static inline bool has_children(struct dev_cgroup *devcgroup)
 	return !list_empty(&cgrp->children);
 }

+
 /*
  * Modify the exception list using allow/deny rules.
  * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
@@ -862,8 +892,10 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 			break;
 		}

-		if (!parent_has_perm(devcgroup, &ex))
+		if (!is_wildcard_exception(&ex) &&
+		    !parent_has_perm(devcgroup, &ex))
 			return -EPERM;
+
 		rc = dev_exception_add(devcgroup, &ex);
 		break;
 	case DEVCG_DENY:
@@ -954,8 +986,7 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
 					      type, major, minor, access);
 	else
 		/* Need to match completely one exception to be allowed */
-		rc = match_exception(&dev_cgroup->exceptions, type, major,
-				     minor, access);
+		rc = match_exception(dev_cgroup, type, major, minor, access, true);
 	rcu_read_unlock();

 #ifdef CONFIG_VE
--
2.34.1



More information about the Devel mailing list