[Devel] [PATCH RHEL COMMIT] device_cgroup: add device visibility virtualization in CT

Konstantin Khorenko khorenko at virtuozzo.com
Fri Sep 24 14:50:16 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 09cef704f070de63ec1d126a885f9f8e5c5fc7dc
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Fri Sep 24 14:50:16 2021 +0300

    device_cgroup: add device visibility virtualization in CT
    
    For a device cgroup (the whitelist-based case) we hide all devices that
    are not in the whitelist from processes in that cgroup.
    
     - Hide devices in /proc/partitions and /proc/devices if the cgroup has
    neither read nor write permission for them.
    
     - Hide devices in sys_ustat() if the cgroup has no read permission for them.
    
    Allow mounting a device even if there is no write permission for it; use
    "M" as the mount permission, the same way "r"/"w"/"m" are used for
    read/write/mknod.
    
    Allow CT root to access .allow/.deny to change device permissions.
    
    Also show stats about devices in CTs in /proc/vz/devperms for backward
    compatibility, vzt-vzctl uses it.
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    +++
    devcgroup: Allow mounting device with no write perm for new mount api
    
    After the rebase to RH8.4 we now have an alternative code path to mount a
    bdev for filesystems which switched to the new mount API (e.g. checked on
    xfs). Without this fix, if a block device with xfs is available in a
    container and has a "b major:minor rM" device cgroup allow rule for this
    container, the user inside would still not be able to mount this disk,
    which means "M" would not work.
    
    Note: we use "M" instead of "w" when we want to only allow mounting of
    the given disk but not writes to it.
    
    https://jira.sw.ru/browse/PSBM-131978
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-133986
    
    1. The original FMODE_MOUNT value intersects with FMODE_CREATED,
    so the value was changed to 0x800000.
    
    2. __devcgroup_check_permission -> devcgroup_legacy_check_permission
    WARN: we completely ignore eBPF control for device cgroup
    
    3. struct file_operations proc_devperms_ops ->
    struct proc_ops proc_devperms_ops API conversion
    
    (cherry picked from commit f5d0ef3585219ec1df5558a137fd22364c085f17)
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
 block/genhd.c                 |  18 +++++-
 fs/block_dev.c                |   3 +-
 fs/char_dev.c                 |   4 ++
 fs/statfs.c                   |  19 ++++++-
 fs/super.c                    |   4 +-
 include/linux/device_cgroup.h |  16 +++++-
 include/linux/fs.h            |   5 ++
 kernel/ve/vecalls.c           |  43 ++++++++++++++
 security/device_cgroup.c      | 126 ++++++++++++++++++++++++++++++++++++++++--
 9 files changed, 224 insertions(+), 14 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 298ee78c1bda..4a58b36141c4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -24,6 +24,7 @@
 #include <linux/log2.h>
 #include <linux/pm_runtime.h>
 #include <linux/badblocks.h>
+#include <linux/device_cgroup.h>
 
 #include "blk.h"
 
@@ -177,9 +178,13 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 	struct blk_major_name *dp;
 
 	mutex_lock(&major_names_lock);
-	for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
+	for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) {
+		if (!devcgroup_device_visible(S_IFBLK, dp->major,
+					0, INT_MAX))
+			continue;
 		if (dp->major == offset)
 			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
+	}
 	mutex_unlock(&major_names_lock);
 }
 #endif /* CONFIG_PROC_FS */
@@ -796,10 +801,17 @@ static int show_partition(struct seq_file *seqf, void *v)
 
 	rcu_read_lock();
 	xa_for_each(&sgp->part_tbl, idx, part) {
+		unsigned int major = MAJOR(part->bd_dev);
+		unsigned int minor = MINOR(part->bd_dev);
+
 		if (!bdev_nr_sectors(part))
 			continue;
+
+		if (!devcgroup_device_visible(S_IFBLK, major, minor, 1))
+			continue;
+
 		seq_printf(seqf, "%4d  %7d %10llu %s\n",
-			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
+			   major, minor,
 			   bdev_nr_sectors(part) >> 1,
 			   disk_name(sgp, part->bd_partno, buf));
 	}
@@ -1190,7 +1202,7 @@ static const struct seq_operations diskstats_op = {
 static int __init proc_genhd_init(void)
 {
 	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
-	proc_create_seq("partitions", 0, NULL, &partitions_op);
+	proc_create_seq("partitions", S_ISVTX, NULL, &partitions_op);
 	return 0;
 }
 module_init(proc_genhd_init);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9ef4f1fc2cb0..aec818b9a2d9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1390,7 +1390,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
 			MAJOR(dev), MINOR(dev),
 			((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
-			((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
+			((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0) |
+			((mode & FMODE_MOUNT) ? DEVCG_ACC_MOUNT : 0));
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/fs/char_dev.c b/fs/char_dev.c
index ba0ded7842a7..83ceffe546ed 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -22,6 +22,7 @@
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 #include <linux/tty.h>
+#include <linux/device_cgroup.h>
 
 #include "internal.h"
 
@@ -54,6 +55,9 @@ void chrdev_show(struct seq_file *f, off_t offset)
 
 	mutex_lock(&chrdevs_lock);
 	for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) {
+		if (!devcgroup_device_visible(S_IFCHR, cd->major,
+					cd->baseminor, cd->minorct))
+			continue;
 		if (cd->major == offset)
 			seq_printf(f, "%3d %s\n", cd->major, cd->name);
 	}
diff --git a/fs/statfs.c b/fs/statfs.c
index 0ba34c135593..40571a62c239 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -9,6 +9,7 @@
 #include <linux/security.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <linux/device_cgroup.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
@@ -247,9 +248,16 @@ static int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
 
 SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 {
+	dev_t kdev = new_decode_dev(dev);
 	struct ustat tmp;
 	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	int err;
+
+	err = devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
+	if (err)
+		return err;
+
+	err = vfs_ustat(new_decode_dev(dev), &sbuf);
 	if (err)
 		return err;
 
@@ -390,9 +398,16 @@ COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct co
  */
 COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
 {
+	dev_t kdev = new_decode_dev(dev);
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	int err;
+
+	err = devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
+	if (err)
+		return err;
+
+	err = vfs_ustat(kdev, &sbuf);
 	if (err)
 		return err;
 
diff --git a/fs/super.c b/fs/super.c
index 45e6e73db933..d9ca3f5406bd 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1241,7 +1241,7 @@ int get_tree_bdev(struct fs_context *fc,
 	if (!fc->source)
 		return invalf(fc, "No source specified");
 
-	bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
+	bdev = blkdev_get_by_path(fc->source, mode | FMODE_MOUNT, fc->fs_type);
 	if (IS_ERR(bdev)) {
 		errorf(fc, "%s: Can't open blockdev", fc->source);
 		return PTR_ERR(bdev);
@@ -1324,7 +1324,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode | FMODE_MOUNT, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index d02f32b7514e..365ffc2c516e 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -4,7 +4,9 @@
 #define DEVCG_ACC_MKNOD 1
 #define DEVCG_ACC_READ  2
 #define DEVCG_ACC_WRITE 4
-#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE)
+#define DEVCG_ACC_MOUNT 64
+#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE | \
+			DEVCG_ACC_MOUNT)
 
 #define DEVCG_DEV_BLOCK 1
 #define DEVCG_DEV_CHAR  2
@@ -32,6 +34,8 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 		access |= DEVCG_ACC_WRITE;
 	if (mask & MAY_READ)
 		access |= DEVCG_ACC_READ;
+	if (mask & MAY_MOUNT)
+		access |= DEVCG_ACC_MOUNT;
 
 	return devcgroup_check_permission(type, imajor(inode), iminor(inode),
 					  access);
@@ -56,6 +60,11 @@ static inline int devcgroup_inode_mknod(int mode, dev_t dev)
 					  DEVCG_ACC_MKNOD);
 }
 
+extern int devcgroup_device_permission(umode_t mode, dev_t dev, int mask);
+extern int devcgroup_device_visible(umode_t mode, int major,
+		int start_minor, int nr_minors);
+struct ve_struct;
+extern int devcgroup_seq_show_ve(struct ve_struct *, struct seq_file *);
 #else
 static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
 			       short access)
@@ -64,4 +73,9 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 { return 0; }
 static inline int devcgroup_inode_mknod(int mode, dev_t dev)
 { return 0; }
+static inline int devcgroup_device_permission(umode_t mode, dev_t dev, int mask)
+{ return 0; } /* #else stub: always succeeds (callers treat non-zero as an error) */
+static inline int devcgroup_device_visible(umode_t mode, int major,
+		int start_minor, int nr_minors)
+{ return 1; } /* #else stub: nothing is hidden; callers skip the device on 0 */
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf45b4aa5989..1c1e29a59084 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -105,6 +105,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* called from RCU mode, don't block */
 #define MAY_NOT_BLOCK		0x00000080
 
+#define MAY_MOUNT		0x00020000
+
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
  * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
@@ -166,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File is stream-like */
 #define FMODE_STREAM		((__force fmode_t)0x200000)
 
+/* File is a block device opened by mount(2)  */
+#define FMODE_MOUNT		((__force fmode_t)0x800000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x4000000)
 
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
index 051091331ceb..e5ec7609d192 100644
--- a/kernel/ve/vecalls.c
+++ b/kernel/ve/vecalls.c
@@ -30,6 +30,7 @@
 #include <linux/vecalls.h>
 #include <linux/vzctl.h>
 #include <linux/veowner.h>
+#include <linux/device_cgroup.h>
 
 /**********************************************************************
  **********************************************************************
@@ -68,6 +69,42 @@ static void ve_seq_stop(struct seq_file *m, void *v)
 	mutex_unlock(&ve_list_lock);
 }
 
+static int devperms_seq_show(struct seq_file *m, void *v)
+{	/* seq_file ->show: emit the device permission lines for one VE */
+	struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
+
+	if (m->private == (void *)0) {	/* header not printed yet */
+		seq_printf(m, "Version: 2.7\n");
+		m->private = (void *)-1;	/* mark the header as printed */
+	}
+
+	if (ve_is_super(ve))	/* super VE gets two fixed wildcard lines */
+		seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
+	else
+		devcgroup_seq_show_ve(ve, m);
+
+	return 0;
+}
+
+static const struct seq_operations devperms_seq_op = {	/* devperms iterator */
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= devperms_seq_show,	/* called once per VE record */
+};
+
+static int devperms_open(struct inode *inode, struct file *file)
+{	/* ->proc_open: attach the devperms seq_file iterator to @file */
+	return seq_open(file, &devperms_seq_op);
+}
+
+static const struct proc_ops proc_devperms_ops = {	/* "devperms" proc entry */
+	.proc_open		= devperms_open,
+	.proc_read		= seq_read,
+	.proc_lseek		= seq_lseek,
+	.proc_release		= seq_release,
+};
+
 static int vz_version_show(struct seq_file *file, void* v)
 {
 	static const char ver[] = VZVERSION "\n";
@@ -147,6 +184,11 @@ static int __init init_vecalls_proc(void)
 {
 	struct proc_dir_entry *de;
 
+	de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_devperms_ops);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make devperms proc entry\n");
+
 	de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir,
 			&proc_vz_version_operations);
 	if (!de)
@@ -162,6 +204,7 @@ static int __init init_vecalls_proc(void)
 
 static void __exit fini_vecalls_proc(void)
 {
+	remove_proc_entry("devperms", proc_vz_dir);
 	remove_proc_entry("version", proc_vz_dir);
 	remove_proc_entry("veinfo", proc_vz_dir);
 }
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index bd1bf41b4a0d..ee76745795b8 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -234,7 +234,7 @@ static void devcgroup_css_free(struct cgroup_subsys_state *css)
 #define DEVCG_LIST 3
 
 #define MAJMINLEN 13
-#define ACCLEN 4
+#define ACCLEN 5
 
 static void set_access(char *acc, short access)
 {
@@ -246,6 +246,8 @@ static void set_access(char *acc, short access)
 		acc[idx++] = 'w';
 	if (access & DEVCG_ACC_MKNOD)
 		acc[idx++] = 'm';
+	if (access & DEVCG_ACC_MOUNT)
+		acc[idx++] = 'M';
 }
 
 static char type_to_char(short type)
@@ -319,6 +321,9 @@ static bool match_exception(struct list_head *exceptions, short type,
 	struct dev_exception_item *ex;
 
 	list_for_each_entry_rcu(ex, exceptions, list) {
+		short mismatched_bits;
+		bool allowed_mount;
+
 		if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
 			continue;
 		if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
@@ -328,7 +333,12 @@ static bool match_exception(struct list_head *exceptions, short type,
 		if (ex->minor != ~0 && ex->minor != minor)
 			continue;
 		/* provided access cannot have more than the exception rule */
-		if (access & (~ex->access))
+		mismatched_bits = access & (~ex->access) & ~DEVCG_ACC_MOUNT;
+		allowed_mount = !(mismatched_bits & ~DEVCG_ACC_WRITE) &&
+				(ex->access & DEVCG_ACC_MOUNT) &&
+				(access & DEVCG_ACC_MOUNT);
+
+		if (mismatched_bits && !allowed_mount)
 			continue;
 		return true;
 	}
@@ -605,7 +615,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	struct dev_exception_item ex;
 	struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent);
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	memset(&ex, 0, sizeof(ex));
@@ -700,7 +710,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	}
 	if (!isspace(*b))
 		return -EINVAL;
-	for (b++, count = 0; count < 3; count++, b++) {
+	for (b++, count = 0; count < ACCLEN - 1; count++, b++) {
 		switch (*b) {
 		case 'r':
 			ex.access |= DEVCG_ACC_READ;
@@ -711,9 +721,12 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 		case 'm':
 			ex.access |= DEVCG_ACC_MKNOD;
 			break;
+		case 'M':
+			ex.access |= DEVCG_ACC_MOUNT;
+			break;
 		case '\n':
 		case '\0':
-			count = 3;
+			count = ACCLEN - 1;
 			break;
 		default:
 			return -EINVAL;
@@ -870,4 +883,107 @@ int devcgroup_check_permission(short type, u32 major, u32 minor, short access)
 	#endif /* CONFIG_CGROUP_DEVICE */
 }
 EXPORT_SYMBOL(devcgroup_check_permission);
+
+int devcgroup_device_permission(umode_t mode, dev_t dev, int mask)
+{	/* Check MAY_READ/MAY_WRITE in @mask for device @dev of type @mode. */
+	short type, access = 0;
+
+	if (S_ISBLK(mode))
+		type = DEVCG_DEV_BLOCK;
+	else if (S_ISCHR(mode))
+		type = DEVCG_DEV_CHAR;
+	else
+		return 0; /* not a device type: never pass an unset 'type' on */
+	access |= (mask & MAY_WRITE) ? DEVCG_ACC_WRITE : 0;
+	access |= (mask & MAY_READ) ? DEVCG_ACC_READ : 0;
+
+	return devcgroup_legacy_check_permission(type, MAJOR(dev), MINOR(dev), access);
+}
+
+int devcgroup_device_visible(umode_t mode, int major, int start_minor, int nr_minors)
+{	/* Non-zero if current's device cgroup exposes @major within the minor range. */
+	struct dev_cgroup *dev_cgroup;
+	struct dev_exception_item *ex;
+	short access = DEVCG_ACC_READ | DEVCG_ACC_WRITE; /* either bit is enough */
+	bool match = false;
+
+	rcu_read_lock();
+	dev_cgroup = task_devcgroup(current);
+
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* exceptions not consulted */
+		match = true;
+		goto out;
+	}
+
+	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
+		if ((ex->type & DEVCG_DEV_BLOCK) && !S_ISBLK(mode))
+			continue;
+		if ((ex->type & DEVCG_DEV_CHAR) && !S_ISCHR(mode))
+			continue;
+		if (ex->major != ~0 && ex->major != major) /* ~0 matches any major */
+			continue;
+		if (ex->minor != ~0 && (ex->minor < start_minor || /* ~0: any minor */
+					ex->minor >= start_minor + nr_minors))
+			continue;
+		if (!(access & ex->access)) /* rule grants neither read nor write */
+			continue;
+		match = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return match;
+}
+
+#ifdef CONFIG_VE
+
+static unsigned encode_ve_perms(unsigned mask)
+{
+	/*
+	 * Translate DEVCG_ACC_* bits into the mode-style bits reported for
+	 * a CT: read -> S_IROTH, write -> S_IWOTH, mount -> S_IXUSR.
+	 * Any other bit in @mask has no representation in the result.
+	 */
+	unsigned can_read = (mask & DEVCG_ACC_READ) ? S_IROTH : 0;
+	unsigned can_write = (mask & DEVCG_ACC_WRITE) ? S_IWOTH : 0;
+	unsigned can_mount = (mask & DEVCG_ACC_MOUNT) ? S_IXUSR : 0;
+
+	return can_read | can_write | can_mount;
+}
+
+int devcgroup_seq_show_ve(struct ve_struct *ve, struct seq_file *m)
+{	/* Print one line per device cgroup exception of @ve's init css to @m. */
+	struct dev_exception_item *wh;
+	struct dev_cgroup *devcgroup;
+	struct cgroup_subsys_state *css;
+
+	css = ve_get_init_css(ve, devices_cgrp_id); /* paired with css_put() below */
+	devcgroup = css_to_devcgroup(css);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(wh, &devcgroup->exceptions, list) {
+		char maj[MAJMINLEN], min[MAJMINLEN];
+		unsigned perm;
+
+		set_majmin(maj, wh->major);
+		set_majmin(min, wh->minor);
+
+		perm = encode_ve_perms(wh->access);
+		if (perm & (S_IROTH | S_IWOTH))	/* readable or writable rules ... */
+			perm |= S_IXOTH;	/* ... are also reported with S_IXOTH */
+
+		seq_printf(m, "%10u %c %03o %s:%s\n",	/* veid type perm major:minor */
+				ve->veid,
+				type_to_char(wh->type),
+				perm, maj, min);
+	}
+	rcu_read_unlock();
+
+	css_put(css);
+	return 0;
+}
+EXPORT_SYMBOL(devcgroup_seq_show_ve);
+
+#endif /* CONFIG_VE */
+
 #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */


More information about the Devel mailing list