[Devel] [PATCH RHEL COMMIT] ve/kernfs: implement ve-based permissions

Konstantin Khorenko khorenko at virtuozzo.com
Wed Sep 22 14:50:52 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 52a22d0044ca6a8eb24b3629bdb4da2c83bdb400
Author: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Date:   Wed Sep 22 14:50:52 2021 +0300

    ve/kernfs: implement ve-based permissions
    
    Here are the main ideas:
    1) Store pointer to 've' on kernfs superblock info.
    2) Keep on each kernfs node kmapset mapping (ve -> permissions).
    3) Store kmapset key on each ve (used to check dentry visibility).
    4) Store key offset in ve_struct on each kernfs superblock info to be able to
    get the key pointer by current VE.
    
    And do not inherit ve permissions from parent.
    Otherwise when a new ploop is created, all containers that have access
    to devices/virtual/block will gain access to the new ploop too, which is
    a security breach.
    
    https://jira.sw.ru/browse/PSBM-20892
    https://jira.sw.ru/browse/PSBM-34682
    
    Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
    
    +++
    fs/kernfs: remove bogus kernfs_test_ve() check
    
    cgroups now also use kernfs subsystem. kernfs_test_ve() check breaks
    it when we try to mount cgroup from ve.
    
    cgroup1_mount() tries to reuse the existing superblock by calling
    kernfs_pin_sb to pin it during mount operation
    
    cgroup1_mount():
                    pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
                    ...
    
    Then it calls cgroup_do_mount() which fails to reuse the existing
    superblock because of failed kernfs_test_ve() check, so instead
    it creates a new one:
    
            dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
                                     CGROUP_SUPER_MAGIC, ns);
    
    So now we have two superblocks instead of one which may lead to
    the double percpu_ref_kill_and_confirm() call. E.g. :
    
    umount /sys/fs/cgroup/rdma
    vzctl exec CTID umount /sys/fs/cgroup/rdma
    
            ------------[ cut here ]------------
            percpu_ref_kill_and_confirm called more than once on css_release!
            WARNING: CPU: 2 PID: 2664 at lib/percpu-refcount.c:336 percpu_ref_kill_and_confirm+0x7c/0xa0
            CPU: 2 PID: 2664 Comm: kworker/2:5 ve: / Tainted: G                 --------- -t - 4.18.0-80.1.2.vz8.3.12 #1 3.12
            RIP: 0010:percpu_ref_kill_and_confirm+0x7c/0xa0
            Call Trace:
             cgroup_kill_sb+0x64/0x90
             deactivate_locked_super+0x34/0x70
             cleanup_mnt+0x3b/0x70
             delayed_mntput+0x26/0x40
             process_one_work+0x1a7/0x360
             worker_thread+0x30/0x390
             kthread+0x112/0x130
             ret_from_fork+0x35/0x40
            ---[ end trace 0cb674141e1a86e3 ]---
            CT: d3cd4dca-ce04-475d-b5d5-c9ef1f24642a: stopped
    
    Remove the kernfs_test_ve() check to fix this.
    
    As for sysfs which is also uses kernfs, I don't think it gonna cause
    something bad. Sysfs uses net namespaces (info->ns && sb_info->ns) which
    should be enough for us. We enter CT's net ns before mounting sysfs in CT.
    
    Also note that commands like "vzctl set 101 --netif_add eth0 --save"
    calls ip util (which mount sysfs) from ve0 and CT's net namespace.
    So with the removal of kernfs_test_ve() check kernfs_test_super()
    now return success instead of fail as before. Which is totally
    fine IMO, as we reuse the existing CT's sysfs superblock
    instead of creating a new one.
    
    mFixes: c5cf56e9cb77 ("ve/kernfs: implement ve-based permissions")
    
    https://jira.sw.ru/browse/PSBM-104902
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    (cherry-picked from vz8 commit 1f6295629f7325f18e91df95df27a2b2b99a328d)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 fs/kernfs/Makefile          |   1 +
 fs/kernfs/dir.c             |   3 ++
 fs/kernfs/inode.c           |   8 +++-
 fs/kernfs/kernfs-internal.h |   4 ++
 fs/kernfs/kernfs-ve.h       |  44 +++++++++++++++++++
 fs/kernfs/mount.c           |   1 +
 fs/kernfs/ve.c              | 100 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kernfs-ve.h   |  31 ++++++++++++++
 include/linux/kernfs.h      |   6 +++
 9 files changed, 197 insertions(+), 1 deletion(-)

diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
index 4ca54ff54c98..d3dfb0720fec 100644
--- a/fs/kernfs/Makefile
+++ b/fs/kernfs/Makefile
@@ -4,3 +4,4 @@
 #
 
 obj-y		:= mount.o inode.o dir.o file.o symlink.o
+obj-$(CONFIG_VE) += ve.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 33166ec90a11..f77ca8e1a301 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -16,6 +16,7 @@
 #include <linux/hash.h>
 
 #include "kernfs-internal.h"
+#include "kernfs-ve.h"
 
 DEFINE_MUTEX(kernfs_mutex);
 static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
@@ -531,6 +532,7 @@ void kernfs_put(struct kernfs_node *kn)
 		simple_xattrs_free(&kn->iattr->xattrs);
 		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
 	}
+	kernfs_put_ve_perms(kn);
 	spin_lock(&kernfs_idr_lock);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
 	spin_unlock(&kernfs_idr_lock);
@@ -750,6 +752,7 @@ int kernfs_add_one(struct kernfs_node *kn)
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
+	kernfs_get_ve_perms(kn);
 
 	mutex_unlock(&kernfs_mutex);
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 26f2aa3586f9..14c826798647 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 
 #include "kernfs-internal.h"
+#include "kernfs-ve.h"
 
 static const struct inode_operations kernfs_iops = {
 	.permission	= kernfs_iop_permission,
@@ -116,6 +117,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (!kn)
 		return -EINVAL;
 
+	if (!kernfs_ve_allowed(kn))
+		return -EPERM;
+
 	mutex_lock(&kernfs_mutex);
 	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
@@ -272,6 +276,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 			  struct inode *inode, int mask)
 {
 	struct kernfs_node *kn;
+	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
@@ -279,10 +284,11 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 	kn = inode->i_private;
 
 	mutex_lock(&kernfs_mutex);
+	ret = kernfs_ve_permission(kn, kernfs_info(inode->i_sb), mask);
 	kernfs_refresh_inode(kn, inode);
 	mutex_unlock(&kernfs_mutex);
 
-	return generic_permission(&init_user_ns, inode, mask);
+	return ret ? ret : generic_permission(&init_user_ns, inode, mask);
 }
 
 int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index ccc3b44f6306..b457b7a19015 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -71,6 +71,10 @@ struct kernfs_super_info {
 
 	/* anchored at kernfs_root->supers, protected by kernfs_mutex */
 	struct list_head	node;
+#ifdef CONFIG_VE
+	struct ve_struct	*ve;
+	off_t			ve_perms_off;
+#endif
 };
 #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
 
diff --git a/fs/kernfs/kernfs-ve.h b/fs/kernfs/kernfs-ve.h
new file mode 100644
index 000000000000..2420af5d5091
--- /dev/null
+++ b/fs/kernfs/kernfs-ve.h
@@ -0,0 +1,44 @@
+/*
+ *  fs/kernfs/kernfs-ve.h
+ *
+ *  Copyright (c) 2018-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __KERNFS_VE_H
+#define __KERNFS_VE_H
+
+struct kernfs_root;
+struct kernfs_super_info;
+struct kernfs_node;
+struct kmapset;
+
+#ifdef CONFIG_VE
+
+void kernfs_get_ve_perms(struct kernfs_node *kn);
+void kernfs_put_ve_perms(struct kernfs_node *kn);
+
+int kernfs_ve_permission(struct kernfs_node *kn,
+			 struct kernfs_super_info *info, int mask);
+
+int kernfs_ve_allowed(struct kernfs_node *kn);
+
+#else // CONFIG_VE
+
+void kernfs_get_ve_perms(struct kernfs_node *kn) { }
+void kernfs_put_ve_perms(struct kernfs_node *kn) { }
+
+static inline int kernfs_ve_permission(struct kernfs_node *kn,
+				 struct kernfs_super_info *info, int mask)
+{
+	return 0;
+}
+
+static inline int kernfs_ve_allowed(void)
+{
+	return 1;
+}
+
+#endif
+
+#endif
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 9dc7e7a64e10..fd0ae3b16bcf 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -18,6 +18,7 @@
 #include <linux/exportfs.h>
 
 #include "kernfs-internal.h"
+#include "kernfs-ve.h"
 
 struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
 
diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
new file mode 100644
index 000000000000..94cd5913ca3a
--- /dev/null
+++ b/fs/kernfs/ve.c
@@ -0,0 +1,100 @@
+/*
+ *  fs/kernfs/ve.c
+ *
+ *  Copyright (c) 2000-2017 Virtuozzo International GmbH.
+ *  All rights reserved.
+ *
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include <linux/ve.h>
+#include <linux/kmapset.h>
+
+#include "kernfs-internal.h"
+
+#include "kernfs-ve.h"
+
+void kernfs_set_ve_perms(struct dentry *root, off_t key_off)
+{
+	struct kernfs_super_info *info = kernfs_info(root->d_sb);
+
+	info->ve_perms_off = key_off;
+	info->ve = get_exec_env();
+}
+
+int kernfs_init_ve_perms(struct kernfs_root *root,
+			 struct kmapset_set *perms_set)
+{
+	struct kernfs_node *kn = root->kn;
+
+	kmapset_init_set(perms_set);
+	kn->ve_perms_map = kmapset_new(perms_set);
+	if (!kn->ve_perms_map)
+		return -ENOMEM;
+	kmapset_commit(kn->ve_perms_map);
+
+	root->ve_perms_set = perms_set;
+	return 0;
+}
+
+int kernfs_ve_allowed(struct kernfs_node *kn)
+{
+	return !kn->ve_perms_map || ve_is_super(get_exec_env());
+}
+
+static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
+{
+	return (void *)get_exec_env() + info->ve_perms_off;
+}
+
+int kernfs_ve_permission(struct kernfs_node *kn,
+			 struct kernfs_super_info *info, int mask)
+{
+	struct kernfs_node *tmp_kn = kn;
+	int perm;
+
+	if (kernfs_ve_allowed(kn))
+		return 0;
+
+	/* Entries with namespace tag and their sub-entries always visible */
+	while (tmp_kn) {
+		if (tmp_kn->ns)
+			return 0;
+		tmp_kn = tmp_kn->parent;
+	}
+
+	if (kernfs_type(kn) == KERNFS_LINK)
+		kn = kn->symlink.target_kn;
+
+	perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
+	if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+		return 0;
+
+	return -EACCES;
+}
+
+void kernfs_get_ve_perms(struct kernfs_node *kn)
+{
+	struct kernfs_root *root = kernfs_root(kn);
+	struct kmapset_map *kms;
+
+	if (!root->ve_perms_set)
+		return;
+
+	kms = kmapset_new(root->ve_perms_set);
+	if (kms)
+		kn->ve_perms_map = kmapset_commit(kms);
+}
+
+void kernfs_put_ve_perms(struct kernfs_node *kn)
+{
+	if (kn->ve_perms_map)
+		kmapset_put(kn->ve_perms_map);
+}
diff --git a/include/linux/kernfs-ve.h b/include/linux/kernfs-ve.h
new file mode 100644
index 000000000000..325ff6a22f33
--- /dev/null
+++ b/include/linux/kernfs-ve.h
@@ -0,0 +1,31 @@
+/*
+ *  include/linux/kernfs-ve.h
+ *
+ *  Copyright (c) 2018-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __LINUX_KERNFS_VE_H
+#define __LINUX_KERNFS_VE_H
+
+#include <linux/kmapset.h>
+#include <linux/mutex.h>
+
+struct kernfs_root;
+struct dentry;
+
+#ifdef CONFIG_VE
+int kernfs_init_ve_perms(struct kernfs_root *root,
+			 struct kmapset_set *perms_set);
+void kernfs_set_ve_perms(struct dentry *root, off_t key_off);
+#else   /* CONFIG_VE */
+static inline int kernfs_init_ve_perms(struct kernfs_root *root,
+				       struct kmapset_set *perms_set)
+{
+	return 0;
+}
+static inline void kernfs_set_ve_perms(struct dentry *root,
+				       off_t key_off) { }
+#endif  /* CONFIG_VE */
+
+#endif  /* __LINUX_KERNFS_VE_H */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 9e8ca8743c26..147d1ef4cd21 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -156,6 +156,9 @@ struct kernfs_node {
 	unsigned short		flags;
 	umode_t			mode;
 	struct kernfs_iattrs	*iattr;
+#ifdef CONFIG_VE
+	struct kmapset_map	*ve_perms_map;
+#endif
 };
 
 /*
@@ -192,6 +195,9 @@ struct kernfs_root {
 	struct list_head	supers;
 
 	wait_queue_head_t	deactivate_waitq;
+#ifdef CONFIG_VE
+	struct kmapset_set	*ve_perms_set;
+#endif
 };
 
 struct kernfs_open_file {


More information about the Devel mailing list