[Devel] [PATCH RH9 02/13] ve/kernfs: implement ve-based permissions

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Tue Sep 21 19:04:20 MSK 2021


From: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>

Here are the main ideas:
1) Store pointer to 've' on kernfs superblock info.
2) Keep on each kernfs node kmapset mapping (ve -> permissions).
3) Store kmapset key on each ve (used to check dentry visibility).
4) Store key offset in ve_struct on each kernfs superblock info to be able to
get the key pointer by current VE.

And do not inherit ve permissions from parent.
Otherwise when a new ploop is created, all containers that have access
to devices/virtual/block will gain access to the new ploop too, which is
a security breach.

https://jira.sw.ru/browse/PSBM-20892
https://jira.sw.ru/browse/PSBM-34682

Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>

+++
fs/kernfs: remove bogus kernfs_test_ve() check

cgroups now also use kernfs subsystem. kernfs_test_ve() check breaks
it when we try to mount cgroup from ve.

cgroup1_mount() tries to reuse the existing superblock by calling
kernfs_pin_sb to pin it during mount operation

cgroup1_mount():
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		...

Then it calls cgroup_do_mount() which fails to reuse the existing
superblock because of failed kernfs_test_ve() check, so instead
it creates a new one:

	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
				 CGROUP_SUPER_MAGIC, ns);

So now we have two superblocks instead of one which may lead to
the double percpu_ref_kill_and_confirm() call. E.g. :

umount /sys/fs/cgroup/rdma
vzctl exec CTID umount /sys/fs/cgroup/rdma

	------------[ cut here ]------------
	percpu_ref_kill_and_confirm called more than once on css_release!
	WARNING: CPU: 2 PID: 2664 at lib/percpu-refcount.c:336 percpu_ref_kill_and_confirm+0x7c/0xa0
	CPU: 2 PID: 2664 Comm: kworker/2:5 ve: / Tainted: G                 --------- -t - 4.18.0-80.1.2.vz8.3.12 #1 3.12
	RIP: 0010:percpu_ref_kill_and_confirm+0x7c/0xa0
	Call Trace:
	 cgroup_kill_sb+0x64/0x90
	 deactivate_locked_super+0x34/0x70
	 cleanup_mnt+0x3b/0x70
	 delayed_mntput+0x26/0x40
	 process_one_work+0x1a7/0x360
	 worker_thread+0x30/0x390
	 kthread+0x112/0x130
	 ret_from_fork+0x35/0x40
	---[ end trace 0cb674141e1a86e3 ]---
	CT: d3cd4dca-ce04-475d-b5d5-c9ef1f24642a: stopped

Remove the kernfs_test_ve() check to fix this.

As for sysfs which is also uses kernfs, I don't think it gonna cause
something bad. Sysfs uses net namespaces (info->ns && sb_info->ns) which
should be enough for us. We enter CT's net ns before mounting sysfs in CT.

Also note that commands like "vzctl set 101 --netif_add eth0 --save"
calls ip util (which mount sysfs) from ve0 and CT's net namespace.
So with the removal of kernfs_test_ve() check kernfs_test_super()
now return success instead of fail as before. Which is totally
fine IMO, as we reuse the existing CT's sysfs superblock
instead of creating a new one.

mFixes: c5cf56e9cb77 ("ve/kernfs: implement ve-based permissions")

https://jira.sw.ru/browse/PSBM-104902
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

(cherry-picked from vz8 commit 1f6295629f7325f18e91df95df27a2b2b99a328d)
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 fs/kernfs/Makefile          |   1 +
 fs/kernfs/dir.c             |   3 ++
 fs/kernfs/inode.c           |   8 ++-
 fs/kernfs/kernfs-internal.h |   4 ++
 fs/kernfs/kernfs-ve.h       |  44 ++++++++++++++++
 fs/kernfs/mount.c           |   1 +
 fs/kernfs/ve.c              | 100 ++++++++++++++++++++++++++++++++++++
 include/linux/kernfs-ve.h   |  32 ++++++++++++
 include/linux/kernfs.h      |   6 +++
 9 files changed, 198 insertions(+), 1 deletion(-)
 create mode 100644 fs/kernfs/kernfs-ve.h
 create mode 100644 fs/kernfs/ve.c
 create mode 100644 include/linux/kernfs-ve.h

diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
index 4ca54ff54c98..d3dfb0720fec 100644
--- a/fs/kernfs/Makefile
+++ b/fs/kernfs/Makefile
@@ -4,3 +4,4 @@
 #
 
 obj-y		:= mount.o inode.o dir.o file.o symlink.o
+obj-$(CONFIG_VE) += ve.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 33166ec90a11..f77ca8e1a301 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -16,6 +16,7 @@
 #include <linux/hash.h>
 
 #include "kernfs-internal.h"
+#include "kernfs-ve.h"
 
 DEFINE_MUTEX(kernfs_mutex);
 static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
@@ -531,6 +532,7 @@ void kernfs_put(struct kernfs_node *kn)
 		simple_xattrs_free(&kn->iattr->xattrs);
 		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
 	}
+	kernfs_put_ve_perms(kn);
 	spin_lock(&kernfs_idr_lock);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
 	spin_unlock(&kernfs_idr_lock);
@@ -750,6 +752,7 @@ int kernfs_add_one(struct kernfs_node *kn)
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
+	kernfs_get_ve_perms(kn);
 
 	mutex_unlock(&kernfs_mutex);
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 26f2aa3586f9..14c826798647 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 
 #include "kernfs-internal.h"
+#include "kernfs-ve.h"
 
 static const struct inode_operations kernfs_iops = {
 	.permission	= kernfs_iop_permission,
@@ -116,6 +117,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (!kn)
 		return -EINVAL;
 
+	if (!kernfs_ve_allowed(kn))
+		return -EPERM;
+
 	mutex_lock(&kernfs_mutex);
 	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
@@ -272,6 +276,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 			  struct inode *inode, int mask)
 {
 	struct kernfs_node *kn;
+	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
@@ -279,10 +284,11 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 	kn = inode->i_private;
 
 	mutex_lock(&kernfs_mutex);
+	ret = kernfs_ve_permission(kn, kernfs_info(inode->i_sb), mask);
 	kernfs_refresh_inode(kn, inode);
 	mutex_unlock(&kernfs_mutex);
 
-	return generic_permission(&init_user_ns, inode, mask);
+	return ret ? ret : generic_permission(&init_user_ns, inode, mask);
 }
 
 int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index ccc3b44f6306..b457b7a19015 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -71,6 +71,10 @@ struct kernfs_super_info {
 
 	/* anchored at kernfs_root->supers, protected by kernfs_mutex */
 	struct list_head	node;
+#ifdef CONFIG_VE
+	struct ve_struct	*ve;
+	off_t			ve_perms_off;
+#endif
 };
 #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
 
diff --git a/fs/kernfs/kernfs-ve.h b/fs/kernfs/kernfs-ve.h
new file mode 100644
index 000000000000..2420af5d5091
--- /dev/null
+++ b/fs/kernfs/kernfs-ve.h
@@ -0,0 +1,44 @@
+/*
+ *  fs/kernfs/kernfs-ve.h
+ *
+ *  Copyright (c) 2018-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __KERNFS_VE_H
+#define __KERNFS_VE_H
+
+struct kernfs_root;
+struct kernfs_super_info;
+struct kernfs_node;
+struct kmapset;
+
+#ifdef CONFIG_VE
+
+void kernfs_get_ve_perms(struct kernfs_node *kn);
+void kernfs_put_ve_perms(struct kernfs_node *kn);
+
+int kernfs_ve_permission(struct kernfs_node *kn,
+			 struct kernfs_super_info *info, int mask);
+
+int kernfs_ve_allowed(struct kernfs_node *kn);
+
+#else // CONFIG_VE
+
+void kernfs_get_ve_perms(struct kernfs_node *kn) { }
+void kernfs_put_ve_perms(struct kernfs_node *kn) { }
+
+static inline int kernfs_ve_permission(struct kernfs_node *kn,
+				 struct kernfs_super_info *info, int mask)
+{
+	return 0;
+}
+
+static inline int kernfs_ve_allowed(void)
+{
+	return 1;
+}
+
+#endif
+
+#endif
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 9dc7e7a64e10..fd0ae3b16bcf 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -18,6 +18,7 @@
 #include <linux/exportfs.h>
 
 #include "kernfs-internal.h"
+#include "kernfs-ve.h"
 
 struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
 
diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
new file mode 100644
index 000000000000..94cd5913ca3a
--- /dev/null
+++ b/fs/kernfs/ve.c
@@ -0,0 +1,100 @@
+/*
+ *  fs/kernfs/ve.c
+ *
+ *  Copyright (c) 2000-2017 Virtuozzo International GmbH.
+ *  All rights reserved.
+ *
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include <linux/ve.h>
+#include <linux/kmapset.h>
+
+#include "kernfs-internal.h"
+
+#include "kernfs-ve.h"
+
+void kernfs_set_ve_perms(struct dentry *root, off_t key_off)
+{
+	struct kernfs_super_info *info = kernfs_info(root->d_sb);
+
+	info->ve_perms_off = key_off;
+	info->ve = get_exec_env();
+}
+
+int kernfs_init_ve_perms(struct kernfs_root *root,
+			 struct kmapset_set *perms_set)
+{
+	struct kernfs_node *kn = root->kn;
+
+	kmapset_init_set(perms_set);
+	kn->ve_perms_map = kmapset_new(perms_set);
+	if (!kn->ve_perms_map)
+		return -ENOMEM;
+	kmapset_commit(kn->ve_perms_map);
+
+	root->ve_perms_set = perms_set;
+	return 0;
+}
+
+int kernfs_ve_allowed(struct kernfs_node *kn)
+{
+	return !kn->ve_perms_map || ve_is_super(get_exec_env());
+}
+
+static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
+{
+	return (void *)get_exec_env() + info->ve_perms_off;
+}
+
+int kernfs_ve_permission(struct kernfs_node *kn,
+			 struct kernfs_super_info *info, int mask)
+{
+	struct kernfs_node *tmp_kn = kn;
+	int perm;
+
+	if (kernfs_ve_allowed(kn))
+		return 0;
+
+	/* Entries with namespace tag and their sub-entries always visible */
+	while (tmp_kn) {
+		if (tmp_kn->ns)
+			return 0;
+		tmp_kn = tmp_kn->parent;
+	}
+
+	if (kernfs_type(kn) == KERNFS_LINK)
+		kn = kn->symlink.target_kn;
+
+	perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
+	if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+		return 0;
+
+	return -EACCES;
+}
+
+void kernfs_get_ve_perms(struct kernfs_node *kn)
+{
+	struct kernfs_root *root = kernfs_root(kn);
+	struct kmapset_map *kms;
+
+	if (!root->ve_perms_set)
+		return;
+
+	kms = kmapset_new(root->ve_perms_set);
+	if (kms)
+		kn->ve_perms_map = kmapset_commit(kms);
+}
+
+void kernfs_put_ve_perms(struct kernfs_node *kn)
+{
+	if (kn->ve_perms_map)
+		kmapset_put(kn->ve_perms_map);
+}
diff --git a/include/linux/kernfs-ve.h b/include/linux/kernfs-ve.h
new file mode 100644
index 000000000000..791d6fb2e8ad
--- /dev/null
+++ b/include/linux/kernfs-ve.h
@@ -0,0 +1,32 @@
+/*
+ *  include/linux/kernfs-ve.h
+ *
+ *  Copyright (c) 2018-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __LINUX_KERNFS_VE_H
+#define __LINUX_KERNFS_VE_H
+
+#include <linux/kmapset.h>
+#include <linux/mutex.h>
+
+struct kernfs_root;
+struct dentry;
+
+#ifdef CONFIG_VE
+int kernfs_init_ve_perms(struct kernfs_root *root,
+			 struct kmapset_set *perms_set);
+void kernfs_set_ve_perms(struct dentry *root, off_t key_off);
+#else   /* CONFIG_VE */
+static inline int kernfs_init_ve_perms(struct kernfs_root *root,
+				       struct kmapset_set *perms_set)
+{
+	return 0;
+}
+static inline void kernfs_set_ve_perms(struct dentry *root,
+				       off_t key_off) { }
+#endif  /* CONFIG_VE */
+
+#endif  /* __LINUX_KERNFS_VE_H */
+
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 9e8ca8743c26..147d1ef4cd21 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -156,6 +156,9 @@ struct kernfs_node {
 	unsigned short		flags;
 	umode_t			mode;
 	struct kernfs_iattrs	*iattr;
+#ifdef CONFIG_VE
+	struct kmapset_map	*ve_perms_map;
+#endif
 };
 
 /*
@@ -192,6 +195,9 @@ struct kernfs_root {
 	struct list_head	supers;
 
 	wait_queue_head_t	deactivate_waitq;
+#ifdef CONFIG_VE
+	struct kmapset_set	*ve_perms_set;
+#endif
 };
 
 struct kernfs_open_file {
-- 
2.31.1



More information about the Devel mailing list