[Devel] [PATCH RHEL COMMIT] ve/kernfs: implement ve-based permissions
Konstantin Khorenko
khorenko at virtuozzo.com
Wed Sep 22 14:50:52 MSK 2021
The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 52a22d0044ca6a8eb24b3629bdb4da2c83bdb400
Author: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Date: Wed Sep 22 14:50:52 2021 +0300
ve/kernfs: implement ve-based permissions
Here are the main ideas:
1) Store pointer to 've' on kernfs superblock info.
2) Keep on each kernfs node kmapset mapping (ve -> permissions).
3) Store kmapset key on each ve (used to check dentry visibility).
4) Store key offset in ve_struct on each kernfs superblock info to be able to
get the key pointer by current VE.
And do not inherit ve permissions from parent.
Otherwise when a new ploop is created, all containers that have access
to devices/virtual/block will gain access to the new ploop too, which is
a security breach.
https://jira.sw.ru/browse/PSBM-20892
https://jira.sw.ru/browse/PSBM-34682
Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
+++
fs/kernfs: remove bogus kernfs_test_ve() check
cgroups now also use kernfs subsystem. kernfs_test_ve() check breaks
it when we try to mount cgroup from ve.
cgroup1_mount() tries to reuse the existing superblock by calling
kernfs_pin_sb to pin it during mount operation
cgroup1_mount():
pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
...
Then it calls cgroup_do_mount() which fails to reuse the existing
superblock because of failed kernfs_test_ve() check, so instead
it creates a new one:
dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
CGROUP_SUPER_MAGIC, ns);
So now we have two superblocks instead of one which may lead to
the double percpu_ref_kill_and_confirm() call. E.g. :
umount /sys/fs/cgroup/rdma
vzctl exec CTID umount /sys/fs/cgroup/rdma
------------[ cut here ]------------
percpu_ref_kill_and_confirm called more than once on css_release!
WARNING: CPU: 2 PID: 2664 at lib/percpu-refcount.c:336 percpu_ref_kill_and_confirm+0x7c/0xa0
CPU: 2 PID: 2664 Comm: kworker/2:5 ve: / Tainted: G --------- -t - 4.18.0-80.1.2.vz8.3.12 #1 3.12
RIP: 0010:percpu_ref_kill_and_confirm+0x7c/0xa0
Call Trace:
cgroup_kill_sb+0x64/0x90
deactivate_locked_super+0x34/0x70
cleanup_mnt+0x3b/0x70
delayed_mntput+0x26/0x40
process_one_work+0x1a7/0x360
worker_thread+0x30/0x390
kthread+0x112/0x130
ret_from_fork+0x35/0x40
---[ end trace 0cb674141e1a86e3 ]---
CT: d3cd4dca-ce04-475d-b5d5-c9ef1f24642a: stopped
Remove the kernfs_test_ve() check to fix this.
As for sysfs which is also uses kernfs, I don't think it gonna cause
something bad. Sysfs uses net namespaces (info->ns && sb_info->ns) which
should be enough for us. We enter CT's net ns before mounting sysfs in CT.
Also note that commands like "vzctl set 101 --netif_add eth0 --save"
calls ip util (which mount sysfs) from ve0 and CT's net namespace.
So with the removal of kernfs_test_ve() check kernfs_test_super()
now return success instead of fail as before. Which is totally
fine IMO, as we reuse the existing CT's sysfs superblock
instead of creating a new one.
mFixes: c5cf56e9cb77 ("ve/kernfs: implement ve-based permissions")
https://jira.sw.ru/browse/PSBM-104902
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
(cherry-picked from vz8 commit 1f6295629f7325f18e91df95df27a2b2b99a328d)
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
fs/kernfs/Makefile | 1 +
fs/kernfs/dir.c | 3 ++
fs/kernfs/inode.c | 8 +++-
fs/kernfs/kernfs-internal.h | 4 ++
fs/kernfs/kernfs-ve.h | 44 +++++++++++++++++++
fs/kernfs/mount.c | 1 +
fs/kernfs/ve.c | 100 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/kernfs-ve.h | 31 ++++++++++++++
include/linux/kernfs.h | 6 +++
9 files changed, 197 insertions(+), 1 deletion(-)
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
index 4ca54ff54c98..d3dfb0720fec 100644
--- a/fs/kernfs/Makefile
+++ b/fs/kernfs/Makefile
@@ -4,3 +4,4 @@
#
obj-y := mount.o inode.o dir.o file.o symlink.o
+obj-$(CONFIG_VE) += ve.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 33166ec90a11..f77ca8e1a301 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -16,6 +16,7 @@
#include <linux/hash.h>
#include "kernfs-internal.h"
+#include "kernfs-ve.h"
DEFINE_MUTEX(kernfs_mutex);
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
@@ -531,6 +532,7 @@ void kernfs_put(struct kernfs_node *kn)
simple_xattrs_free(&kn->iattr->xattrs);
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
+ kernfs_put_ve_perms(kn);
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
@@ -750,6 +752,7 @@ int kernfs_add_one(struct kernfs_node *kn)
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ kernfs_get_ve_perms(kn);
mutex_unlock(&kernfs_mutex);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 26f2aa3586f9..14c826798647 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -16,6 +16,7 @@
#include <linux/security.h>
#include "kernfs-internal.h"
+#include "kernfs-ve.h"
static const struct inode_operations kernfs_iops = {
.permission = kernfs_iop_permission,
@@ -116,6 +117,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (!kn)
return -EINVAL;
+ if (!kernfs_ve_allowed(kn))
+ return -EPERM;
+
mutex_lock(&kernfs_mutex);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
@@ -272,6 +276,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
struct kernfs_node *kn;
+ int ret;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
@@ -279,10 +284,11 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
kn = inode->i_private;
mutex_lock(&kernfs_mutex);
+ ret = kernfs_ve_permission(kn, kernfs_info(inode->i_sb), mask);
kernfs_refresh_inode(kn, inode);
mutex_unlock(&kernfs_mutex);
- return generic_permission(&init_user_ns, inode, mask);
+ return ret ? ret : generic_permission(&init_user_ns, inode, mask);
}
int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index ccc3b44f6306..b457b7a19015 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -71,6 +71,10 @@ struct kernfs_super_info {
/* anchored at kernfs_root->supers, protected by kernfs_mutex */
struct list_head node;
+#ifdef CONFIG_VE
+ struct ve_struct *ve;
+ off_t ve_perms_off;
+#endif
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
diff --git a/fs/kernfs/kernfs-ve.h b/fs/kernfs/kernfs-ve.h
new file mode 100644
index 000000000000..2420af5d5091
--- /dev/null
+++ b/fs/kernfs/kernfs-ve.h
@@ -0,0 +1,44 @@
+/*
+ * fs/kernfs/kernfs-ve.h
+ *
+ * Copyright (c) 2018-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __KERNFS_VE_H
+#define __KERNFS_VE_H
+
+struct kernfs_root;
+struct kernfs_super_info;
+struct kernfs_node;
+struct kmapset;
+
+#ifdef CONFIG_VE
+
+void kernfs_get_ve_perms(struct kernfs_node *kn);
+void kernfs_put_ve_perms(struct kernfs_node *kn);
+
+int kernfs_ve_permission(struct kernfs_node *kn,
+ struct kernfs_super_info *info, int mask);
+
+int kernfs_ve_allowed(struct kernfs_node *kn);
+
+#else // CONFIG_VE
+
+void kernfs_get_ve_perms(struct kernfs_node *kn) { }
+void kernfs_put_ve_perms(struct kernfs_node *kn) { }
+
+static inline int kernfs_ve_permission(struct kernfs_node *kn,
+ struct kernfs_super_info *info, int mask)
+{
+ return 0;
+}
+
+static inline int kernfs_ve_allowed(void)
+{
+ return 1;
+}
+
+#endif
+
+#endif
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 9dc7e7a64e10..fd0ae3b16bcf 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -18,6 +18,7 @@
#include <linux/exportfs.h>
#include "kernfs-internal.h"
+#include "kernfs-ve.h"
struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
new file mode 100644
index 000000000000..94cd5913ca3a
--- /dev/null
+++ b/fs/kernfs/ve.c
@@ -0,0 +1,100 @@
+/*
+ * fs/kernfs/ve.c
+ *
+ * Copyright (c) 2000-2017 Virtuozzo International GmbH.
+ * All rights reserved.
+ *
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include <linux/ve.h>
+#include <linux/kmapset.h>
+
+#include "kernfs-internal.h"
+
+#include "kernfs-ve.h"
+
+void kernfs_set_ve_perms(struct dentry *root, off_t key_off)
+{
+ struct kernfs_super_info *info = kernfs_info(root->d_sb);
+
+ info->ve_perms_off = key_off;
+ info->ve = get_exec_env();
+}
+
+int kernfs_init_ve_perms(struct kernfs_root *root,
+ struct kmapset_set *perms_set)
+{
+ struct kernfs_node *kn = root->kn;
+
+ kmapset_init_set(perms_set);
+ kn->ve_perms_map = kmapset_new(perms_set);
+ if (!kn->ve_perms_map)
+ return -ENOMEM;
+ kmapset_commit(kn->ve_perms_map);
+
+ root->ve_perms_set = perms_set;
+ return 0;
+}
+
+int kernfs_ve_allowed(struct kernfs_node *kn)
+{
+ return !kn->ve_perms_map || ve_is_super(get_exec_env());
+}
+
+static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
+{
+ return (void *)get_exec_env() + info->ve_perms_off;
+}
+
+int kernfs_ve_permission(struct kernfs_node *kn,
+ struct kernfs_super_info *info, int mask)
+{
+ struct kernfs_node *tmp_kn = kn;
+ int perm;
+
+ if (kernfs_ve_allowed(kn))
+ return 0;
+
+ /* Entries with namespace tag and their sub-entries always visible */
+ while (tmp_kn) {
+ if (tmp_kn->ns)
+ return 0;
+ tmp_kn = tmp_kn->parent;
+ }
+
+ if (kernfs_type(kn) == KERNFS_LINK)
+ kn = kn->symlink.target_kn;
+
+ perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
+ if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+ return 0;
+
+ return -EACCES;
+}
+
+void kernfs_get_ve_perms(struct kernfs_node *kn)
+{
+ struct kernfs_root *root = kernfs_root(kn);
+ struct kmapset_map *kms;
+
+ if (!root->ve_perms_set)
+ return;
+
+ kms = kmapset_new(root->ve_perms_set);
+ if (kms)
+ kn->ve_perms_map = kmapset_commit(kms);
+}
+
+void kernfs_put_ve_perms(struct kernfs_node *kn)
+{
+ if (kn->ve_perms_map)
+ kmapset_put(kn->ve_perms_map);
+}
diff --git a/include/linux/kernfs-ve.h b/include/linux/kernfs-ve.h
new file mode 100644
index 000000000000..325ff6a22f33
--- /dev/null
+++ b/include/linux/kernfs-ve.h
@@ -0,0 +1,31 @@
+/*
+ * include/linux/kernfs-ve.h
+ *
+ * Copyright (c) 2018-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __LINUX_KERNFS_VE_H
+#define __LINUX_KERNFS_VE_H
+
+#include <linux/kmapset.h>
+#include <linux/mutex.h>
+
+struct kernfs_root;
+struct dentry;
+
+#ifdef CONFIG_VE
+int kernfs_init_ve_perms(struct kernfs_root *root,
+ struct kmapset_set *perms_set);
+void kernfs_set_ve_perms(struct dentry *root, off_t key_off);
+#else /* CONFIG_VE */
+static inline int kernfs_init_ve_perms(struct kernfs_root *root,
+ struct kmapset_set *perms_set)
+{
+ return 0;
+}
+static inline void kernfs_set_ve_perms(struct dentry *root,
+ off_t key_off) { }
+#endif /* CONFIG_VE */
+
+#endif /* __LINUX_KERNFS_VE_H */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 9e8ca8743c26..147d1ef4cd21 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -156,6 +156,9 @@ struct kernfs_node {
unsigned short flags;
umode_t mode;
struct kernfs_iattrs *iattr;
+#ifdef CONFIG_VE
+ struct kmapset_map *ve_perms_map;
+#endif
};
/*
@@ -192,6 +195,9 @@ struct kernfs_root {
struct list_head supers;
wait_queue_head_t deactivate_waitq;
+#ifdef CONFIG_VE
+ struct kmapset_set *ve_perms_set;
+#endif
};
struct kernfs_open_file {
More information about the Devel
mailing list