[Devel] [PATCH RHEL8 COMMIT] ve/devtmpfs: lightweight virtualization

Konstantin Khorenko khorenko at virtuozzo.com
Mon Jul 19 19:02:55 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-305.3.1.vz8.7.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-305.3.1.el8
------>
commit 67564051ee761ed3764d21d575f9617ee23c4a51
Author: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Date:   Mon Jul 19 19:02:55 2021 +0300

    ve/devtmpfs: lightweight virtualization
    
    Due to changes in RH8.4 we need to rework it; actually the logic
    becomes much simpler: we mount/umount a single tmpfs per ve on cgroup
    creation/removal, and all actual devtmpfs mount calls only increase a
    refcount on the corresponding ve's mount, like with the host's devtmpfs.
    
    Original commit message:
    
    Previously, we implemented full-featured devtmpfs virtualization for
    VE: when a device is created in a VE "namespace", we send a signal to
    kdevtmpfs to create the devnode on devtmpfs mount corresponding to the
    VE. This seems to be over-complicated: all this work can be done from
    userspace, because we only have a hardcoded list of devices created
    exclusively for VE on container start. Those are tty-related stuff and
    mem devices, and we only need the latter to create devtmpfs nodes.
    Moreover, it is buggy: ve_stop_ns, which destroys VE devtmpfs mount can
    be called before a VE tty device is unregistered, resulting in a KP:
    
    https://jira.sw.ru/browse/PSBM-35077
    
    This patch therefore simplifies it. It makes the kernel only provide a
    single empty tmpfs mount per VE, which appears on an attempt to mount
    devtmpfs from inside a VE. The content of the fs is to be filled by the
    userspace on container start, which will be done in the scope of
    
    https://jira.sw.ru/browse/PSBM-35146
    
    All this patch does is provide each VE with its own empty single tmpfs
    mount, which appears on an attempt to mount "devtmpfs". It's up to the
    userspace to populate this fs on container start, all kernel requests to
    create a device node inside a VE are ignored.
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-131158
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    v2 by khorenko@: s/FS_USERNS_MOUNT/FS_VE_MOUNT/
---
 drivers/base/devtmpfs.c | 25 +++++++++++++++++++++++++
 fs/namespace.c          |  1 +
 include/linux/device.h  |  2 ++
 include/linux/ve.h      |  3 +++
 kernel/ve/ve.c          |  6 ++++++
 5 files changed, 37 insertions(+)

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index fc7b883f36c9..a967a49b9f14 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/kthread.h>
 #include <uapi/linux/mount.h>
+#include <linux/ve.h>
 #include "base.h"
 
 static struct task_struct *thread;
@@ -62,6 +63,13 @@ static struct dentry *public_dev_mount(struct file_system_type *fs_type, int fla
 		      const char *dev_name, void *data)
 {
 	struct super_block *s = mnt->mnt_sb;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+
+	if (!ve_is_super(ve))
+		s = ve->devtmpfs_mnt->mnt_sb;
+#endif
+
 	atomic_inc(&s->s_active);
 	down_write(&s->s_umount);
 	return dget(s->s_root);
@@ -82,6 +90,7 @@ static struct file_system_type internal_fs_type = {
 static struct file_system_type dev_fs_type = {
 	.name = "devtmpfs",
 	.mount = public_dev_mount,
+	.fs_flags = FS_VIRTUALIZED | FS_VE_MOUNT,
 };
 
 #ifdef CONFIG_BLOCK
@@ -425,6 +434,22 @@ static int devtmpfsd(void *p)
 	return *err;
 }
 
+int ve_mount_devtmpfs(struct ve_struct *ve)
+{
+	char opts[] = "mode=0755";
+	struct vfsmount *mnt;
+
+	mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts);
+	if (IS_ERR(mnt)) {
+		printk(KERN_ERR "CT#%s: devtmpfs: unable to create devtmpfs %ld\n",
+		       ve_name(ve), PTR_ERR(mnt));
+		return PTR_ERR(mnt);
+	}
+	ve->devtmpfs_mnt = mnt;
+
+	return 0;
+}
+
 /*
  * Create devtmpfs instance, driver-core devices will add their device
  * nodes here.
diff --git a/fs/namespace.c b/fs/namespace.c
index 9128029f5a78..2009130cd51e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -30,6 +30,7 @@
 #include <uapi/linux/mount.h>
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
+#include <linux/mount.h>
 
 #include <linux/ve.h>
 
diff --git a/include/linux/device.h b/include/linux/device.h
index c1630a5dec50..b0da526490ce 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1665,10 +1665,12 @@ extern bool kill_device(struct device *dev);
 extern int devtmpfs_create_node(struct device *dev);
 extern int devtmpfs_delete_node(struct device *dev);
 extern int devtmpfs_mount(const char *mntdir);
+extern int ve_mount_devtmpfs(struct ve_struct *ve);
 #else
 static inline int devtmpfs_create_node(struct device *dev) { return 0; }
 static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
 static inline int devtmpfs_mount(const char *mountpoint) { return 0; }
+static inline int ve_mount_devtmpfs(struct ve_struct *ve) { return 0; }
 #endif
 
 /* drivers/base/power/shutdown.c */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index b17868ba86c3..d3f77467cdf6 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -22,6 +22,7 @@
 struct nsproxy;
 struct veip_struct;
 struct user_namespace;
+struct vfsmount;
 
 struct ve_struct {
 	struct cgroup_subsys_state	css;
@@ -120,6 +121,8 @@ struct ve_struct {
 	 */
 	struct list_head	per_cgroot_list;
 	spinlock_t		per_cgroot_list_lock;
+
+	struct vfsmount		*devtmpfs_mnt;
 };
 
 struct ve_devmnt {
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index fee86917d624..bb9b99c58992 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -31,6 +31,7 @@
 #include <linux/ctype.h>
 #include <linux/tty.h>
 #include <linux/genhd.h>
+#include <linux/device.h>
 
 #include <uapi/linux/vzcalluser.h>
 #include <net/rtnetlink.h>
@@ -910,6 +911,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	if (copy_vdso(&ve->vdso_32, &vdso_image_32))
 		goto err_vdso;
 
+	err = ve_mount_devtmpfs(ve);
+	if (err)
+		goto err_vdso;
+
 	ve->features = VE_FEATURES_DEF;
 
 	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
@@ -1021,6 +1026,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
 	kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
 	ve_log_destroy(ve);
 	ve_free_vdso(ve);
+	mntput(ve->devtmpfs_mnt);
 #if IS_ENABLED(CONFIG_BINFMT_MISC)
 	kfree(ve->binfmt_misc);
 #endif


More information about the Devel mailing list