[Devel] [PATCH RHEL7 COMMIT] ve/devtmpfs: lightweight virtualization

Konstantin Khorenko khorenko at virtuozzo.com
Fri Aug 28 05:10:59 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.6.3
------>
commit 22255fb606cfd53fb98b11c62b854c0de5a4c713
Author: Vladimir Davydov <vdavydov at parallels.com>
Date:   Fri Aug 28 16:10:59 2015 +0400

    ve/devtmpfs: lightweight virtualization
    
    Patchset description:
    
    Rework devtmpfs virtualization
    
    Currently, we implement full-featured devtmpfs virtualization for VE:
    when a device is created in a VE "namespace", we send a signal to
    kdevtmpfs to create the devnode on the devtmpfs mount corresponding to
    the VE. This seems to be over-complicated: all this work can be done
    from userspace, because we only have a hardcoded list of devices created
    exclusively for VE on container start. Those are tty-related devices and
    mem devices, and devtmpfs nodes need to be created only for the latter.
    Moreover, it is buggy: ve_stop_ns, which destroys the VE devtmpfs mount,
    can be called before a VE tty device is unregistered, resulting in a
    kernel panic (KP):
    
    https://jira.sw.ru/browse/PSBM-35077
    
    This patch therefore simplifies it. It makes the kernel only provide a
    single empty tmpfs mount per VE, which appears on an attempt to mount
    devtmpfs from inside a VE. The content of the fs is to be filled by the
    userspace on container start, which will be done in the scope of
    
    https://jira.sw.ru/browse/PSBM-35146
    
    Vladimir Davydov (6):
      Revert "ve/devtmpfs: Create required devices on container startup"
      Revert "ve/devtmpfs: pass proper options string"
      Revert "devtmpfs: containerize it with new obj ns operation"
      Revert "fs: add data pointer to mount_ns()"
      Revert "devtmpfs: per-VE mounts introduced"
      devtmpfs: lightweight virtualization
    
    Reviewed-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
    
    ===
    This patch description:
    
    All this patch does is provide each VE with its own single, empty tmpfs
    mount, which appears on an attempt to mount "devtmpfs". It's up to
    userspace to populate this fs on container start; all kernel requests to
    create a device node inside a VE are ignored.
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 drivers/base/devtmpfs.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ve.h      |  1 +
 kernel/ve/ve.c          |  4 +++
 3 files changed, 72 insertions(+)

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index f59b798..daf97ee 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -23,6 +23,7 @@
 #include <linux/ramfs.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/ve.h>
 #include "base.h"
 
 static struct task_struct *thread;
@@ -53,9 +54,61 @@ static int __init mount_param(char *str)
 }
 __setup("devtmpfs.mount=", mount_param);
 
+#ifdef CONFIG_VE
+static int ve_test_dev_sb(struct super_block *s, void *p)
+{
+	return get_exec_env()->dev_sb == s;
+}
+
+static int ve_set_dev_sb(struct super_block *s, void *p)
+{
+	struct ve_struct *ve = get_exec_env();
+	int error;
+
+	error = set_anon_super(s, p);
+	if (!error) {
+		BUG_ON(ve->dev_sb);
+		ve->dev_sb = s;
+		atomic_inc(&s->s_active);
+	}
+	return error;
+}
+
+static struct dentry *ve_dev_mount(struct file_system_type *fs_type, int flags,
+		      const char *dev_name, void *data)
+{
+	int (*fill_super)(struct super_block *, void *, int);
+	struct super_block *s;
+	int error;
+
+#ifdef CONFIG_TMPFS
+	fill_super = shmem_fill_super;
+#else
+	fill_super = ramfs_fill_super;
+#endif
+	s = sget(fs_type, ve_test_dev_sb, ve_set_dev_sb, flags, NULL);
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+
+	if (!s->s_root) {
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(s);
+			return ERR_PTR(error);
+		}
+		s->s_flags |= MS_ACTIVE;
+	}
+	return dget(s->s_root);
+}
+#endif /* CONFIG_VE */
+
 static struct dentry *dev_mount(struct file_system_type *fs_type, int flags,
 		      const char *dev_name, void *data)
 {
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		return ve_dev_mount(fs_type, flags, dev_name, data);
+#endif
 #ifdef CONFIG_TMPFS
 	return mount_single(fs_type, flags, data, shmem_fill_super);
 #else
@@ -79,6 +132,16 @@ static inline int is_blockdev(struct device *dev)
 static inline int is_blockdev(struct device *dev) { return 0; }
 #endif
 
+#ifdef CONFIG_VE
+static inline int is_ve_dev(struct device *dev)
+{
+	return dev->class && dev->class->namespace == ve_namespace &&
+		ve_namespace(dev) != get_ve0();
+}
+#else
+static inline int is_ve_dev(struct device *dev) { return 0; }
+#endif
+
 int devtmpfs_create_node(struct device *dev)
 {
 	const char *tmp = NULL;
@@ -86,6 +149,8 @@ int devtmpfs_create_node(struct device *dev)
 
 	if (!thread)
 		return 0;
+	if (is_ve_dev(dev))
+		return 0;
 
 	req.mode = 0;
 	req.uid = GLOBAL_ROOT_UID;
@@ -125,6 +190,8 @@ int devtmpfs_delete_node(struct device *dev)
 
 	if (!thread)
 		return 0;
+	if (is_ve_dev(dev))
+		return 0;
 
 	req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp);
 	if (!req.name)
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 84be823..82a840a 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -60,6 +60,7 @@ struct ve_struct {
 /* VE's root */
 	struct path		root_path;
 
+	struct super_block	*dev_sb;
 	struct super_block	*devpts_sb;
 
 #if IS_ENABLED(CONFIG_BINFMT_MISC)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 536a83f..bdfa30d 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -532,6 +532,10 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	 * At this point all userspace tasks in container are dead.
 	 */
 
+	if (ve->dev_sb) {
+		deactivate_super(ve->dev_sb);
+		ve->dev_sb = NULL;
+	}
 	if (ve->devpts_sb) {
 		deactivate_super(ve->devpts_sb);
 		ve->devpts_sb = NULL;



More information about the Devel mailing list