[Devel] [PATCH RHEL7 COMMIT] ve/devtmpfs: lightweight virtualization
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Aug 28 05:10:59 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.6.3
------>
commit 22255fb606cfd53fb98b11c62b854c0de5a4c713
Author: Vladimir Davydov <vdavydov at parallels.com>
Date: Fri Aug 28 16:10:59 2015 +0400
ve/devtmpfs: lightweight virtualization
Patchset description:
Rework devtmpfs virtualization
Currently, we implement full-featured devtmpfs virtualization for VE:
when a device is created in a VE "namespace", we send a signal to
kdevtmpfs to create the devnode on devtmpfs mount corresponding to the
VE. This seems to be over-complicated: all this work can be done from
userspace, because we only have a hardcoded list of devices created
exclusively for VE on container start. Those are tty-related devices and
mem devices, and only the latter need devtmpfs nodes to be created.
Moreover, it is buggy: ve_stop_ns, which destroys the VE devtmpfs mount, can
be called before a VE tty device is unregistered, resulting in a KP:
https://jira.sw.ru/browse/PSBM-35077
This patch therefore simplifies it. It makes the kernel only provide a
single empty tmpfs mount per VE, which appears on an attempt to mount
devtmpfs from inside a VE. The content of the fs is to be filled by the
userspace on container start, which will be done in the scope of
https://jira.sw.ru/browse/PSBM-35146
Vladimir Davydov (6):
Revert "ve/devtmpfs: Create required devices on container startup"
Revert "ve/devtmpfs: pass proper options string"
Revert "devtmpfs: containerize it with new obj ns operation"
Revert "fs: add data pointer to mount_ns()"
Revert "devtmpfs: per-VE mounts introduced"
devtmpfs: lightweight virtualization
Reviewed-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
===
This patch description:
All this patch does is provide each VE with its own single empty tmpfs
mount, which appears on an attempt to mount "devtmpfs". It's up to
userspace to populate this fs on container start; all kernel requests to
create a device node inside a VE are ignored.
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
drivers/base/devtmpfs.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/ve.h | 1 +
kernel/ve/ve.c | 4 +++
3 files changed, 72 insertions(+)
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index f59b798..daf97ee 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -23,6 +23,7 @@
#include <linux/ramfs.h>
#include <linux/slab.h>
#include <linux/kthread.h>
+#include <linux/ve.h>
#include "base.h"
static struct task_struct *thread;
@@ -53,9 +54,61 @@ static int __init mount_param(char *str)
}
__setup("devtmpfs.mount=", mount_param);
+#ifdef CONFIG_VE
+static int ve_test_dev_sb(struct super_block *s, void *p)
+{
+ return get_exec_env()->dev_sb == s;
+}
+
+static int ve_set_dev_sb(struct super_block *s, void *p)
+{
+ struct ve_struct *ve = get_exec_env();
+ int error;
+
+ error = set_anon_super(s, p);
+ if (!error) {
+ BUG_ON(ve->dev_sb);
+ ve->dev_sb = s;
+ atomic_inc(&s->s_active);
+ }
+ return error;
+}
+
+static struct dentry *ve_dev_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ int (*fill_super)(struct super_block *, void *, int);
+ struct super_block *s;
+ int error;
+
+#ifdef CONFIG_TMPFS
+ fill_super = shmem_fill_super;
+#else
+ fill_super = ramfs_fill_super;
+#endif
+ s = sget(fs_type, ve_test_dev_sb, ve_set_dev_sb, flags, NULL);
+ if (IS_ERR(s))
+ return ERR_CAST(s);
+
+ if (!s->s_root) {
+ error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ deactivate_locked_super(s);
+ return ERR_PTR(error);
+ }
+ s->s_flags |= MS_ACTIVE;
+ }
+ return dget(s->s_root);
+}
+#endif /* CONFIG_VE */
+
static struct dentry *dev_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
+#ifdef CONFIG_VE
+ if (!ve_is_super(get_exec_env()))
+ return ve_dev_mount(fs_type, flags, dev_name, data);
+#endif
#ifdef CONFIG_TMPFS
return mount_single(fs_type, flags, data, shmem_fill_super);
#else
@@ -79,6 +132,16 @@ static inline int is_blockdev(struct device *dev)
static inline int is_blockdev(struct device *dev) { return 0; }
#endif
+#ifdef CONFIG_VE
+static inline int is_ve_dev(struct device *dev)
+{
+ return dev->class && dev->class->namespace == ve_namespace &&
+ ve_namespace(dev) != get_ve0();
+}
+#else
+static inline int is_ve_dev(struct device *dev) { return 0; }
+#endif
+
int devtmpfs_create_node(struct device *dev)
{
const char *tmp = NULL;
@@ -86,6 +149,8 @@ int devtmpfs_create_node(struct device *dev)
if (!thread)
return 0;
+ if (is_ve_dev(dev))
+ return 0;
req.mode = 0;
req.uid = GLOBAL_ROOT_UID;
@@ -125,6 +190,8 @@ int devtmpfs_delete_node(struct device *dev)
if (!thread)
return 0;
+ if (is_ve_dev(dev))
+ return 0;
req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp);
if (!req.name)
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 84be823..82a840a 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -60,6 +60,7 @@ struct ve_struct {
/* VE's root */
struct path root_path;
+ struct super_block *dev_sb;
struct super_block *devpts_sb;
#if IS_ENABLED(CONFIG_BINFMT_MISC)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 536a83f..bdfa30d 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -532,6 +532,10 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
* At this point all userspace tasks in container are dead.
*/
+ if (ve->dev_sb) {
+ deactivate_super(ve->dev_sb);
+ ve->dev_sb = NULL;
+ }
if (ve->devpts_sb) {
deactivate_super(ve->devpts_sb);
ve->devpts_sb = NULL;
More information about the Devel
mailing list