[Devel] [PATCH rh7 v2] ve/devpts: Support per-VE mount namespace
Cyrill Gorcunov
gorcunov at virtuozzo.com
Tue Jul 21 10:23:10 PDT 2015
Modern systemd-based containers (such as fedora-21, centos-7) already
mount the initial devpts filesystem with the @newinstance option, but it
turned out that ubuntu-14 LTS doesn't, which makes the restore procedure
fail because we're using get_exec_env() as a namespace mark and the
kernel mounts a new superblock for the container internally. This is done
to isolate devpts between containers, but the isolation is incomplete.
Let's revert commits c77f3df733bfa8382a309edb6a381c7eaf9ded0c
and 2c27d20125f51256d59ec2b278a905321dc64914 to bring the code closer
to the native one, and introduce ve::_devpts_mnt to track the per-VE
devpts root superblock.
There is one additional problem we have to address: CRIU is aimed
at the vanilla kernel, which has no devpts superblock virtualization,
so when we dump a container we mark it as having the @newinstance
option. This causes the current kernel to allocate a new superblock
on restore instead of using the existing ve::_devpts_mnt.
To work around this, we track the first devpts mount call inside
the container and, regardless of the @newinstance option, hand the
caller our virtualized superblock.
https://jira.sw.ru/browse/PSBM-34931
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
CC: Andrey Vagin <avagin at virtuozzo.com>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
CC: Pavel Emelyanov <xemul at virtuozzo.com>
---
fs/devpts/inode.c | 92 ++++++++++++++++++++++++++++++++++++++--------
include/linux/devpts_fs.h | 6 +++
include/linux/ve.h | 5 ++
kernel/ve/ve.c | 15 +++++--
4 files changed, 96 insertions(+), 22 deletions(-)
Index: linux-pcs7.git/fs/devpts/inode.c
===================================================================
--- linux-pcs7.git.orig/fs/devpts/inode.c
+++ linux-pcs7.git/fs/devpts/inode.c
@@ -24,7 +24,10 @@
#include <linux/parser.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
+
+#ifdef CONFIG_VE
#include <linux/ve.h>
+#endif
#define DEVPTS_DEFAULT_MODE 0600
/*
@@ -93,7 +96,11 @@ static struct ctl_table pty_root_table[]
static DEFINE_MUTEX(allocated_ptys_lock);
+#ifdef CONFIG_VE
+#define devpts_mnt (get_exec_env()->_devpts_mnt)
+#else
static struct vfsmount *devpts_mnt;
+#endif
struct pts_mount_opts {
int setuid;
@@ -140,7 +147,9 @@ static inline struct super_block *pts_sb
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
#endif
- return get_exec_env()->devpts_sb;
+ if (!devpts_mnt)
+ return NULL;
+ return devpts_mnt->mnt_sb;
}
#define PARSE_MOUNT 0
@@ -402,6 +411,12 @@ fail:
}
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+static int compare_init_pts_sb(struct super_block *s, void *p)
+{
+ if (devpts_mnt)
+ return devpts_mnt->mnt_sb == s;
+ return 0;
+}
/*
* devpts_mount()
@@ -436,43 +451,71 @@ static struct dentry *devpts_mount(struc
int error;
struct pts_mount_opts opts;
struct super_block *s;
- struct dentry *root;
error = parse_mount_options(data, PARSE_MOUNT, &opts);
if (error)
return ERR_PTR(error);
+#ifndef CONFIG_VE
/* Require newinstance for all user namespace mounts to ensure
* the mount options are not changed.
*/
- if (!IS_ENABLED(CONFIG_VE) &&
- (current_user_ns() != &init_user_ns) && !opts.newinstance)
+ if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
return ERR_PTR(-EINVAL);
+#endif
+#ifdef CONFIG_VE
+ /*
+ * Each container has to have own devpts superblock for isolation
+ * sake but it makes a bad joke for us: in CRIU we test if devpts
+ * device in container is the same as on the node, to figure out
+ * if @newinstance option has to be passed (simply because in
+ * vanilla kernel there is no such devpts virtualization) on
+ * the restore. Thus every time we're restoring container
+ * we pass @newinstance option even if container has been
+ * started without this option initially.
+ *
+ * To workaround this situation here is an ugly hack: first
+ * mount of devpts inside container always runs without
+ * @newinstance option providing back virtualized superblock.
+ * The next mounts inside container go in a regular way.
+ *
+ * Note @devpts_once is always set for node. And be careful
+ * about @else branch below.
+ */
+ if (!get_exec_env()->devpts_once && get_exec_env()->_devpts_mnt)
+ s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, NULL);
+ else
+#endif
if (opts.newinstance)
- root = mount_nodev(fs_type, flags, data, devpts_fill_super);
+ s = sget(fs_type, NULL, set_anon_super, flags, NULL);
else
- root = mount_ns(fs_type, flags, data, get_exec_env(), devpts_fill_super);
+ s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
+ NULL);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ if (IS_ERR(s))
+ return ERR_CAST(s);
+
+ if (!s->s_root) {
+ error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+ if (error)
+ goto out_undo_sget;
+ s->s_flags |= MS_ACTIVE;
+ }
- s = root->d_sb;
memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
error = mknod_ptmx(s);
if (error)
goto out_undo_sget;
- if (!opts.newinstance) {
- atomic_inc(&s->s_active);
- get_exec_env()->devpts_sb = s;
- }
-
- return root;
+#ifdef CONFIG_VE
+ if (!get_exec_env()->devpts_once && get_exec_env()->_devpts_mnt)
+ get_exec_env()->devpts_once = true;
+#endif
+ return dget(s->s_root);
out_undo_sget:
- dput(root);
deactivate_locked_super(s);
return ERR_PTR(error);
}
@@ -683,3 +726,20 @@ static int __init init_devpts_fs(void)
return err;
}
module_init(init_devpts_fs)
+
+#ifdef CONFIG_VE
+int ve_devpts_init(struct ve_struct *ve)
+{
+ ve->_devpts_mnt = kern_mount(&devpts_fs_type);
+ if (IS_ERR(ve->_devpts_mnt))
+ return PTR_ERR(ve->_devpts_mnt);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ve_devpts_init);
+
+void ve_devpts_fini(struct ve_struct *ve)
+{
+ kern_unmount(ve->_devpts_mnt);
+}
+EXPORT_SYMBOL_GPL(ve_devpts_fini);
+#endif /* CONFIG_VE */
Index: linux-pcs7.git/include/linux/devpts_fs.h
===================================================================
--- linux-pcs7.git.orig/include/linux/devpts_fs.h
+++ linux-pcs7.git/include/linux/devpts_fs.h
@@ -45,5 +45,11 @@ static inline void devpts_pty_kill(struc
#endif
+#ifdef CONFIG_VE
+struct ve_struct;
+
+extern int ve_devpts_init(struct ve_struct *ve);
+extern void ve_devpts_fini(struct ve_struct *ve);
+#endif
#endif /* _LINUX_DEVPTS_FS_H */
Index: linux-pcs7.git/include/linux/ve.h
===================================================================
--- linux-pcs7.git.orig/include/linux/ve.h
+++ linux-pcs7.git/include/linux/ve.h
@@ -61,7 +61,10 @@ struct ve_struct {
/* VE's root */
struct path root_path;
- struct super_block *devpts_sb;
+#ifdef CONFIG_UNIX98_PTYS
+ struct vfsmount *_devpts_mnt;
+ bool devpts_once;
+#endif
#if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc *binfmt_misc;
Index: linux-pcs7.git/kernel/ve/ve.c
===================================================================
--- linux-pcs7.git.orig/kernel/ve/ve.c
+++ linux-pcs7.git/kernel/ve/ve.c
@@ -72,6 +72,9 @@ struct ve_struct ve0 = {
.ipt_mask = VE_IP_ALL, /* everything is allowed */
#endif
.features = -1,
+#ifdef CONFIG_UNIX98_PTYS
+ .devpts_once = true,
+#endif
.fsync_enable = FSYNC_FILTERED,
.meminfo_val = VE_MEMINFO_SYSTEM,
._randomize_va_space =
@@ -499,6 +502,10 @@ int ve_start_container(struct ve_struct
if (err)
goto err_dev;
+ err = ve_devpts_init(ve);
+ if (err)
+ goto err_devpts;
+
err = ve_legacy_pty_init(ve);
if (err)
goto err_legacy_pty;
@@ -542,6 +549,8 @@ err_tty_console:
err_unix98_pty:
ve_legacy_pty_fini(ve);
err_legacy_pty:
+ ve_devpts_fini(ve);
+err_devpts:
ve_fini_devtmpfs(ve);
err_dev:
ve_stop_umh(ve);
@@ -579,6 +588,7 @@ void ve_stop_ns(struct pid_namespace *pi
ve_legacy_pty_fini(ve);
ve_mem_class_fini(ve);
+ ve_devpts_fini(ve);
ve_fini_devtmpfs(ve);
ve_stop_umh(ve);
@@ -604,11 +614,6 @@ void ve_exit_ns(struct pid_namespace *pi
* At this point all userspace tasks in container are dead.
*/
- if (ve->devpts_sb) {
- deactivate_super(ve->devpts_sb);
- ve->devpts_sb = NULL;
- }
-
down_write(&ve->op_sem);
ve_hook_iterate_fini(VE_SS_CHAIN, ve);
More information about the Devel
mailing list