[Devel] [PATCH RH8] ve/devtmpfs: lightweight virtualization
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Jul 19 19:01:34 MSK 2021
On 07/16/2021 06:15 PM, Pavel Tikhomirov wrote:
> From: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
>
> Due to changes in RH8.4 we need to rework it, actually the logic
> becomes much more simple, we mount/umount single tmpfs per ve on cgroup
> creation/removal, all actual devtmpfs mount calls only increase a
> refcount on corresponding ve's mount like with the host's devtmpfs.
>
> Original commit message:
>
> Previously, we implemented full-featured devtmpfs virtualization for
> VE: when a device is created in a VE "namespace", we send a signal to
> kdevtmpfs to create the devnode on devtmpfs mount corresponding to the
> VE. This seems to be over-complicated: all this work can be done from
> userspace, because we only have a hardcoded list of devices created
> exclusively for VE on container start. Those are tty-related stuff and
> mem devices, and we only need the latter to create devtmpfs nodes.
> Moreover, it is buggy: ve_stop_ns, which destroys VE devtmpfs mount can
> be called before a VE tty device is unregistered, resulting in a KP:
>
> https://jira.sw.ru/browse/PSBM-35077
>
> This patch therefore simplifies it. It makes the kernel only provide a
> single empty tmpfs mount per VE, which appears on an attempt to mount
> devtmpfs from inside a VE. The content of the fs is to be filled by the
> userspace on container start, which will be done in the scope of
>
> https://jira.sw.ru/browse/PSBM-35146
>
> All this patch does is provides each VE with its own empty single tmpfs
> mount, which appears on an attempt to mount "devtmpfs". It's up to the
> userspace to populate this fs on container start, all kernel requests to
> create a device node inside a VE are ignored.
>
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
> Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
>
> https://jira.sw.ru/browse/PSBM-131158
>
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
> drivers/base/devtmpfs.c | 25 +++++++++++++++++++++++++
> fs/namespace.c | 1 +
> include/linux/device.h | 2 ++
> include/linux/ve.h | 3 +++
> kernel/ve/ve.c | 6 ++++++
> 5 files changed, 37 insertions(+)
>
> diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
> index fc7b883f36c9..25efacb5ed5c 100644
> --- a/drivers/base/devtmpfs.c
> +++ b/drivers/base/devtmpfs.c
> @@ -26,6 +26,7 @@
> #include <linux/slab.h>
> #include <linux/kthread.h>
> #include <uapi/linux/mount.h>
> +#include <linux/ve.h>
> #include "base.h"
>
> static struct task_struct *thread;
> @@ -62,6 +63,13 @@ static struct dentry *public_dev_mount(struct file_system_type *fs_type, int fla
> const char *dev_name, void *data)
> {
> struct super_block *s = mnt->mnt_sb;
> +#ifdef CONFIG_VE
> + struct ve_struct *ve = get_exec_env();
> +
> + if (!ve_is_super(ve))
> + s = ve->devtmpfs_mnt->mnt_sb;
> +#endif
> +
We don't have any lock here, so why can't we get a race with ve_destroy()?
Because ve_destroy() is called on ve cgroup destruction and no processes
are in this cgroup and thus get_exec_env() can't return semi-dead ve cgroup?
> atomic_inc(&s->s_active);
> down_write(&s->s_umount);
> return dget(s->s_root);
> @@ -82,6 +90,7 @@ static struct file_system_type internal_fs_type = {
> static struct file_system_type dev_fs_type = {
> .name = "devtmpfs",
> .mount = public_dev_mount,
> + .fs_flags = FS_VIRTUALIZED | FS_USERNS_MOUNT,
I'll put FS_VE_MOUNT instead of FS_USERNS_MOUNT.
I've checked on host:
# unshare -U
# mount -t devtmpfs devtmpfs /mnt
does not work
If you have any arguments against this - please let me know.
> };
>
> #ifdef CONFIG_BLOCK
> @@ -425,6 +434,22 @@ static int devtmpfsd(void *p)
> return *err;
> }
>
> +int ve_mount_devtmpfs(struct ve_struct *ve)
> +{
> + char opts[] = "mode=0755";
> + struct vfsmount *mnt;
> +
> + mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts);
> + if (IS_ERR(mnt)) {
> + printk(KERN_ERR "CT#%s: devtmpfs: unable to create devtmpfs %ld\n",
> + ve_name(ve), PTR_ERR(mnt));
> + return PTR_ERR(mnt);
> + }
> + ve->devtmpfs_mnt = mnt;
> +
> + return 0;
> +}
> +
> /*
> * Create devtmpfs instance, driver-core devices will add their device
> * nodes here.
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 9128029f5a78..2009130cd51e 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -30,6 +30,7 @@
> #include <uapi/linux/mount.h>
> #include <linux/fs_context.h>
> #include <linux/shmem_fs.h>
> +#include <linux/mount.h>
>
> #include <linux/ve.h>
>
> diff --git a/include/linux/device.h b/include/linux/device.h
> index c1630a5dec50..b0da526490ce 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -1665,10 +1665,12 @@ extern bool kill_device(struct device *dev);
> extern int devtmpfs_create_node(struct device *dev);
> extern int devtmpfs_delete_node(struct device *dev);
> extern int devtmpfs_mount(const char *mntdir);
> +extern int ve_mount_devtmpfs(struct ve_struct *ve);
> #else
> static inline int devtmpfs_create_node(struct device *dev) { return 0; }
> static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
> static inline int devtmpfs_mount(const char *mountpoint) { return 0; }
> +static inline int ve_mount_devtmpfs(struct ve_struct *ve) { return 0; }
> #endif
>
> /* drivers/base/power/shutdown.c */
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index b17868ba86c3..d3f77467cdf6 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -22,6 +22,7 @@
> struct nsproxy;
> struct veip_struct;
> struct user_namespace;
> +struct vfsmount;
>
> struct ve_struct {
> struct cgroup_subsys_state css;
> @@ -120,6 +121,8 @@ struct ve_struct {
> */
> struct list_head per_cgroot_list;
> spinlock_t per_cgroot_list_lock;
> +
> + struct vfsmount *devtmpfs_mnt;
> };
>
> struct ve_devmnt {
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index fee86917d624..bb9b99c58992 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -31,6 +31,7 @@
> #include <linux/ctype.h>
> #include <linux/tty.h>
> #include <linux/genhd.h>
> +#include <linux/device.h>
>
> #include <uapi/linux/vzcalluser.h>
> #include <net/rtnetlink.h>
> @@ -910,6 +911,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
> if (copy_vdso(&ve->vdso_32, &vdso_image_32))
> goto err_vdso;
>
> + err = ve_mount_devtmpfs(ve);
> + if (err)
> + goto err_vdso;
> +
> ve->features = VE_FEATURES_DEF;
>
> INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
> @@ -1021,6 +1026,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
> kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
> ve_log_destroy(ve);
> ve_free_vdso(ve);
> + mntput(ve->devtmpfs_mnt);
> #if IS_ENABLED(CONFIG_BINFMT_MISC)
> kfree(ve->binfmt_misc);
> #endif
>
More information about the Devel
mailing list