[Devel] [PATCH RH8] ve/devtmpfs: lightweight virtualization

Konstantin Khorenko khorenko at virtuozzo.com
Mon Jul 19 19:01:34 MSK 2021


On 07/16/2021 06:15 PM, Pavel Tikhomirov wrote:
> From: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
>
> Due to changes in RH8.4 we need to rework it; the logic actually
> becomes much simpler: we mount/umount a single tmpfs per VE on cgroup
> creation/removal, and all actual devtmpfs mount calls only increase the
> refcount on the corresponding VE's mount, just as with the host's
> devtmpfs.
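Just to spell the expected semantics out: if every devtmpfs mount in a CT
only pins the same per-VE tmpfs, then two mounts done inside the CT should
end up on the same superblock. A rough userspace check along these lines
(mountpoints are made up, needs to be run as root inside the CT):

#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>

int main(void)
{
	struct stat a, b;

	/* two independent devtmpfs mounts inside the CT */
	if (mount("devtmpfs", "/mnt/a", "devtmpfs", 0, "mode=0755") ||
	    mount("devtmpfs", "/mnt/b", "devtmpfs", 0, "mode=0755")) {
		perror("mount");
		return 1;
	}

	/* both roots should report the same device, i.e. one superblock */
	if (stat("/mnt/a", &a) || stat("/mnt/b", &b)) {
		perror("stat");
		return 1;
	}
	printf("%s\n", a.st_dev == b.st_dev ?
	       "same devtmpfs instance" : "different instances?!");

	umount("/mnt/b");
	umount("/mnt/a");
	return 0;
}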
>
> Original commit message:
>
> Previously, we implemented full-featured devtmpfs virtualization for
> VE: when a device is created in a VE "namespace", we send a signal to
> kdevtmpfs to create the devnode on the devtmpfs mount corresponding to
> the VE. This seems to be over-complicated: all this work can be done
> from userspace, because we only have a hardcoded list of devices created
> exclusively for a VE on container start. Those are tty-related stuff and
> mem devices, and we only need the latter to create devtmpfs nodes.
> Moreover, it is buggy: ve_stop_ns, which destroys the VE devtmpfs mount,
> can be called before a VE tty device is unregistered, resulting in a KP:
>
> https://jira.sw.ru/browse/PSBM-35077
>
> This patch therefore simplifies it. It makes the kernel only provide a
> single empty tmpfs mount per VE, which appears on an attempt to mount
> devtmpfs from inside a VE. The content of the fs is to be filled by
> userspace on container start, which will be done in the scope of
>
> https://jira.sw.ru/browse/PSBM-35146
>
> All this patch does is provide each VE with its own single empty tmpfs
> mount, which appears on an attempt to mount "devtmpfs". It's up to
> userspace to populate this fs on container start; all kernel requests to
> create a device node inside a VE are ignored.
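For reference, the userspace side (to be done under PSBM-35146) then boils
down to mounting the per-CT devtmpfs and mknod'ing the needed nodes by
hand. A rough sketch; the device list below is illustrative only:

#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

/* mem devices the CT needs; majors/minors as on a usual Linux host */
static const struct { const char *path; unsigned maj, min; } devs[] = {
	{ "/dev/null",    1, 3 },
	{ "/dev/zero",    1, 5 },
	{ "/dev/full",    1, 7 },
	{ "/dev/random",  1, 8 },
	{ "/dev/urandom", 1, 9 },
};

int main(void)
{
	unsigned i;

	/* the kernel now only hands out an empty tmpfs here ... */
	if (mount("devtmpfs", "/dev", "devtmpfs", 0, "mode=0755")) {
		perror("mount /dev");
		return 1;
	}

	/* ... so the nodes have to be created from userspace */
	for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++)
		if (mknod(devs[i].path, S_IFCHR | 0666,
			  makedev(devs[i].maj, devs[i].min)))
			perror(devs[i].path);

	return 0;
}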
>
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
> Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
>
> https://jira.sw.ru/browse/PSBM-131158
>
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
>  drivers/base/devtmpfs.c | 25 +++++++++++++++++++++++++
>  fs/namespace.c          |  1 +
>  include/linux/device.h  |  2 ++
>  include/linux/ve.h      |  3 +++
>  kernel/ve/ve.c          |  6 ++++++
>  5 files changed, 37 insertions(+)
>
> diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
> index fc7b883f36c9..25efacb5ed5c 100644
> --- a/drivers/base/devtmpfs.c
> +++ b/drivers/base/devtmpfs.c
> @@ -26,6 +26,7 @@
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
>  #include <uapi/linux/mount.h>
> +#include <linux/ve.h>
>  #include "base.h"
>
>  static struct task_struct *thread;
> @@ -62,6 +63,13 @@ static struct dentry *public_dev_mount(struct file_system_type *fs_type, int fla
>  		      const char *dev_name, void *data)
>  {
>  	struct super_block *s = mnt->mnt_sb;
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = get_exec_env();
> +
> +	if (!ve_is_super(ve))
> +		s = ve->devtmpfs_mnt->mnt_sb;
> +#endif
> +

We don't take any lock here, so why can't this race with ve_destroy()?
Is it because ve_destroy() is called on ve cgroup destruction, when no
processes are left in the cgroup, so get_exec_env() can never return a
half-dead ve?

>  	atomic_inc(&s->s_active);
>  	down_write(&s->s_umount);
>  	return dget(s->s_root);
> @@ -82,6 +90,7 @@ static struct file_system_type internal_fs_type = {
>  static struct file_system_type dev_fs_type = {
>  	.name = "devtmpfs",
>  	.mount = public_dev_mount,
> +	.fs_flags = FS_VIRTUALIZED | FS_USERNS_MOUNT,

I'll put FS_VE_MOUNT here instead of FS_USERNS_MOUNT.

I've checked on the host:
  # unshare -U
  # mount -t devtmpfs devtmpfs /mnt
  does not work

If you have any arguments against this, please let me know.
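FWIW, roughly the same check from C. I additionally unshare a mount
namespace owned by the new user namespace, so that a failure here comes
from the filesystem-level FS_USERNS_MOUNT check rather than from mount
namespace ownership:

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <sys/mount.h>

int main(void)
{
	/* new user ns + mount ns owned by it, as an unprivileged user */
	if (unshare(CLONE_NEWUSER | CLONE_NEWNS)) {
		perror("unshare");
		return 1;
	}

	/* without FS_USERNS_MOUNT this is expected to fail */
	if (mount("devtmpfs", "/mnt", "devtmpfs", 0, NULL))
		perror("mount devtmpfs");
	else
		printf("devtmpfs mount succeeded in a user namespace\n");

	return 0;
}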

>  };
>
>  #ifdef CONFIG_BLOCK
> @@ -425,6 +434,22 @@ static int devtmpfsd(void *p)
>  	return *err;
>  }
>
> +int ve_mount_devtmpfs(struct ve_struct *ve)
> +{
> +	char opts[] = "mode=0755";
> +	struct vfsmount *mnt;
> +
> +	mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts);
> +	if (IS_ERR(mnt)) {
> +		printk(KERN_ERR "CT#%s: devtmpfs: unable to create devtmpfs %ld\n",
> +		       ve_name(ve), PTR_ERR(mnt));
> +		return PTR_ERR(mnt);
> +	}
> +	ve->devtmpfs_mnt = mnt;
> +
> +	return 0;
> +}
> +
>  /*
>   * Create devtmpfs instance, driver-core devices will add their device
>   * nodes here.
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 9128029f5a78..2009130cd51e 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -30,6 +30,7 @@
>  #include <uapi/linux/mount.h>
>  #include <linux/fs_context.h>
>  #include <linux/shmem_fs.h>
> +#include <linux/mount.h>
>
>  #include <linux/ve.h>
>
> diff --git a/include/linux/device.h b/include/linux/device.h
> index c1630a5dec50..b0da526490ce 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -1665,10 +1665,12 @@ extern bool kill_device(struct device *dev);
>  extern int devtmpfs_create_node(struct device *dev);
>  extern int devtmpfs_delete_node(struct device *dev);
>  extern int devtmpfs_mount(const char *mntdir);
> +extern int ve_mount_devtmpfs(struct ve_struct *ve);
>  #else
>  static inline int devtmpfs_create_node(struct device *dev) { return 0; }
>  static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
>  static inline int devtmpfs_mount(const char *mountpoint) { return 0; }
> +static inline int ve_mount_devtmpfs(struct ve_struct *ve) { return 0; }
>  #endif
>
>  /* drivers/base/power/shutdown.c */
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index b17868ba86c3..d3f77467cdf6 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -22,6 +22,7 @@
>  struct nsproxy;
>  struct veip_struct;
>  struct user_namespace;
> +struct vfsmount;
>
>  struct ve_struct {
>  	struct cgroup_subsys_state	css;
> @@ -120,6 +121,8 @@ struct ve_struct {
>  	 */
>  	struct list_head	per_cgroot_list;
>  	spinlock_t		per_cgroot_list_lock;
> +
> +	struct vfsmount		*devtmpfs_mnt;
>  };
>
>  struct ve_devmnt {
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index fee86917d624..bb9b99c58992 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -31,6 +31,7 @@
>  #include <linux/ctype.h>
>  #include <linux/tty.h>
>  #include <linux/genhd.h>
> +#include <linux/device.h>
>
>  #include <uapi/linux/vzcalluser.h>
>  #include <net/rtnetlink.h>
> @@ -910,6 +911,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
>  	if (copy_vdso(&ve->vdso_32, &vdso_image_32))
>  		goto err_vdso;
>
> +	err = ve_mount_devtmpfs(ve);
> +	if (err)
> +		goto err_vdso;
> +
>  	ve->features = VE_FEATURES_DEF;
>
>  	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
> @@ -1021,6 +1026,7 @@ static void ve_destroy(struct cgroup_subsys_state *css)
>  	kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
>  	ve_log_destroy(ve);
>  	ve_free_vdso(ve);
> +	mntput(ve->devtmpfs_mnt);
>  #if IS_ENABLED(CONFIG_BINFMT_MISC)
>  	kfree(ve->binfmt_misc);
>  #endif
>

