[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmouts properly

Tue Dec 5 01:28:12 MSK 2017

On Sat, Dec 02, 2017 at 10:35:02AM +0300, Pavel Tikhomirov wrote:
> 
> 
> On 12/02/2017 05:47 AM, Andrew Vagin wrote:
> > On Fri, Nov 24, 2017 at 05:22:20PM +0300, Pavel Tikhomirov wrote:
> > > dump of VZ7 ct fails, if we have overmounted tmpfs inside:
> > > 
> > > [root at silo ~]# prlctl enter su-test-2
> > > entered into CT
> > > CT-829e7b28 /# mkdir /mnt/overmntedtmp
> > > CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
> > > CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
> > > CT-829e7b28 /# logout
> > > 
> > > [root at silo ~]# prlctl suspend su-test-2
> > > Suspending the CT...
> > > Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
> > > (01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
> > > (01.662528) Error (criu/util.c:709): exited, status=1
> > > (01.664329) Error (criu/util.c:709): exited, status=1
> > > (01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
> > > Failed to checkpoint the Container
> > > All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
> > > Checkpointing failed
> > > )
> > > 
> > > Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
> > > unavailable. So we copy the mount namespace in such a case and unmount
> > > overmounts to access what we want to dump.
> > > 
> > > Actual usecase here is dumping CT with active mariadb and ssh
> > > connection. Together they happen to create such overmount. As by default
> > > systemd creates a separate mount namespace for mysql and also mounts
> > > tmpfs to /run/user in it, and when ssh(root) is connected - systemd also
> > > mounts tmpfs in container root mount namespace to /run/user/0 for user
> > > files. As /run is slave mount /run/user/0 also propagates to mysql's
> > > mount namespace and initially becomes overmounted by /run/user.
> > > 
> > > https://jira.sw.ru/browse/PSBM-57362
> > > 
> > > remove __maybe_unused for mnt_is_overmounted and umount_overmounts
> > > 
> > > Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> > > ---
> > >   criu/filesystems.c   |   2 +-
> > >   criu/include/mount.h |   2 +
> > >   criu/include/util.h  |   1 +
> > >   criu/mount.c         | 164 +++++++++++++++++++++++++++++----------------------
> > >   4 files changed, 99 insertions(+), 70 deletions(-)
> > > 
> > > diff --git a/criu/filesystems.c b/criu/filesystems.c
> > > index 5a88788d7..fa6d96ecf 100644
> > > --- a/criu/filesystems.c
> > > +++ b/criu/filesystems.c
> > > @@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
> > > 
> > >   	fd = open_mountpoint(pm);
> > >   	if (fd < 0)
> > > -		return fd;
> > > +		return MNT_UNREACHABLE;
> > > 
> > >   	/* if fd happens to be 0 here, we need to move it to something
> > >   	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
> > > diff --git a/criu/include/mount.h b/criu/include/mount.h
> > > index e60dd348f..ed771ffac 100644
> > > --- a/criu/include/mount.h
> > > +++ b/criu/include/mount.h
> > > @@ -12,6 +12,8 @@ struct ns_id;
> > > 
> > >   #define MOUNT_INVALID_DEV	(0)
> > > 
> > > +#define MNT_UNREACHABLE INT_MIN
> > > +
> > >   struct mount_info {
> > >   	int			mnt_id;
> > >   	int			parent_mnt_id;
> > > diff --git a/criu/include/util.h b/criu/include/util.h
> > > index 20684ccf9..d1b779d2b 100644
> > > --- a/criu/include/util.h
> > > +++ b/criu/include/util.h
> > > @@ -190,6 +190,7 @@ extern int is_empty_dir(int dirfd);
> > >    * path. Since fd is an integer, we can easily estimate one :)
> > >    */
> > >   #define PSFDS	(sizeof("/proc/self/fd/2147483647"))
> > > +#define PPFDS	(sizeof("/proc/2147483647/fd/2147483647"))
> > > 
> > >   extern int read_fd_link(int lfd, char *buf, size_t size);
> > >   extern pid_t get_self_real_pid(void);
> > > diff --git a/criu/mount.c b/criu/mount.c
> > > index e2e7c7116..d47006b58 100644
> > > --- a/criu/mount.c
> > > +++ b/criu/mount.c
> > > @@ -1106,7 +1106,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
> > >    * root of our mount namespace as it is covered by other mount.
> > >    * mnt_is_overmounted() checks if mount is not visible.
> > >    */
> > > -static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
> > > +static bool mnt_is_overmounted(struct mount_info *mi)
> > >   {
> > >   	struct mount_info *t, *c, *m = mi;
> > > 
> > > @@ -1223,7 +1223,7 @@ static int __umount_overmounts(struct mount_info *m)
> > >   }
> > > 
> > >   /* Make our mountpoint fully visible */
> > > -static __maybe_unused int umount_overmounts(struct mount_info *m)
> > > +static int umount_overmounts(struct mount_info *m)
> > >   {
> > >   	if (__umount_overmounts(m))
> > >   		return -1;
> > > @@ -1234,90 +1234,116 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
> > >   	return 0;
> > >   }
> > > 
> > > -#define MNT_UNREACHABLE INT_MIN
> > > -int open_mountpoint(struct mount_info *pm)
> > > +enum {
> > > +	FUTEX_FD_OK,
> > > +	FUTEX_FD_ERR,
> > > +	FUTEX_FD_SHIFT,
> > > +};
> > > +
> > > +/* Open mountpoint clean from children and overmounts */
> > > +int ns_open_mountpoint(struct mount_info *pm, futex_t *futex)
> > >   {
> > > -	struct mount_info *c;
> > > -	int fd = -1, ns_old = -1;
> > >   	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
> > >   	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
> > > -	char *mnt_path = mnt_path_tmp;
> > > -	int cwd_fd;
> > > -
> > > -	/*
> > > -	 * If a mount doesn't have children, we can open a mount point,
> > > -	 * otherwise we need to create a "private" copy.
> > > -	 */
> > > -	if (list_empty(&pm->children))
> > > -		return __open_mountpoint(pm, -1);
> > > +	char *mnt_path = NULL;
> > > +	int fd;
> > > 
> > > -	pr_info("Something is mounted on top of %s\n", pm->mountpoint);
> > > +	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, NULL) < 0)
> > > +		goto err;
> > > +	/* Need user namespace so that mounts will be unmounable */
> > 
> > I don't understand what you mean here. Pls, explain why do we need to
> > change a userns
> 
> # terminal 1:
> # create new userns and create new mntns owned by the former
> [root at cat snorch]# unshare -rUm
> # mount something in new mntns
> [root at cat snorch]# mount -t tmpfs tmpfs /mnt
> [root at cat snorch]# sleep 1000
> 
> # terminal 2:
> ps auxf | grep sleep
> root      3132 ...
> # enter only new mntns, userns is init_user_ns now
> [root at cat snorch]# nsenter -t 3132 -m
> # unshare mntns (to have a private copy)
> [root at cat /]# unshare -m
> # does not actually changes anything, but to make sure that unmounts not
> propagate to initial mount namespace
> [root at cat /]# mount --make-rprivate /
> # try to umount /mnt fails as mount is MNT_LOCKED
> [root at cat /]# strace umount /mnt
> ...
> umount2("/mnt", 0)                      = -1 EINVAL (Invalid argument)
> 
> That is because:
> sys_umount() {
> 	retval = -EINVAL;
> 	if (mnt->mnt.mnt_flags & MNT_LOCKED)
> 		goto dput_and_out;
> }
> 
> Mount is MNT_LOCKED because we are CL_UNPRIVILEGED:
> clone_mnt()
> {
>         /* Don't allow unprivileged users to reveal what is under a mount */
>         if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
>                 mnt->mnt.mnt_flags |= MNT_LOCKED;
> }
> 
> We are CL_UNPRIVILEGED as when unshare -m we were in non-owner userns:
> copy_mnt_ns() {
>         if (user_ns != ns->user_ns)

I think we can try to send a patch to lkml, which changes this line on
this one:

if (user_ns != ns->user_ns && user_ns != &init_user_ns)

by other words, CL_UNPRIVILEGED isn't set for &init_user_ns

>                 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
> }
> 
> So, finally, if we want to make unmounts there we need to unshare -m from
> owner userns of mntns which we want to copy.
> 
> > 
> > > +	if (pm->nsid->user_ns &&
> > > +	    switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
> > > +		goto err;
> > > 
> > > -	list_for_each_entry(c, &pm->children, siblings) {
> > > -		if (!strcmp(c->mountpoint, pm->mountpoint)) {
> > > -			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
> > > -			return MNT_UNREACHABLE;
> > > +	if (!mnt_is_overmounted(pm)) {
> > > +		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> > > +		if (!mnt_path)
> > > +			goto err;
> > > +	} else {
> > > +		/* Create helper mount namespace so we can unmount in it */
> > > +		if (unshare(CLONE_NEWNS)) {
> > > +			pr_perror("Unable to clone a mount namespace");
> > > +			goto err;
> > >   		}
> > > -	}
> > > +		if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
> > > +			goto err;
> > > 
> > > -	/*
> > > -	 * To create a "private" copy, the target mount is bind-mounted
> > > -	 * in a temporary place w/o MS_REC (non-recursively).
> > > -	 * A mount point can't be bind-mounted in criu's namespace, it will be
> > > -	 * mounted in a target namespace. The sequence of actions is
> > > -	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
> > > -	 */
> > > +		if (umount_overmounts(pm))
> > > +			goto err;
> > > 
> > > -	cwd_fd = open(".", O_DIRECTORY);
> > > -	if (cwd_fd < 0) {
> > > -		pr_perror("Unable to open cwd");
> > > -		return -1;
> > > +		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> > > +		if (!mnt_path)
> > > +			goto err;
> > >   	}
> > > 
> > > -	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
> > > -		goto out;
> > > +	fd = open_detach_mount(mnt_path);
> > > +	if (fd < 0)
> > > +		goto err;
> > > 
> > > -	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> > > -	if (mnt_path == NULL) {
> > > -		/*
> > > -		 * We probably can't create a temporary direcotry,
> > > -		 * so we can try to clone the mount namespace, open
> > > -		 * the required mount and destroy this mount namespace
> > > -		 * by calling restore_ns() below in this function.
> > > -		 */
> > > -		if (unshare(CLONE_NEWNS)) {
> > > -			pr_perror("Unable to clone a mount namespace");
> > > -			goto out;
> > > -		}
> > > +	/* Report fd to parent and wait until he he is done */
> > > +	futex_set_and_wake(futex, fd + FUTEX_FD_SHIFT);
> > > +	futex_wait_until(futex, FUTEX_FD_OK);
> > > +	close(fd);
> > > +	return 0;
> > > +err:
> > > +	futex_set_and_wake(futex, FUTEX_FD_ERR);
> > > +	return -1;
> > > +}
> > > 
> > > -		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
> > > -		if (fd < 0)
> > > -			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
> > > -	} else
> > > -		fd = open_detach_mount(mnt_path);
> > > -	if (fd < 0)
> > > -		goto out;
> > > +int open_mountpoint(struct mount_info *pm)
> > > +{
> > > +	int fd = -1, pid = -1;
> > > +	char mnt_path[PPFDS];
> > > +	futex_t *futex = NULL;
> > > 
> > > -	if (restore_ns(ns_old, &mnt_ns_desc)) {
> > > -		ns_old = -1;
> > > -		goto out;
> > > +	/* No overmounts and children - the entire mount is visible */
> > > +	if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
> > > +		return __open_mountpoint(pm, -1);
> > > +
> > > +	pr_info("Mount is not fully visible %s\n", pm->mountpoint);
> > > +
> > > +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> > 
> > It is ugly. You can try to create a new process with CLONE_VM, but I
> > think we should avaoid of forking a new process.
> 
> Entering unprivileged userns is irreversible operation so I need a helper
> process here. As a bonus we don't need to revert mntns change. It has short
> lifetime so I don't think it will influence anybody.
> 
> Will add CLONE_VM - it seem to be a good thing to make the process eat less
> memory.

It is about performance rather than memory. We have to be sure that we
fork a process only we don't have another way.

> 
> > 
> > > +	if (futex == MAP_FAILED) {
> > > +		pr_perror("Failed to mmap futex");
> > > +		return 1;
> > >   	}
> > > -	if (fchdir(cwd_fd)) {
> > > -		pr_perror("Unable to restore cwd");
> > > -		close(cwd_fd);
> > > -		close(fd);
> > > -		return -1;
> > > +	futex_init(futex);
> > > +
> > > +	pid = fork();
> > 
> > Why do we need to create a new process? Can we avoid doing this?
> > 
> > > +	if (pid == -1) {
> > > +		pr_perror("Failed to fork a helper");
> > > +		goto err;
> > > +	} else if (pid == 0) {
> > 
> > pls add a detailed comment why do we need to create a new process
> 
> Will add.
> 
> > 
> > > +		exit(ns_open_mountpoint(pm, futex));
> > >   	}
> > > -	close(cwd_fd);
> > > 
> > > +	/* Wait mount fd from child */
> > > +	futex_wait_while(futex, FUTEX_FD_OK);
> > 
> > you will wait forever, if a child process segfaulted
> 
> It seem we have to have some timeout here. Also we can try to wait the child
> after timeout, and if the child is alright we can futex_wait again, and
> cycle.

or you can unset SA_RESTART for SIGCHLD and then futex() will return
-EINTR if a child execited.

> 
> > 
> > > +	fd = futex_get(futex) - FUTEX_FD_SHIFT;
> > > +	if (fd == -1)
> > > +		goto err;
> > > +
> > > +	/* Open child's fd */
> > > +	snprintf(mnt_path, PPFDS,"/proc/%d/fd/%d", pid, fd);
> > > +	fd = open(mnt_path, O_RDONLY | O_DIRECTORY);
> > > +	if (fd == -1) {
> > > +		pr_perror("Failed to open child fd");
> > > +		goto err;
> > > +	}
> > > +
> > > +	/* Release the child */
> > > +	futex_set_and_wake(futex, 0);
> > > +	waitpid(pid, NULL, 0);
> > 
> > you have to check an exit code of waitpid and a status of the child
> > process
> 
> Will do. We need it especially for a segfault case.

You have to chech exit codes of all children. It is like brushing teeth
twice a day.

> 
> > 
> > > +
> > > +	munmap(futex, sizeof(*futex));
> > >   	return __open_mountpoint(pm, fd);
> > > -out:
> > > -	if (ns_old >= 0)
> > > -		 restore_ns(ns_old, &mnt_ns_desc);
> > > -	close_safe(&fd);
> > > -	if (fchdir(cwd_fd))
> > > -		pr_perror("Unable to restore cwd");
> > > -	close(cwd_fd);
> > > +err:
> > > +	if (pid > 0) {
> > > +		kill(pid, SIGKILL);
> > > +		waitpid(pid, NULL, 0);
> > > +	}
> > > +	if (futex)
> > > +		munmap(futex, sizeof(*futex));
> > >   	return -1;
> > >   }
> > > 
> > > --
> > > 2.13.6
> > >