[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmouts properly

Sat Dec 2 05:47:50 MSK 2017

On Fri, Nov 24, 2017 at 05:22:20PM +0300, Pavel Tikhomirov wrote:
> dump of VZ7 ct fails, if we have overmounted tmpfs inside:
> 
> [root at silo ~]# prlctl enter su-test-2
> entered into CT
> CT-829e7b28 /# mkdir /mnt/overmntedtmp
> CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
> CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
> CT-829e7b28 /# logout
> 
> [root at silo ~]# prlctl suspend su-test-2
> Suspending the CT...
> Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
> (01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
> (01.662528) Error (criu/util.c:709): exited, status=1
> (01.664329) Error (criu/util.c:709): exited, status=1
> (01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
> Failed to checkpoint the Container
> All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
> Checkpointing failed
> )
> 
> Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
> unavailable. So we copy the mount namespace in such a case and unmount
> overmounts to access what we want to dump.
> 
> Actual usecase here is dumping CT with active mariadb and ssh
> connection. Together they happen to create such overmount. As by default
> systemd creates a separate mount namespace for mysql and also mounts
> tmpfs to /run/user in it, and when ssh(root) is connected - systemd also
> mounts tmpfs in container root mount namespace to /run/user/0 for user
> files. As /run is slave mount /run/user/0 also propagates to mysql's
> mount namespace and initially becomes overmounted by /run/user.
> 
> https://jira.sw.ru/browse/PSBM-57362
> 
> remove __maybe_unused for mnt_is_overmounted and umount_overmounts
> 
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
>  criu/filesystems.c   |   2 +-
>  criu/include/mount.h |   2 +
>  criu/include/util.h  |   1 +
>  criu/mount.c         | 164 +++++++++++++++++++++++++++++----------------------
>  4 files changed, 99 insertions(+), 70 deletions(-)
> 
> diff --git a/criu/filesystems.c b/criu/filesystems.c
> index 5a88788d7..fa6d96ecf 100644
> --- a/criu/filesystems.c
> +++ b/criu/filesystems.c
> @@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
>  
>  	fd = open_mountpoint(pm);
>  	if (fd < 0)
> -		return fd;
> +		return MNT_UNREACHABLE;
>  
>  	/* if fd happens to be 0 here, we need to move it to something
>  	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
> diff --git a/criu/include/mount.h b/criu/include/mount.h
> index e60dd348f..ed771ffac 100644
> --- a/criu/include/mount.h
> +++ b/criu/include/mount.h
> @@ -12,6 +12,8 @@ struct ns_id;
>  
>  #define MOUNT_INVALID_DEV	(0)
>  
> +#define MNT_UNREACHABLE INT_MIN
> +
>  struct mount_info {
>  	int			mnt_id;
>  	int			parent_mnt_id;
> diff --git a/criu/include/util.h b/criu/include/util.h
> index 20684ccf9..d1b779d2b 100644
> --- a/criu/include/util.h
> +++ b/criu/include/util.h
> @@ -190,6 +190,7 @@ extern int is_empty_dir(int dirfd);
>   * path. Since fd is an integer, we can easily estimate one :)
>   */
>  #define PSFDS	(sizeof("/proc/self/fd/2147483647"))
> +#define PPFDS	(sizeof("/proc/2147483647/fd/2147483647"))
>  
>  extern int read_fd_link(int lfd, char *buf, size_t size);
>  extern pid_t get_self_real_pid(void);
> diff --git a/criu/mount.c b/criu/mount.c
> index e2e7c7116..d47006b58 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -1106,7 +1106,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
>   * root of our mount namespace as it is covered by other mount.
>   * mnt_is_overmounted() checks if mount is not visible.
>   */
> -static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
> +static bool mnt_is_overmounted(struct mount_info *mi)
>  {
>  	struct mount_info *t, *c, *m = mi;
>  
> @@ -1223,7 +1223,7 @@ static int __umount_overmounts(struct mount_info *m)
>  }
>  
>  /* Make our mountpoint fully visible */
> -static __maybe_unused int umount_overmounts(struct mount_info *m)
> +static int umount_overmounts(struct mount_info *m)
>  {
>  	if (__umount_overmounts(m))
>  		return -1;
> @@ -1234,90 +1234,116 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
>  	return 0;
>  }
>  
> -#define MNT_UNREACHABLE INT_MIN
> -int open_mountpoint(struct mount_info *pm)
> +enum {
> +	FUTEX_FD_OK,
> +	FUTEX_FD_ERR,
> +	FUTEX_FD_SHIFT,
> +};
> +
> +/* Open mountpoint clean from children and overmounts */
> +int ns_open_mountpoint(struct mount_info *pm, futex_t *futex)
>  {
> -	struct mount_info *c;
> -	int fd = -1, ns_old = -1;
>  	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
>  	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
> -	char *mnt_path = mnt_path_tmp;
> -	int cwd_fd;
> -
> -	/*
> -	 * If a mount doesn't have children, we can open a mount point,
> -	 * otherwise we need to create a "private" copy.
> -	 */
> -	if (list_empty(&pm->children))
> -		return __open_mountpoint(pm, -1);
> +	char *mnt_path = NULL;
> +	int fd;
>  
> -	pr_info("Something is mounted on top of %s\n", pm->mountpoint);
> +	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, NULL) < 0)
> +		goto err;
> +	/* Need user namespace so that mounts will be unmounable */

I don't understand what you mean here. Pls, explain why do we need to
change a userns

> +	if (pm->nsid->user_ns &&
> +	    switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
> +		goto err;
>  
> -	list_for_each_entry(c, &pm->children, siblings) {
> -		if (!strcmp(c->mountpoint, pm->mountpoint)) {
> -			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
> -			return MNT_UNREACHABLE;
> +	if (!mnt_is_overmounted(pm)) {
> +		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> +		if (!mnt_path)
> +			goto err;
> +	} else {
> +		/* Create helper mount namespace so we can unmount in it */
> +		if (unshare(CLONE_NEWNS)) {
> +			pr_perror("Unable to clone a mount namespace");
> +			goto err;
>  		}
> -	}
> +		if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
> +			goto err;
>  
> -	/*
> -	 * To create a "private" copy, the target mount is bind-mounted
> -	 * in a temporary place w/o MS_REC (non-recursively).
> -	 * A mount point can't be bind-mounted in criu's namespace, it will be
> -	 * mounted in a target namespace. The sequence of actions is
> -	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
> -	 */
> +		if (umount_overmounts(pm))
> +			goto err;
>  
> -	cwd_fd = open(".", O_DIRECTORY);
> -	if (cwd_fd < 0) {
> -		pr_perror("Unable to open cwd");
> -		return -1;
> +		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> +		if (!mnt_path)
> +			goto err;
>  	}
>  
> -	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
> -		goto out;
> +	fd = open_detach_mount(mnt_path);
> +	if (fd < 0)
> +		goto err;
>  
> -	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> -	if (mnt_path == NULL) {
> -		/*
> -		 * We probably can't create a temporary direcotry,
> -		 * so we can try to clone the mount namespace, open
> -		 * the required mount and destroy this mount namespace
> -		 * by calling restore_ns() below in this function.
> -		 */
> -		if (unshare(CLONE_NEWNS)) {
> -			pr_perror("Unable to clone a mount namespace");
> -			goto out;
> -		}
> +	/* Report fd to parent and wait until he he is done */
> +	futex_set_and_wake(futex, fd + FUTEX_FD_SHIFT);
> +	futex_wait_until(futex, FUTEX_FD_OK);
> +	close(fd);
> +	return 0;
> +err:
> +	futex_set_and_wake(futex, FUTEX_FD_ERR);
> +	return -1;
> +}
>  
> -		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
> -		if (fd < 0)
> -			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
> -	} else
> -		fd = open_detach_mount(mnt_path);
> -	if (fd < 0)
> -		goto out;
> +int open_mountpoint(struct mount_info *pm)
> +{
> +	int fd = -1, pid = -1;
> +	char mnt_path[PPFDS];
> +	futex_t *futex = NULL;
>  
> -	if (restore_ns(ns_old, &mnt_ns_desc)) {
> -		ns_old = -1;
> -		goto out;
> +	/* No overmounts and children - the entire mount is visible */
> +	if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
> +		return __open_mountpoint(pm, -1);
> +
> +	pr_info("Mount is not fully visible %s\n", pm->mountpoint);
> +
> +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);

It is ugly. You can try to create a new process with CLONE_VM, but I
think we should avaoid of forking a new process.

> +	if (futex == MAP_FAILED) {
> +		pr_perror("Failed to mmap futex");
> +		return 1;
>  	}
> -	if (fchdir(cwd_fd)) {
> -		pr_perror("Unable to restore cwd");
> -		close(cwd_fd);
> -		close(fd);
> -		return -1;
> +	futex_init(futex);
> +
> +	pid = fork();

Why do we need to create a new process? Can we avoid doing this?

> +	if (pid == -1) {
> +		pr_perror("Failed to fork a helper");
> +		goto err;
> +	} else if (pid == 0) {

pls add a detailed comment why do we need to create a new process

> +		exit(ns_open_mountpoint(pm, futex));
>  	}
> -	close(cwd_fd);
>  
> +	/* Wait mount fd from child */
> +	futex_wait_while(futex, FUTEX_FD_OK);

you will wait forever, if a child process segfaulted

> +	fd = futex_get(futex) - FUTEX_FD_SHIFT;
> +	if (fd == -1)
> +		goto err;
> +
> +	/* Open child's fd */
> +	snprintf(mnt_path, PPFDS,"/proc/%d/fd/%d", pid, fd);
> +	fd = open(mnt_path, O_RDONLY | O_DIRECTORY);
> +	if (fd == -1) {
> +		pr_perror("Failed to open child fd");
> +		goto err;
> +	}
> +
> +	/* Release the child */
> +	futex_set_and_wake(futex, 0);
> +	waitpid(pid, NULL, 0);

you have to check an exit code of waitpid and a status of the child
process

> +
> +	munmap(futex, sizeof(*futex));
>  	return __open_mountpoint(pm, fd);
> -out:
> -	if (ns_old >= 0)
> -		 restore_ns(ns_old, &mnt_ns_desc);
> -	close_safe(&fd);
> -	if (fchdir(cwd_fd))
> -		pr_perror("Unable to restore cwd");
> -	close(cwd_fd);
> +err:
> +	if (pid > 0) {
> +		kill(pid, SIGKILL);
> +		waitpid(pid, NULL, 0);
> +	}
> +	if (futex)
> +		munmap(futex, sizeof(*futex));
>  	return -1;
>  }
>  
> -- 
> 2.13.6
>