[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmouts properly

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Sat Dec 2 10:35:02 MSK 2017



On 12/02/2017 05:47 AM, Andrew Vagin wrote:
> On Fri, Nov 24, 2017 at 05:22:20PM +0300, Pavel Tikhomirov wrote:
>> dump of VZ7 ct fails, if we have overmounted tmpfs inside:
>>
>> [root at silo ~]# prlctl enter su-test-2
>> entered into CT
>> CT-829e7b28 /# mkdir /mnt/overmntedtmp
>> CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
>> CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
>> CT-829e7b28 /# logout
>>
>> [root at silo ~]# prlctl suspend su-test-2
>> Suspending the CT...
>> Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
>> (01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
>> (01.662528) Error (criu/util.c:709): exited, status=1
>> (01.664329) Error (criu/util.c:709): exited, status=1
>> (01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
>> Failed to checkpoint the Container
>> All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
>> Checkpointing failed
>> )
>>
>> Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
>> unavailable. So we copy the mount namespace in such a case and unmount
>> overmounts to access what we want to dump.
>>
>> Actual usecase here is dumping CT with active mariadb and ssh
>> connection. Together they happen to create such overmount. As by default
>> systemd creates a separate mount namespace for mysql and also mounts
>> tmpfs to /run/user in it, and when ssh(root) is connected - systemd also
>> mounts tmpfs in container root mount namespace to /run/user/0 for user
>> files. As /run is slave mount /run/user/0 also propagates to mysql's
>> mount namespace and initially becomes overmounted by /run/user.
>>
>> https://jira.sw.ru/browse/PSBM-57362
>>
>> remove __maybe_unused for mnt_is_overmounted and umount_overmounts
>>
>> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
>> ---
>>   criu/filesystems.c   |   2 +-
>>   criu/include/mount.h |   2 +
>>   criu/include/util.h  |   1 +
>>   criu/mount.c         | 164 +++++++++++++++++++++++++++++----------------------
>>   4 files changed, 99 insertions(+), 70 deletions(-)
>>
>> diff --git a/criu/filesystems.c b/criu/filesystems.c
>> index 5a88788d7..fa6d96ecf 100644
>> --- a/criu/filesystems.c
>> +++ b/criu/filesystems.c
>> @@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
>>
>>   	fd = open_mountpoint(pm);
>>   	if (fd < 0)
>> -		return fd;
>> +		return MNT_UNREACHABLE;
>>
>>   	/* if fd happens to be 0 here, we need to move it to something
>>   	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
>> diff --git a/criu/include/mount.h b/criu/include/mount.h
>> index e60dd348f..ed771ffac 100644
>> --- a/criu/include/mount.h
>> +++ b/criu/include/mount.h
>> @@ -12,6 +12,8 @@ struct ns_id;
>>
>>   #define MOUNT_INVALID_DEV	(0)
>>
>> +#define MNT_UNREACHABLE INT_MIN
>> +
>>   struct mount_info {
>>   	int			mnt_id;
>>   	int			parent_mnt_id;
>> diff --git a/criu/include/util.h b/criu/include/util.h
>> index 20684ccf9..d1b779d2b 100644
>> --- a/criu/include/util.h
>> +++ b/criu/include/util.h
>> @@ -190,6 +190,7 @@ extern int is_empty_dir(int dirfd);
>>    * path. Since fd is an integer, we can easily estimate one :)
>>    */
>>   #define PSFDS	(sizeof("/proc/self/fd/2147483647"))
>> +#define PPFDS	(sizeof("/proc/2147483647/fd/2147483647"))
>>
>>   extern int read_fd_link(int lfd, char *buf, size_t size);
>>   extern pid_t get_self_real_pid(void);
>> diff --git a/criu/mount.c b/criu/mount.c
>> index e2e7c7116..d47006b58 100644
>> --- a/criu/mount.c
>> +++ b/criu/mount.c
>> @@ -1106,7 +1106,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
>>    * root of our mount namespace as it is covered by other mount.
>>    * mnt_is_overmounted() checks if mount is not visible.
>>    */
>> -static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
>> +static bool mnt_is_overmounted(struct mount_info *mi)
>>   {
>>   	struct mount_info *t, *c, *m = mi;
>>
>> @@ -1223,7 +1223,7 @@ static int __umount_overmounts(struct mount_info *m)
>>   }
>>
>>   /* Make our mountpoint fully visible */
>> -static __maybe_unused int umount_overmounts(struct mount_info *m)
>> +static int umount_overmounts(struct mount_info *m)
>>   {
>>   	if (__umount_overmounts(m))
>>   		return -1;
>> @@ -1234,90 +1234,116 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
>>   	return 0;
>>   }
>>
>> -#define MNT_UNREACHABLE INT_MIN
>> -int open_mountpoint(struct mount_info *pm)
>> +enum {
>> +	FUTEX_FD_OK,
>> +	FUTEX_FD_ERR,
>> +	FUTEX_FD_SHIFT,
>> +};
>> +
>> +/* Open mountpoint clean from children and overmounts */
>> +int ns_open_mountpoint(struct mount_info *pm, futex_t *futex)
>>   {
>> -	struct mount_info *c;
>> -	int fd = -1, ns_old = -1;
>>   	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
>>   	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
>> -	char *mnt_path = mnt_path_tmp;
>> -	int cwd_fd;
>> -
>> -	/*
>> -	 * If a mount doesn't have children, we can open a mount point,
>> -	 * otherwise we need to create a "private" copy.
>> -	 */
>> -	if (list_empty(&pm->children))
>> -		return __open_mountpoint(pm, -1);
>> +	char *mnt_path = NULL;
>> +	int fd;
>>
>> -	pr_info("Something is mounted on top of %s\n", pm->mountpoint);
>> +	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, NULL) < 0)
>> +		goto err;
>> +	/* Need user namespace so that mounts will be unmounable */
>
> I don't understand what you mean here. Pls, explain why do we need to
> change a userns

# terminal 1:
# create new userns and create new mntns owned by the former
[root at cat snorch]# unshare -rUm
# mount something in new mntns
[root at cat snorch]# mount -t tmpfs tmpfs /mnt
[root at cat snorch]# sleep 1000

# terminal 2:
ps auxf | grep sleep
root      3132 ...
# enter only new mntns, userns is init_user_ns now
[root at cat snorch]# nsenter -t 3132 -m
# unshare mntns (to have a private copy)
[root at cat /]# unshare -m
# does not actually changes anything, but to make sure that unmounts not 
propagate to initial mount namespace
[root at cat /]# mount --make-rprivate /
# try to umount /mnt fails as mount is MNT_LOCKED
[root at cat /]# strace umount /mnt
...
umount2("/mnt", 0)                      = -1 EINVAL (Invalid argument)

That is because:
sys_umount() {
	retval = -EINVAL;
	if (mnt->mnt.mnt_flags & MNT_LOCKED)
		goto dput_and_out;
}

Mount is MNT_LOCKED because we are CL_UNPRIVILEGED:
clone_mnt()
{
         /* Don't allow unprivileged users to reveal what is under a 
mount */
         if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
                 mnt->mnt.mnt_flags |= MNT_LOCKED;
}

We are CL_UNPRIVILEGED as when unshare -m we were in non-owner userns:
copy_mnt_ns() {
         if (user_ns != ns->user_ns)
                 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
}

So, finally, if we want to make unmounts there we need to unshare -m 
from owner userns of mntns which we want to copy.

>
>> +	if (pm->nsid->user_ns &&
>> +	    switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
>> +		goto err;
>>
>> -	list_for_each_entry(c, &pm->children, siblings) {
>> -		if (!strcmp(c->mountpoint, pm->mountpoint)) {
>> -			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
>> -			return MNT_UNREACHABLE;
>> +	if (!mnt_is_overmounted(pm)) {
>> +		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
>> +		if (!mnt_path)
>> +			goto err;
>> +	} else {
>> +		/* Create helper mount namespace so we can unmount in it */
>> +		if (unshare(CLONE_NEWNS)) {
>> +			pr_perror("Unable to clone a mount namespace");
>> +			goto err;
>>   		}
>> -	}
>> +		if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
>> +			goto err;
>>
>> -	/*
>> -	 * To create a "private" copy, the target mount is bind-mounted
>> -	 * in a temporary place w/o MS_REC (non-recursively).
>> -	 * A mount point can't be bind-mounted in criu's namespace, it will be
>> -	 * mounted in a target namespace. The sequence of actions is
>> -	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
>> -	 */
>> +		if (umount_overmounts(pm))
>> +			goto err;
>>
>> -	cwd_fd = open(".", O_DIRECTORY);
>> -	if (cwd_fd < 0) {
>> -		pr_perror("Unable to open cwd");
>> -		return -1;
>> +		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
>> +		if (!mnt_path)
>> +			goto err;
>>   	}
>>
>> -	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
>> -		goto out;
>> +	fd = open_detach_mount(mnt_path);
>> +	if (fd < 0)
>> +		goto err;
>>
>> -	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
>> -	if (mnt_path == NULL) {
>> -		/*
>> -		 * We probably can't create a temporary direcotry,
>> -		 * so we can try to clone the mount namespace, open
>> -		 * the required mount and destroy this mount namespace
>> -		 * by calling restore_ns() below in this function.
>> -		 */
>> -		if (unshare(CLONE_NEWNS)) {
>> -			pr_perror("Unable to clone a mount namespace");
>> -			goto out;
>> -		}
>> +	/* Report fd to parent and wait until he he is done */
>> +	futex_set_and_wake(futex, fd + FUTEX_FD_SHIFT);
>> +	futex_wait_until(futex, FUTEX_FD_OK);
>> +	close(fd);
>> +	return 0;
>> +err:
>> +	futex_set_and_wake(futex, FUTEX_FD_ERR);
>> +	return -1;
>> +}
>>
>> -		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
>> -		if (fd < 0)
>> -			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
>> -	} else
>> -		fd = open_detach_mount(mnt_path);
>> -	if (fd < 0)
>> -		goto out;
>> +int open_mountpoint(struct mount_info *pm)
>> +{
>> +	int fd = -1, pid = -1;
>> +	char mnt_path[PPFDS];
>> +	futex_t *futex = NULL;
>>
>> -	if (restore_ns(ns_old, &mnt_ns_desc)) {
>> -		ns_old = -1;
>> -		goto out;
>> +	/* No overmounts and children - the entire mount is visible */
>> +	if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
>> +		return __open_mountpoint(pm, -1);
>> +
>> +	pr_info("Mount is not fully visible %s\n", pm->mountpoint);
>> +
>> +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>
> It is ugly. You can try to create a new process with CLONE_VM, but I
> think we should avaoid of forking a new process.

Entering unprivileged userns is irreversible operation so I need a 
helper process here. As a bonus we don't need to revert mntns change. It 
has short lifetime so I don't think it will influence anybody.

Will add CLONE_VM - it seem to be a good thing to make the process eat 
less memory.

>
>> +	if (futex == MAP_FAILED) {
>> +		pr_perror("Failed to mmap futex");
>> +		return 1;
>>   	}
>> -	if (fchdir(cwd_fd)) {
>> -		pr_perror("Unable to restore cwd");
>> -		close(cwd_fd);
>> -		close(fd);
>> -		return -1;
>> +	futex_init(futex);
>> +
>> +	pid = fork();
>
> Why do we need to create a new process? Can we avoid doing this?
>
>> +	if (pid == -1) {
>> +		pr_perror("Failed to fork a helper");
>> +		goto err;
>> +	} else if (pid == 0) {
>
> pls add a detailed comment why do we need to create a new process

Will add.

>
>> +		exit(ns_open_mountpoint(pm, futex));
>>   	}
>> -	close(cwd_fd);
>>
>> +	/* Wait mount fd from child */
>> +	futex_wait_while(futex, FUTEX_FD_OK);
>
> you will wait forever, if a child process segfaulted

It seem we have to have some timeout here. Also we can try to wait the 
child after timeout, and if the child is alright we can futex_wait 
again, and cycle.

>
>> +	fd = futex_get(futex) - FUTEX_FD_SHIFT;
>> +	if (fd == -1)
>> +		goto err;
>> +
>> +	/* Open child's fd */
>> +	snprintf(mnt_path, PPFDS,"/proc/%d/fd/%d", pid, fd);
>> +	fd = open(mnt_path, O_RDONLY | O_DIRECTORY);
>> +	if (fd == -1) {
>> +		pr_perror("Failed to open child fd");
>> +		goto err;
>> +	}
>> +
>> +	/* Release the child */
>> +	futex_set_and_wake(futex, 0);
>> +	waitpid(pid, NULL, 0);
>
> you have to check an exit code of waitpid and a status of the child
> process

Will do. We need it especially for a segfault case.

>
>> +
>> +	munmap(futex, sizeof(*futex));
>>   	return __open_mountpoint(pm, fd);
>> -out:
>> -	if (ns_old >= 0)
>> -		 restore_ns(ns_old, &mnt_ns_desc);
>> -	close_safe(&fd);
>> -	if (fchdir(cwd_fd))
>> -		pr_perror("Unable to restore cwd");
>> -	close(cwd_fd);
>> +err:
>> +	if (pid > 0) {
>> +		kill(pid, SIGKILL);
>> +		waitpid(pid, NULL, 0);
>> +	}
>> +	if (futex)
>> +		munmap(futex, sizeof(*futex));
>>   	return -1;
>>   }
>>
>> --
>> 2.13.6
>>


More information about the CRIU mailing list