[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmouts properly
Andrew Vagin
avagin at virtuozzo.com
Sat Dec 2 05:47:50 MSK 2017
On Fri, Nov 24, 2017 at 05:22:20PM +0300, Pavel Tikhomirov wrote:
> dump of VZ7 ct fails, if we have overmounted tmpfs inside:
>
> [root at silo ~]# prlctl enter su-test-2
> entered into CT
> CT-829e7b28 /# mkdir /mnt/overmntedtmp
> CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
> CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
> CT-829e7b28 /# logout
>
> [root at silo ~]# prlctl suspend su-test-2
> Suspending the CT...
> Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
> (01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
> (01.662528) Error (criu/util.c:709): exited, status=1
> (01.664329) Error (criu/util.c:709): exited, status=1
> (01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
> Failed to checkpoint the Container
> All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
> Checkpointing failed
> )
>
> Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
> unavailable. So we copy the mount namespace in such a case and unmount
> overmounts to access what we want to dump.
>
> Actual usecase here is dumping CT with active mariadb and ssh
> connection. Together they happen to create such overmount. As by default
> systemd creates a separate mount namespace for mysql and also mounts
> tmpfs to /run/user in it, and when ssh(root) is connected - systemd also
> mounts tmpfs in container root mount namespace to /run/user/0 for user
> files. As /run is slave mount /run/user/0 also propagates to mysql's
> mount namespace and initially becomes overmounted by /run/user.
>
> https://jira.sw.ru/browse/PSBM-57362
>
> remove __maybe_unused for mnt_is_overmounted and umount_overmounts
>
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
> criu/filesystems.c | 2 +-
> criu/include/mount.h | 2 +
> criu/include/util.h | 1 +
> criu/mount.c | 164 +++++++++++++++++++++++++++++----------------------
> 4 files changed, 99 insertions(+), 70 deletions(-)
>
> diff --git a/criu/filesystems.c b/criu/filesystems.c
> index 5a88788d7..fa6d96ecf 100644
> --- a/criu/filesystems.c
> +++ b/criu/filesystems.c
> @@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
>
> fd = open_mountpoint(pm);
> if (fd < 0)
> - return fd;
> + return MNT_UNREACHABLE;
>
> /* if fd happens to be 0 here, we need to move it to something
> * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
> diff --git a/criu/include/mount.h b/criu/include/mount.h
> index e60dd348f..ed771ffac 100644
> --- a/criu/include/mount.h
> +++ b/criu/include/mount.h
> @@ -12,6 +12,8 @@ struct ns_id;
>
> #define MOUNT_INVALID_DEV (0)
>
> +#define MNT_UNREACHABLE INT_MIN
> +
> struct mount_info {
> int mnt_id;
> int parent_mnt_id;
> diff --git a/criu/include/util.h b/criu/include/util.h
> index 20684ccf9..d1b779d2b 100644
> --- a/criu/include/util.h
> +++ b/criu/include/util.h
> @@ -190,6 +190,7 @@ extern int is_empty_dir(int dirfd);
> * path. Since fd is an integer, we can easily estimate one :)
> */
> #define PSFDS (sizeof("/proc/self/fd/2147483647"))
> +#define PPFDS (sizeof("/proc/2147483647/fd/2147483647"))
>
> extern int read_fd_link(int lfd, char *buf, size_t size);
> extern pid_t get_self_real_pid(void);
> diff --git a/criu/mount.c b/criu/mount.c
> index e2e7c7116..d47006b58 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -1106,7 +1106,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
> * root of our mount namespace as it is covered by other mount.
> * mnt_is_overmounted() checks if mount is not visible.
> */
> -static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
> +static bool mnt_is_overmounted(struct mount_info *mi)
> {
> struct mount_info *t, *c, *m = mi;
>
> @@ -1223,7 +1223,7 @@ static int __umount_overmounts(struct mount_info *m)
> }
>
> /* Make our mountpoint fully visible */
> -static __maybe_unused int umount_overmounts(struct mount_info *m)
> +static int umount_overmounts(struct mount_info *m)
> {
> if (__umount_overmounts(m))
> return -1;
> @@ -1234,90 +1234,116 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
> return 0;
> }
>
> -#define MNT_UNREACHABLE INT_MIN
> -int open_mountpoint(struct mount_info *pm)
> +enum {
> + FUTEX_FD_OK,
> + FUTEX_FD_ERR,
> + FUTEX_FD_SHIFT,
> +};
> +
> +/* Open mountpoint clean from children and overmounts */
> +int ns_open_mountpoint(struct mount_info *pm, futex_t *futex)
> {
> - struct mount_info *c;
> - int fd = -1, ns_old = -1;
> char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
> char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
> - char *mnt_path = mnt_path_tmp;
> - int cwd_fd;
> -
> - /*
> - * If a mount doesn't have children, we can open a mount point,
> - * otherwise we need to create a "private" copy.
> - */
> - if (list_empty(&pm->children))
> - return __open_mountpoint(pm, -1);
> + char *mnt_path = NULL;
> + int fd;
>
> - pr_info("Something is mounted on top of %s\n", pm->mountpoint);
> + if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, NULL) < 0)
> + goto err;
> + /* Need user namespace so that mounts will be unmounable */
I don't understand what you mean here. Pls, explain why do we need to
change a userns
> + if (pm->nsid->user_ns &&
> + switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
> + goto err;
>
> - list_for_each_entry(c, &pm->children, siblings) {
> - if (!strcmp(c->mountpoint, pm->mountpoint)) {
> - pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
> - return MNT_UNREACHABLE;
> + if (!mnt_is_overmounted(pm)) {
> + mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> + if (!mnt_path)
> + goto err;
> + } else {
> + /* Create helper mount namespace so we can unmount in it */
> + if (unshare(CLONE_NEWNS)) {
> + pr_perror("Unable to clone a mount namespace");
> + goto err;
> }
> - }
> + if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
> + goto err;
>
> - /*
> - * To create a "private" copy, the target mount is bind-mounted
> - * in a temporary place w/o MS_REC (non-recursively).
> - * A mount point can't be bind-mounted in criu's namespace, it will be
> - * mounted in a target namespace. The sequence of actions is
> - * mkdtemp, setns(tgt), mount, open, detach, setns(old).
> - */
> + if (umount_overmounts(pm))
> + goto err;
>
> - cwd_fd = open(".", O_DIRECTORY);
> - if (cwd_fd < 0) {
> - pr_perror("Unable to open cwd");
> - return -1;
> + mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> + if (!mnt_path)
> + goto err;
> }
>
> - if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
> - goto out;
> + fd = open_detach_mount(mnt_path);
> + if (fd < 0)
> + goto err;
>
> - mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> - if (mnt_path == NULL) {
> - /*
> - * We probably can't create a temporary direcotry,
> - * so we can try to clone the mount namespace, open
> - * the required mount and destroy this mount namespace
> - * by calling restore_ns() below in this function.
> - */
> - if (unshare(CLONE_NEWNS)) {
> - pr_perror("Unable to clone a mount namespace");
> - goto out;
> - }
> + /* Report fd to parent and wait until he he is done */
> + futex_set_and_wake(futex, fd + FUTEX_FD_SHIFT);
> + futex_wait_until(futex, FUTEX_FD_OK);
> + close(fd);
> + return 0;
> +err:
> + futex_set_and_wake(futex, FUTEX_FD_ERR);
> + return -1;
> +}
>
> - fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
> - if (fd < 0)
> - pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
> - } else
> - fd = open_detach_mount(mnt_path);
> - if (fd < 0)
> - goto out;
> +int open_mountpoint(struct mount_info *pm)
> +{
> + int fd = -1, pid = -1;
> + char mnt_path[PPFDS];
> + futex_t *futex = NULL;
>
> - if (restore_ns(ns_old, &mnt_ns_desc)) {
> - ns_old = -1;
> - goto out;
> + /* No overmounts and children - the entire mount is visible */
> + if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
> + return __open_mountpoint(pm, -1);
> +
> + pr_info("Mount is not fully visible %s\n", pm->mountpoint);
> +
> + futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
It is ugly. You can try to create a new process with CLONE_VM, but I
think we should avaoid of forking a new process.
> + if (futex == MAP_FAILED) {
> + pr_perror("Failed to mmap futex");
> + return 1;
> }
> - if (fchdir(cwd_fd)) {
> - pr_perror("Unable to restore cwd");
> - close(cwd_fd);
> - close(fd);
> - return -1;
> + futex_init(futex);
> +
> + pid = fork();
Why do we need to create a new process? Can we avoid doing this?
> + if (pid == -1) {
> + pr_perror("Failed to fork a helper");
> + goto err;
> + } else if (pid == 0) {
pls add a detailed comment why do we need to create a new process
> + exit(ns_open_mountpoint(pm, futex));
> }
> - close(cwd_fd);
>
> + /* Wait mount fd from child */
> + futex_wait_while(futex, FUTEX_FD_OK);
you will wait forever, if a child process segfaulted
> + fd = futex_get(futex) - FUTEX_FD_SHIFT;
> + if (fd == -1)
> + goto err;
> +
> + /* Open child's fd */
> + snprintf(mnt_path, PPFDS,"/proc/%d/fd/%d", pid, fd);
> + fd = open(mnt_path, O_RDONLY | O_DIRECTORY);
> + if (fd == -1) {
> + pr_perror("Failed to open child fd");
> + goto err;
> + }
> +
> + /* Release the child */
> + futex_set_and_wake(futex, 0);
> + waitpid(pid, NULL, 0);
you have to check an exit code of waitpid and a status of the child
process
> +
> + munmap(futex, sizeof(*futex));
> return __open_mountpoint(pm, fd);
> -out:
> - if (ns_old >= 0)
> - restore_ns(ns_old, &mnt_ns_desc);
> - close_safe(&fd);
> - if (fchdir(cwd_fd))
> - pr_perror("Unable to restore cwd");
> - close(cwd_fd);
> +err:
> + if (pid > 0) {
> + kill(pid, SIGKILL);
> + waitpid(pid, NULL, 0);
> + }
> + if (futex)
> + munmap(futex, sizeof(*futex));
> return -1;
> }
>
> --
> 2.13.6
>
More information about the CRIU
mailing list