[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmouts properly
Andrew Vagin
avagin at virtuozzo.com
Tue Dec 5 01:28:12 MSK 2017
On Sat, Dec 02, 2017 at 10:35:02AM +0300, Pavel Tikhomirov wrote:
>
>
> On 12/02/2017 05:47 AM, Andrew Vagin wrote:
> > On Fri, Nov 24, 2017 at 05:22:20PM +0300, Pavel Tikhomirov wrote:
> > > dump of VZ7 ct fails, if we have overmounted tmpfs inside:
> > >
> > > [root at silo ~]# prlctl enter su-test-2
> > > entered into CT
> > > CT-829e7b28 /# mkdir /mnt/overmntedtmp
> > > CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
> > > CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
> > > CT-829e7b28 /# logout
> > >
> > > [root at silo ~]# prlctl suspend su-test-2
> > > Suspending the CT...
> > > Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
> > > (01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
> > > (01.662528) Error (criu/util.c:709): exited, status=1
> > > (01.664329) Error (criu/util.c:709): exited, status=1
> > > (01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
> > > Failed to checkpoint the Container
> > > All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
> > > Checkpointing failed
> > > )
> > >
> > > Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
> > > unavailable. So we copy the mount namespace in such a case and unmount
> > > overmounts to access what we want to dump.
> > >
> > > Actual usecase here is dumping CT with active mariadb and ssh
> > > connection. Together they happen to create such overmount. As by default
> > > systemd creates a separate mount namespace for mysql and also mounts
> > > tmpfs to /run/user in it, and when ssh(root) is connected - systemd also
> > > mounts tmpfs in container root mount namespace to /run/user/0 for user
> > > files. As /run is slave mount /run/user/0 also propagates to mysql's
> > > mount namespace and initially becomes overmounted by /run/user.
> > >
> > > https://jira.sw.ru/browse/PSBM-57362
> > >
> > > remove __maybe_unused for mnt_is_overmounted and umount_overmounts
> > >
> > > Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> > > ---
> > > criu/filesystems.c | 2 +-
> > > criu/include/mount.h | 2 +
> > > criu/include/util.h | 1 +
> > > criu/mount.c | 164 +++++++++++++++++++++++++++++----------------------
> > > 4 files changed, 99 insertions(+), 70 deletions(-)
> > >
> > > diff --git a/criu/filesystems.c b/criu/filesystems.c
> > > index 5a88788d7..fa6d96ecf 100644
> > > --- a/criu/filesystems.c
> > > +++ b/criu/filesystems.c
> > > @@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
> > >
> > > fd = open_mountpoint(pm);
> > > if (fd < 0)
> > > - return fd;
> > > + return MNT_UNREACHABLE;
> > >
> > > /* if fd happens to be 0 here, we need to move it to something
> > > * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
> > > diff --git a/criu/include/mount.h b/criu/include/mount.h
> > > index e60dd348f..ed771ffac 100644
> > > --- a/criu/include/mount.h
> > > +++ b/criu/include/mount.h
> > > @@ -12,6 +12,8 @@ struct ns_id;
> > >
> > > #define MOUNT_INVALID_DEV (0)
> > >
> > > +#define MNT_UNREACHABLE INT_MIN
> > > +
> > > struct mount_info {
> > > int mnt_id;
> > > int parent_mnt_id;
> > > diff --git a/criu/include/util.h b/criu/include/util.h
> > > index 20684ccf9..d1b779d2b 100644
> > > --- a/criu/include/util.h
> > > +++ b/criu/include/util.h
> > > @@ -190,6 +190,7 @@ extern int is_empty_dir(int dirfd);
> > > * path. Since fd is an integer, we can easily estimate one :)
> > > */
> > > #define PSFDS (sizeof("/proc/self/fd/2147483647"))
> > > +#define PPFDS (sizeof("/proc/2147483647/fd/2147483647"))
> > >
> > > extern int read_fd_link(int lfd, char *buf, size_t size);
> > > extern pid_t get_self_real_pid(void);
> > > diff --git a/criu/mount.c b/criu/mount.c
> > > index e2e7c7116..d47006b58 100644
> > > --- a/criu/mount.c
> > > +++ b/criu/mount.c
> > > @@ -1106,7 +1106,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
> > > * root of our mount namespace as it is covered by other mount.
> > > * mnt_is_overmounted() checks if mount is not visible.
> > > */
> > > -static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
> > > +static bool mnt_is_overmounted(struct mount_info *mi)
> > > {
> > > struct mount_info *t, *c, *m = mi;
> > >
> > > @@ -1223,7 +1223,7 @@ static int __umount_overmounts(struct mount_info *m)
> > > }
> > >
> > > /* Make our mountpoint fully visible */
> > > -static __maybe_unused int umount_overmounts(struct mount_info *m)
> > > +static int umount_overmounts(struct mount_info *m)
> > > {
> > > if (__umount_overmounts(m))
> > > return -1;
> > > @@ -1234,90 +1234,116 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
> > > return 0;
> > > }
> > >
> > > -#define MNT_UNREACHABLE INT_MIN
> > > -int open_mountpoint(struct mount_info *pm)
> > > +enum {
> > > + FUTEX_FD_OK,
> > > + FUTEX_FD_ERR,
> > > + FUTEX_FD_SHIFT,
> > > +};
> > > +
> > > +/* Open mountpoint clean from children and overmounts */
> > > +int ns_open_mountpoint(struct mount_info *pm, futex_t *futex)
> > > {
> > > - struct mount_info *c;
> > > - int fd = -1, ns_old = -1;
> > > char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
> > > char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
> > > - char *mnt_path = mnt_path_tmp;
> > > - int cwd_fd;
> > > -
> > > - /*
> > > - * If a mount doesn't have children, we can open a mount point,
> > > - * otherwise we need to create a "private" copy.
> > > - */
> > > - if (list_empty(&pm->children))
> > > - return __open_mountpoint(pm, -1);
> > > + char *mnt_path = NULL;
> > > + int fd;
> > >
> > > - pr_info("Something is mounted on top of %s\n", pm->mountpoint);
> > > + if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, NULL) < 0)
> > > + goto err;
> > > + /* Need user namespace so that mounts will be unmounable */
> >
> > I don't understand what you mean here. Pls, explain why do we need to
> > change a userns
>
> # terminal 1:
> # create new userns and create new mntns owned by the former
> [root at cat snorch]# unshare -rUm
> # mount something in new mntns
> [root at cat snorch]# mount -t tmpfs tmpfs /mnt
> [root at cat snorch]# sleep 1000
>
> # terminal 2:
> ps auxf | grep sleep
> root 3132 ...
> # enter only new mntns, userns is init_user_ns now
> [root at cat snorch]# nsenter -t 3132 -m
> # unshare mntns (to have a private copy)
> [root at cat /]# unshare -m
> # does not actually changes anything, but to make sure that unmounts not
> propagate to initial mount namespace
> [root at cat /]# mount --make-rprivate /
> # try to umount /mnt fails as mount is MNT_LOCKED
> [root at cat /]# strace umount /mnt
> ...
> umount2("/mnt", 0) = -1 EINVAL (Invalid argument)
>
> That is because:
> sys_umount() {
> retval = -EINVAL;
> if (mnt->mnt.mnt_flags & MNT_LOCKED)
> goto dput_and_out;
> }
>
> Mount is MNT_LOCKED because we are CL_UNPRIVILEGED:
> clone_mnt()
> {
> /* Don't allow unprivileged users to reveal what is under a mount */
> if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
> mnt->mnt.mnt_flags |= MNT_LOCKED;
> }
>
> We are CL_UNPRIVILEGED as when unshare -m we were in non-owner userns:
> copy_mnt_ns() {
> if (user_ns != ns->user_ns)
I think we can try to send a patch to lkml, which changes this line on
this one:
if (user_ns != ns->user_ns && user_ns != &init_user_ns)
by other words, CL_UNPRIVILEGED isn't set for &init_user_ns
> copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
> }
>
> So, finally, if we want to make unmounts there we need to unshare -m from
> owner userns of mntns which we want to copy.
>
> >
> > > + if (pm->nsid->user_ns &&
> > > + switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
> > > + goto err;
> > >
> > > - list_for_each_entry(c, &pm->children, siblings) {
> > > - if (!strcmp(c->mountpoint, pm->mountpoint)) {
> > > - pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
> > > - return MNT_UNREACHABLE;
> > > + if (!mnt_is_overmounted(pm)) {
> > > + mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> > > + if (!mnt_path)
> > > + goto err;
> > > + } else {
> > > + /* Create helper mount namespace so we can unmount in it */
> > > + if (unshare(CLONE_NEWNS)) {
> > > + pr_perror("Unable to clone a mount namespace");
> > > + goto err;
> > > }
> > > - }
> > > + if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
> > > + goto err;
> > >
> > > - /*
> > > - * To create a "private" copy, the target mount is bind-mounted
> > > - * in a temporary place w/o MS_REC (non-recursively).
> > > - * A mount point can't be bind-mounted in criu's namespace, it will be
> > > - * mounted in a target namespace. The sequence of actions is
> > > - * mkdtemp, setns(tgt), mount, open, detach, setns(old).
> > > - */
> > > + if (umount_overmounts(pm))
> > > + goto err;
> > >
> > > - cwd_fd = open(".", O_DIRECTORY);
> > > - if (cwd_fd < 0) {
> > > - pr_perror("Unable to open cwd");
> > > - return -1;
> > > + mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> > > + if (!mnt_path)
> > > + goto err;
> > > }
> > >
> > > - if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
> > > - goto out;
> > > + fd = open_detach_mount(mnt_path);
> > > + if (fd < 0)
> > > + goto err;
> > >
> > > - mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
> > > - if (mnt_path == NULL) {
> > > - /*
> > > - * We probably can't create a temporary direcotry,
> > > - * so we can try to clone the mount namespace, open
> > > - * the required mount and destroy this mount namespace
> > > - * by calling restore_ns() below in this function.
> > > - */
> > > - if (unshare(CLONE_NEWNS)) {
> > > - pr_perror("Unable to clone a mount namespace");
> > > - goto out;
> > > - }
> > > + /* Report fd to parent and wait until he he is done */
> > > + futex_set_and_wake(futex, fd + FUTEX_FD_SHIFT);
> > > + futex_wait_until(futex, FUTEX_FD_OK);
> > > + close(fd);
> > > + return 0;
> > > +err:
> > > + futex_set_and_wake(futex, FUTEX_FD_ERR);
> > > + return -1;
> > > +}
> > >
> > > - fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
> > > - if (fd < 0)
> > > - pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
> > > - } else
> > > - fd = open_detach_mount(mnt_path);
> > > - if (fd < 0)
> > > - goto out;
> > > +int open_mountpoint(struct mount_info *pm)
> > > +{
> > > + int fd = -1, pid = -1;
> > > + char mnt_path[PPFDS];
> > > + futex_t *futex = NULL;
> > >
> > > - if (restore_ns(ns_old, &mnt_ns_desc)) {
> > > - ns_old = -1;
> > > - goto out;
> > > + /* No overmounts and children - the entire mount is visible */
> > > + if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
> > > + return __open_mountpoint(pm, -1);
> > > +
> > > + pr_info("Mount is not fully visible %s\n", pm->mountpoint);
> > > +
> > > + futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> >
> > It is ugly. You can try to create a new process with CLONE_VM, but I
> > think we should avaoid of forking a new process.
>
> Entering unprivileged userns is irreversible operation so I need a helper
> process here. As a bonus we don't need to revert mntns change. It has short
> lifetime so I don't think it will influence anybody.
>
> Will add CLONE_VM - it seem to be a good thing to make the process eat less
> memory.
It is about performance rather than memory. We have to be sure that we
fork a process only we don't have another way.
>
> >
> > > + if (futex == MAP_FAILED) {
> > > + pr_perror("Failed to mmap futex");
> > > + return 1;
> > > }
> > > - if (fchdir(cwd_fd)) {
> > > - pr_perror("Unable to restore cwd");
> > > - close(cwd_fd);
> > > - close(fd);
> > > - return -1;
> > > + futex_init(futex);
> > > +
> > > + pid = fork();
> >
> > Why do we need to create a new process? Can we avoid doing this?
> >
> > > + if (pid == -1) {
> > > + pr_perror("Failed to fork a helper");
> > > + goto err;
> > > + } else if (pid == 0) {
> >
> > pls add a detailed comment why do we need to create a new process
>
> Will add.
>
> >
> > > + exit(ns_open_mountpoint(pm, futex));
> > > }
> > > - close(cwd_fd);
> > >
> > > + /* Wait mount fd from child */
> > > + futex_wait_while(futex, FUTEX_FD_OK);
> >
> > you will wait forever, if a child process segfaulted
>
> It seem we have to have some timeout here. Also we can try to wait the child
> after timeout, and if the child is alright we can futex_wait again, and
> cycle.
or you can unset SA_RESTART for SIGCHLD and then futex() will return
-EINTR if a child execited.
>
> >
> > > + fd = futex_get(futex) - FUTEX_FD_SHIFT;
> > > + if (fd == -1)
> > > + goto err;
> > > +
> > > + /* Open child's fd */
> > > + snprintf(mnt_path, PPFDS,"/proc/%d/fd/%d", pid, fd);
> > > + fd = open(mnt_path, O_RDONLY | O_DIRECTORY);
> > > + if (fd == -1) {
> > > + pr_perror("Failed to open child fd");
> > > + goto err;
> > > + }
> > > +
> > > + /* Release the child */
> > > + futex_set_and_wake(futex, 0);
> > > + waitpid(pid, NULL, 0);
> >
> > you have to check an exit code of waitpid and a status of the child
> > process
>
> Will do. We need it especially for a segfault case.
You have to chech exit codes of all children. It is like brushing teeth
twice a day.
>
> >
> > > +
> > > + munmap(futex, sizeof(*futex));
> > > return __open_mountpoint(pm, fd);
> > > -out:
> > > - if (ns_old >= 0)
> > > - restore_ns(ns_old, &mnt_ns_desc);
> > > - close_safe(&fd);
> > > - if (fchdir(cwd_fd))
> > > - pr_perror("Unable to restore cwd");
> > > - close(cwd_fd);
> > > +err:
> > > + if (pid > 0) {
> > > + kill(pid, SIGKILL);
> > > + waitpid(pid, NULL, 0);
> > > + }
> > > + if (futex)
> > > + munmap(futex, sizeof(*futex));
> > > return -1;
> > > }
> > >
> > > --
> > > 2.13.6
> > >
More information about the CRIU
mailing list