[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmouts properly

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Fri Nov 24 17:22:20 MSK 2017


dump of VZ7 ct fails, if we have overmounted tmpfs inside:

[root at silo ~]# prlctl enter su-test-2
entered into CT
CT-829e7b28 /# mkdir /mnt/overmntedtmp
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
CT-829e7b28 /# logout

[root at silo ~]# prlctl suspend su-test-2
Suspending the CT...
Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
(01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
(01.662528) Error (criu/util.c:709): exited, status=1
(01.664329) Error (criu/util.c:709): exited, status=1
(01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
Failed to checkpoint the Container
All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
Checkpointing failed
)

Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
unavailable. So we copy the mount namespace in such a case and unmount
overmounts to access what we want to dump.

Actual usecase here is dumping CT with active mariadb and ssh
connection. Together they happen to create such overmount. As by default
systemd creates a separate mount namespace for mysql and also mounts
tmpfs to /run/user in it, and when ssh(root) is connected - systemd also
mounts tmpfs in container root mount namespace to /run/user/0 for user
files. As /run is slave mount /run/user/0 also propagates to mysql's
mount namespace and initially becomes overmounted by /run/user.

https://jira.sw.ru/browse/PSBM-57362

remove __maybe_unused for mnt_is_overmounted and umount_overmounts

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 criu/filesystems.c   |   2 +-
 criu/include/mount.h |   2 +
 criu/include/util.h  |   1 +
 criu/mount.c         | 164 +++++++++++++++++++++++++++++----------------------
 4 files changed, 99 insertions(+), 70 deletions(-)

diff --git a/criu/filesystems.c b/criu/filesystems.c
index 5a88788d7..fa6d96ecf 100644
--- a/criu/filesystems.c
+++ b/criu/filesystems.c
@@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
 
 	fd = open_mountpoint(pm);
 	if (fd < 0)
-		return fd;
+		return MNT_UNREACHABLE;
 
 	/* if fd happens to be 0 here, we need to move it to something
 	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
diff --git a/criu/include/mount.h b/criu/include/mount.h
index e60dd348f..ed771ffac 100644
--- a/criu/include/mount.h
+++ b/criu/include/mount.h
@@ -12,6 +12,8 @@ struct ns_id;
 
 #define MOUNT_INVALID_DEV	(0)
 
+#define MNT_UNREACHABLE INT_MIN
+
 struct mount_info {
 	int			mnt_id;
 	int			parent_mnt_id;
diff --git a/criu/include/util.h b/criu/include/util.h
index 20684ccf9..d1b779d2b 100644
--- a/criu/include/util.h
+++ b/criu/include/util.h
@@ -190,6 +190,7 @@ extern int is_empty_dir(int dirfd);
  * path. Since fd is an integer, we can easily estimate one :)
  */
 #define PSFDS	(sizeof("/proc/self/fd/2147483647"))
+#define PPFDS	(sizeof("/proc/2147483647/fd/2147483647"))
 
 extern int read_fd_link(int lfd, char *buf, size_t size);
 extern pid_t get_self_real_pid(void);
diff --git a/criu/mount.c b/criu/mount.c
index e2e7c7116..d47006b58 100644
--- a/criu/mount.c
+++ b/criu/mount.c
@@ -1106,7 +1106,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
  * root of our mount namespace as it is covered by other mount.
  * mnt_is_overmounted() checks if mount is not visible.
  */
-static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
+static bool mnt_is_overmounted(struct mount_info *mi)
 {
 	struct mount_info *t, *c, *m = mi;
 
@@ -1223,7 +1223,7 @@ static int __umount_overmounts(struct mount_info *m)
 }
 
 /* Make our mountpoint fully visible */
-static __maybe_unused int umount_overmounts(struct mount_info *m)
+static int umount_overmounts(struct mount_info *m)
 {
 	if (__umount_overmounts(m))
 		return -1;
@@ -1234,90 +1234,116 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
 	return 0;
 }
 
-#define MNT_UNREACHABLE INT_MIN
-int open_mountpoint(struct mount_info *pm)
+enum {
+	FUTEX_FD_OK,
+	FUTEX_FD_ERR,
+	FUTEX_FD_SHIFT,
+};
+
+/* Open mountpoint clean from children and overmounts */
+int ns_open_mountpoint(struct mount_info *pm, futex_t *futex)
 {
-	struct mount_info *c;
-	int fd = -1, ns_old = -1;
 	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
 	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
-	char *mnt_path = mnt_path_tmp;
-	int cwd_fd;
-
-	/*
-	 * If a mount doesn't have children, we can open a mount point,
-	 * otherwise we need to create a "private" copy.
-	 */
-	if (list_empty(&pm->children))
-		return __open_mountpoint(pm, -1);
+	char *mnt_path = NULL;
+	int fd;
 
-	pr_info("Something is mounted on top of %s\n", pm->mountpoint);
+	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, NULL) < 0)
+		goto err;
+	/* Need user namespace so that mounts will be unmounable */
+	if (pm->nsid->user_ns &&
+	    switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
+		goto err;
 
-	list_for_each_entry(c, &pm->children, siblings) {
-		if (!strcmp(c->mountpoint, pm->mountpoint)) {
-			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
-			return MNT_UNREACHABLE;
+	if (!mnt_is_overmounted(pm)) {
+		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
+		if (!mnt_path)
+			goto err;
+	} else {
+		/* Create helper mount namespace so we can unmount in it */
+		if (unshare(CLONE_NEWNS)) {
+			pr_perror("Unable to clone a mount namespace");
+			goto err;
 		}
-	}
+		if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
+			goto err;
 
-	/*
-	 * To create a "private" copy, the target mount is bind-mounted
-	 * in a temporary place w/o MS_REC (non-recursively).
-	 * A mount point can't be bind-mounted in criu's namespace, it will be
-	 * mounted in a target namespace. The sequence of actions is
-	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
-	 */
+		if (umount_overmounts(pm))
+			goto err;
 
-	cwd_fd = open(".", O_DIRECTORY);
-	if (cwd_fd < 0) {
-		pr_perror("Unable to open cwd");
-		return -1;
+		mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
+		if (!mnt_path)
+			goto err;
 	}
 
-	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
-		goto out;
+	fd = open_detach_mount(mnt_path);
+	if (fd < 0)
+		goto err;
 
-	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
-	if (mnt_path == NULL) {
-		/*
-		 * We probably can't create a temporary direcotry,
-		 * so we can try to clone the mount namespace, open
-		 * the required mount and destroy this mount namespace
-		 * by calling restore_ns() below in this function.
-		 */
-		if (unshare(CLONE_NEWNS)) {
-			pr_perror("Unable to clone a mount namespace");
-			goto out;
-		}
+	/* Report fd to parent and wait until he he is done */
+	futex_set_and_wake(futex, fd + FUTEX_FD_SHIFT);
+	futex_wait_until(futex, FUTEX_FD_OK);
+	close(fd);
+	return 0;
+err:
+	futex_set_and_wake(futex, FUTEX_FD_ERR);
+	return -1;
+}
 
-		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
-		if (fd < 0)
-			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
-	} else
-		fd = open_detach_mount(mnt_path);
-	if (fd < 0)
-		goto out;
+int open_mountpoint(struct mount_info *pm)
+{
+	int fd = -1, pid = -1;
+	char mnt_path[PPFDS];
+	futex_t *futex = NULL;
 
-	if (restore_ns(ns_old, &mnt_ns_desc)) {
-		ns_old = -1;
-		goto out;
+	/* No overmounts and children - the entire mount is visible */
+	if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
+		return __open_mountpoint(pm, -1);
+
+	pr_info("Mount is not fully visible %s\n", pm->mountpoint);
+
+	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (futex == MAP_FAILED) {
+		pr_perror("Failed to mmap futex");
+		return 1;
 	}
-	if (fchdir(cwd_fd)) {
-		pr_perror("Unable to restore cwd");
-		close(cwd_fd);
-		close(fd);
-		return -1;
+	futex_init(futex);
+
+	pid = fork();
+	if (pid == -1) {
+		pr_perror("Failed to fork a helper");
+		goto err;
+	} else if (pid == 0) {
+		exit(ns_open_mountpoint(pm, futex));
 	}
-	close(cwd_fd);
 
+	/* Wait mount fd from child */
+	futex_wait_while(futex, FUTEX_FD_OK);
+	fd = futex_get(futex) - FUTEX_FD_SHIFT;
+	if (fd == -1)
+		goto err;
+
+	/* Open child's fd */
+	snprintf(mnt_path, PPFDS,"/proc/%d/fd/%d", pid, fd);
+	fd = open(mnt_path, O_RDONLY | O_DIRECTORY);
+	if (fd == -1) {
+		pr_perror("Failed to open child fd");
+		goto err;
+	}
+
+	/* Release the child */
+	futex_set_and_wake(futex, 0);
+	waitpid(pid, NULL, 0);
+
+	munmap(futex, sizeof(*futex));
 	return __open_mountpoint(pm, fd);
-out:
-	if (ns_old >= 0)
-		 restore_ns(ns_old, &mnt_ns_desc);
-	close_safe(&fd);
-	if (fchdir(cwd_fd))
-		pr_perror("Unable to restore cwd");
-	close(cwd_fd);
+err:
+	if (pid > 0) {
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+	}
+	if (futex)
+		munmap(futex, sizeof(*futex));
 	return -1;
 }
 
-- 
2.13.6



More information about the CRIU mailing list