[CRIU] [PATCH 4/5] mount: make open_mountpoint handle overmounts properly

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Wed Dec 6 16:47:09 MSK 2017


Dump of a VZ7 CT fails if there is an overmounted tmpfs inside:

[root at silo ~]# prlctl enter su-test-2
entered into CT
CT-829e7b28 /# mkdir /mnt/overmntedtmp
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
CT-829e7b28 /# logout

[root at silo ~]# prlctl suspend su-test-2
Suspending the CT...
Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
(01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
(01.662528) Error (criu/util.c:709): exited, status=1
(01.664329) Error (criu/util.c:709): exited, status=1
(01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
Failed to checkpoint the Container
All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
Checkpointing failed
)

CRIU wants to dump the contents of the /mnt/overmntedtmp/ mount, but it is
unavailable because it is covered by the tmpfs mounted on /mnt. So in such
a case we copy the mount namespace and unmount the overmounts in the copy
to access what we want to dump.

The actual use case here is dumping a CT with an active mariadb and an ssh
connection. Together they happen to create such an overmount: by default
systemd creates a separate mount namespace for mysql and mounts tmpfs on
/run/user in it, and when ssh (root) is connected, systemd also mounts
tmpfs on /run/user/0 in the container's root mount namespace for user
files. As /run is a slave mount, /run/user/0 also propagates to mysql's
mount namespace and initially becomes overmounted by /run/user.

https://jira.sw.ru/browse/PSBM-57362

Also remove __maybe_unused from mnt_is_overmounted and umount_overmounts,
as both are now used.

changes in v2:
1) Use clone instead of fork, sharing resources with the parent the same
way as in call_in_child_process.
2) Do not enter the userns (i.e. do not create a helper) for
non-overmounted mounts. Thus bring back the setns/restore_ns logic.
3) The helper opens the fd for the parent directly thanks to CLONE_FILES;
remove the futex.
4) Check helper exit status properly.
5) Add get_clean_fd helper.
6) Add better comments.

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 criu/filesystems.c   |   2 +-
 criu/include/mount.h |   2 +
 criu/mount.c         | 151 +++++++++++++++++++++++++++++++++++----------------
 3 files changed, 106 insertions(+), 49 deletions(-)

diff --git a/criu/filesystems.c b/criu/filesystems.c
index 5a88788d7..fa6d96ecf 100644
--- a/criu/filesystems.c
+++ b/criu/filesystems.c
@@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
 
 	fd = open_mountpoint(pm);
 	if (fd < 0)
-		return fd;
+		return MNT_UNREACHABLE;
 
 	/* if fd happens to be 0 here, we need to move it to something
 	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
diff --git a/criu/include/mount.h b/criu/include/mount.h
index e60dd348f..ed771ffac 100644
--- a/criu/include/mount.h
+++ b/criu/include/mount.h
@@ -12,6 +12,8 @@ struct ns_id;
 
 #define MOUNT_INVALID_DEV	(0)
 
+#define MNT_UNREACHABLE INT_MIN
+
 struct mount_info {
 	int			mnt_id;
 	int			parent_mnt_id;
diff --git a/criu/mount.c b/criu/mount.c
index e2e7c7116..e208c9b7f 100644
--- a/criu/mount.c
+++ b/criu/mount.c
@@ -27,6 +27,7 @@
 #include "files-reg.h"
 #include "external.h"
 #include "fdstore.h"
+#include "clone-noasan.h"
 
 #include "images/mnt.pb-c.h"
 
@@ -1097,6 +1098,19 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
 	return mnt_path;
 }
 
+static int get_clean_fd(struct mount_info *mi)
+{
+	char *mnt_path = NULL;
+	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
+	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
+
+	mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root);
+	if (!mnt_path)
+		return -1;
+
+	return open_detach_mount(mnt_path);
+}
+
 /*
  * Our children mount can have same mountpoint as it's parent,
  * call these - children-overmount.
@@ -1106,7 +1120,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
  * root of our mount namespace as it is covered by other mount.
  * mnt_is_overmounted() checks if mount is not visible.
  */
-static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
+static bool mnt_is_overmounted(struct mount_info *mi)
 {
 	struct mount_info *t, *c, *m = mi;
 
@@ -1223,7 +1237,7 @@ static int __umount_overmounts(struct mount_info *m)
 }
 
 /* Make our mountpoint fully visible */
-static __maybe_unused int umount_overmounts(struct mount_info *m)
+static int umount_overmounts(struct mount_info *m)
 {
 	if (__umount_overmounts(m))
 		return -1;
@@ -1234,40 +1248,61 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
 	return 0;
 }
 
-#define MNT_UNREACHABLE INT_MIN
+/* Open mountpoint clean from children and overmounts */
+int ns_open_mountpoint(void *arg)
+{
+	struct mount_info *pm = arg;
+	int fd;
+
+	/* Need user namespace so that mounts will be unmountable */
+	if (pm->nsid->user_ns &&
+	    switch_ns(pm->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
+		goto err;
+
+	/* Create helper mount namespace so we can unmount in it */
+	if (unshare(CLONE_NEWNS)) {
+		pr_perror("Unable to unshare a mount namespace");
+		goto err;
+	}
+
+	/* Recursively remount private to disable propagation */
+	if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
+		goto err;
+
+	if (umount_overmounts(pm))
+		goto err;
+
+	fd = get_clean_fd(pm);
+	if (fd < 0)
+		goto err;
+
+	/* Return fd which we opened for parent due to CLONE_FILES flag */
+	return fd;
+err:
+	return -1;
+}
+
 int open_mountpoint(struct mount_info *pm)
 {
-	struct mount_info *c;
-	int fd = -1, ns_old = -1;
-	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
-	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
-	char *mnt_path = mnt_path_tmp;
-	int cwd_fd;
+	int fd, cwd_fd, ns_old = -1;
 
-	/*
-	 * If a mount doesn't have children, we can open a mount point,
-	 * otherwise we need to create a "private" copy.
-	 */
-	if (list_empty(&pm->children))
+	/* No overmounts and children - the entire mount is visible */
+	if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
 		return __open_mountpoint(pm, -1);
 
-	pr_info("Something is mounted on top of %s\n", pm->mountpoint);
-
-	list_for_each_entry(c, &pm->children, siblings) {
-		if (!strcmp(c->mountpoint, pm->mountpoint)) {
-			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
-			return MNT_UNREACHABLE;
-		}
-	}
+	pr_info("Mount is not fully visible %s\n", pm->mountpoint);
 
 	/*
-	 * To create a "private" copy, the target mount is bind-mounted
-	 * in a temporary place w/o MS_REC (non-recursively).
-	 * A mount point can't be bind-mounted in criu's namespace, it will be
-	 * mounted in a target namespace. The sequence of actions is
-	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
+	 * We do two things below:
+	 * a) If mount has children mounts in it which partially cover its
+	 * content, to get access to the content we create a "private" copy of
+	 * such a mount, bind-mounting mount w/o MS_REC in a temporary place.
+	 * b) If mount is overmounted we create a private copy of its mount
+	 * namespace so that we can safely get rid of overmounts and get an
+	 * access to the mount.
+	 * In both cases we can't do the thing from criu's mount namespace, so
+	 * we need to switch to mount's mount namespace, and later switch back.
 	 */
-
 	cwd_fd = open(".", O_DIRECTORY);
 	if (cwd_fd < 0) {
 		pr_perror("Unable to open cwd");
@@ -1275,33 +1310,53 @@ int open_mountpoint(struct mount_info *pm)
 	}
 
 	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
-		goto out;
+		goto err;
+
+	if (!mnt_is_overmounted(pm)) {
+		pr_info("\tmount has children %s\n", pm->mountpoint);
+
+		fd = get_clean_fd(pm);
+		if (fd < 0)
+			goto err;
+	} else {
+		int pid, status;
+
+		pr_info("\tmount is overmounted %s\n", pm->mountpoint);
 
-	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
-	if (mnt_path == NULL) {
 		/*
-		 * We probably can't create a temporary direcotry,
-		 * so we can try to clone the mount namespace, open
-		 * the required mount and destroy this mount namespace
-		 * by calling restore_ns() below in this function.
+		 * We are overmounted - not accessible in regular way. We need
+		 * to clone "private" copy of mount's mount namespace and
+		 * unmount all covering overmounts. We also need to enter user
+		 * namespace owning mount's mount namespace first, as else all
+		 * mounts in "private" copy will be MNT_LOCKED and we won't be
+		 * able to unmount them (See CL_UNPRIVILEGED in sys_umount(),
+		 * clone_mnt() and copy_mnt_ns() in linux kernel code).
+		 * We have to create helper process for it as entering user
+		 * namespace is irreversible operation.
 		 */
-		if (unshare(CLONE_NEWNS)) {
-			pr_perror("Unable to clone a mount namespace");
-			goto out;
+		pid = clone_noasan(ns_open_mountpoint, CLONE_VFORK | CLONE_VM
+				| CLONE_FILES | CLONE_IO | CLONE_SIGHAND
+				| CLONE_SYSVSEM, pm);
+		if (pid == -1) {
+			pr_perror("Can't clone helper process");
+			return -1;
 		}
 
-		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
-		if (fd < 0)
-			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
-	} else
-		fd = open_detach_mount(mnt_path);
-	if (fd < 0)
-		goto out;
+		errno = 0;
+		if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
+				|| WEXITSTATUS(status) == -1) {
+			pr_err("Can't wait or bad status: errno=%d, status=%d",
+				errno, status);
+			return -1;
+		}
+		fd = WEXITSTATUS(status);
+	}
 
 	if (restore_ns(ns_old, &mnt_ns_desc)) {
 		ns_old = -1;
-		goto out;
+		goto err;
 	}
+
 	if (fchdir(cwd_fd)) {
 		pr_perror("Unable to restore cwd");
 		close(cwd_fd);
@@ -1311,9 +1366,9 @@ int open_mountpoint(struct mount_info *pm)
 	close(cwd_fd);
 
 	return __open_mountpoint(pm, fd);
-out:
+err:
 	if (ns_old >= 0)
-		 restore_ns(ns_old, &mnt_ns_desc);
+		restore_ns(ns_old, &mnt_ns_desc);
 	close_safe(&fd);
 	if (fchdir(cwd_fd))
 		pr_perror("Unable to restore cwd");
-- 
2.13.6



More information about the CRIU mailing list