[CRIU] [PATCH 4/9] mount: make open_mountpoint handle overmounts properly

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Fri Dec 22 11:22:26 MSK 2017


Dump of a VZ7 CT fails if we have an overmounted tmpfs inside:

[root@silo ~]# prlctl enter su-test-2
entered into CT
CT-829e7b28 /# mkdir /mnt/overmntedtmp
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
CT-829e7b28 /# logout

[root@silo ~]# prlctl suspend su-test-2
Suspending the CT...
Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
(01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
(01.662528) Error (criu/util.c:709): exited, status=1
(01.664329) Error (criu/util.c:709): exited, status=1
(01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
Failed to checkpoint the Container
All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
Checkpointing failed
)

CRIU wants to dump the contents of the /mnt/overmntedtmp/ mount, but it is
unavailable because /mnt has been overmounted. So in such a case we copy the
mount namespace and unmount the overmounts to access what we want to dump.

The actual use case here is dumping a CT with an active mariadb and an ssh
connection. Together they happen to create such an overmount. By default
systemd creates a separate mount namespace for mysql and also mounts
tmpfs on /run/user in it. When ssh (root) is connected, systemd also
mounts tmpfs on /run/user/0 in the container's root mount namespace for
user files. As /run is a slave mount, /run/user/0 also propagates to mysql's
mount namespace and initially becomes overmounted by /run/user.

https://jira.sw.ru/browse/PSBM-57362

Also remove __maybe_unused from mnt_is_overmounted and umount_overmounts.

changes in v2:
1) Use clone instead of fork, and share resources with the parent the same
way as in call_in_child_process.
2) Do not enter userns (create helper) for non-overmounted mounts. Thus
bring back the setns/restore_ns logic.
3) Helper opens fd for parent directly due to CLONE_FILES, remove futex.
4) Check helper exit status properly.
5) Add get_clean_fd helper.
6) Add better comments.

changes in v3:
1) Pass fd from helper through args instead of ret code, fix ret code
checking.
2) Add \n to pr_err in open_mountpoint

changes in v5:
Make comments even better.

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 criu/filesystems.c   |   2 +-
 criu/include/mount.h |   2 +
 criu/mount.c         | 168 +++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 124 insertions(+), 48 deletions(-)

diff --git a/criu/filesystems.c b/criu/filesystems.c
index 5a88788d7..fa6d96ecf 100644
--- a/criu/filesystems.c
+++ b/criu/filesystems.c
@@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
 
 	fd = open_mountpoint(pm);
 	if (fd < 0)
-		return fd;
+		return MNT_UNREACHABLE;
 
 	/* if fd happens to be 0 here, we need to move it to something
 	 * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
diff --git a/criu/include/mount.h b/criu/include/mount.h
index e60dd348f..ed771ffac 100644
--- a/criu/include/mount.h
+++ b/criu/include/mount.h
@@ -12,6 +12,8 @@ struct ns_id;
 
 #define MOUNT_INVALID_DEV	(0)
 
+#define MNT_UNREACHABLE INT_MIN
+
 struct mount_info {
 	int			mnt_id;
 	int			parent_mnt_id;
diff --git a/criu/mount.c b/criu/mount.c
index 18c0b2e6d..271dc6f4e 100644
--- a/criu/mount.c
+++ b/criu/mount.c
@@ -27,6 +27,7 @@
 #include "files-reg.h"
 #include "external.h"
 #include "fdstore.h"
+#include "clone-noasan.h"
 
 #include "images/mnt.pb-c.h"
 
@@ -1097,6 +1098,19 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
 	return mnt_path;
 }
 
+static int get_clean_fd(struct mount_info *mi)
+{
+	char *mnt_path = NULL;
+	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
+	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
+
+	mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root);
+	if (!mnt_path)
+		return -1;
+
+	return open_detach_mount(mnt_path);
+}
+
 /*
  * Our children mount can have same mountpoint as it's parent,
  * call these - children-overmount.
@@ -1106,7 +1120,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
  * root of our mount namespace as it is covered by other mount.
  * mnt_is_overmounted() checks if mount is not visible.
  */
-static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi)
+static bool mnt_is_overmounted(struct mount_info *mi)
 {
 	struct mount_info *t, *c, *m = mi;
 
@@ -1223,7 +1237,7 @@ static int __umount_overmounts(struct mount_info *m)
 }
 
 /* Make our mountpoint fully visible */
-static __maybe_unused int umount_overmounts(struct mount_info *m)
+static int umount_overmounts(struct mount_info *m)
 {
 	if (__umount_overmounts(m))
 		return -1;
@@ -1234,40 +1248,79 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
 	return 0;
 }
 
-#define MNT_UNREACHABLE INT_MIN
-int open_mountpoint(struct mount_info *pm)
+struct clone_arg {
+	struct mount_info *mi;
+	int *fd;
+};
+
+/*
+ * Get access to the mountpoint covered by overmounts
+ * and open it's cleaned copy (without children mounts).
+ */
+int ns_open_mountpoint(void *arg)
 {
-	struct mount_info *c;
-	int fd = -1, ns_old = -1;
-	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
-	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
-	char *mnt_path = mnt_path_tmp;
-	int cwd_fd;
+	struct clone_arg *ca = arg;
+	struct mount_info *mi = ca->mi;
+	int *fd = ca->fd;
 
 	/*
-	 * If a mount doesn't have children, we can open a mount point,
-	 * otherwise we need to create a "private" copy.
+	 * We should enter user namespace owning mount namespace of our mount
+	 * before creating helper mount namespace. Else all mounts in helper
+	 * mount namespace will be locked (MNT_LOCKED) and we won't be able to
+	 * unmount them (see CL_UNPRIVILEGED in sys_umount(), clone_mnt() and
+	 * copy_mnt_ns() in linux kernel code).
 	 */
-	if (list_empty(&pm->children))
-		return __open_mountpoint(pm, -1);
-
-	pr_info("Something is mounted on top of %s\n", pm->mountpoint);
+	if (mi->nsid->user_ns &&
+	    switch_ns(mi->nsid->user_ns->ns_pid, &user_ns_desc, NULL) < 0)
+		goto err;
 
-	list_for_each_entry(c, &pm->children, siblings) {
-		if (!strcmp(c->mountpoint, pm->mountpoint)) {
-			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
-			return MNT_UNREACHABLE;
-		}
+	/*
+	 * Create a helper mount namespace in which we can safely do unmounts
+	 * without breaking dumping process' environment.
+	 */
+	if (unshare(CLONE_NEWNS)) {
+		pr_perror("Unable to unshare a mount namespace");
+		goto err;
 	}
 
+	/* Remount all mounts as private to disable propagation */
+	if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
+		goto err;
+
+	if (umount_overmounts(mi))
+		goto err;
+
+	/* Save fd which we opened for parent due to CLONE_FILES flag */
+	*fd = get_clean_fd(mi);
+	if (*fd < 0)
+		goto err;
+
+	return 0;
+err:
+	return 1;
+}
+
+int open_mountpoint(struct mount_info *pm)
+{
+	int fd, cwd_fd, ns_old = -1;
+
+	/* No overmounts and children - the entire mount is visible */
+	if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
+		return __open_mountpoint(pm, -1);
+
+	pr_info("Mount is not fully visible %s\n", pm->mountpoint);
+
 	/*
-	 * To create a "private" copy, the target mount is bind-mounted
-	 * in a temporary place w/o MS_REC (non-recursively).
-	 * A mount point can't be bind-mounted in criu's namespace, it will be
-	 * mounted in a target namespace. The sequence of actions is
-	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
+	 * We do two things below:
+	 * a) If mount has children mounts in it which partially cover it's
+	 * content, to get access to the content we create a "private" copy of
+	 * such a mount, bind-mounting mount w/o MS_REC in a temporary place.
+	 * b) If mount is overmounted we create a private copy of it's mount
+	 * namespace so that we can safely get rid of overmounts and get an
+	 * access to the mount.
+	 * In both cases we can't do the thing from criu's mount namespace, so
+	 * we need to switch to mount's mount namespace, and later switch back.
 	 */
-
 	cwd_fd = open(".", O_DIRECTORY);
 	if (cwd_fd < 0) {
 		pr_perror("Unable to open cwd");
@@ -1275,33 +1328,54 @@ int open_mountpoint(struct mount_info *pm)
 	}
 
 	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
-		goto out;
+		goto err;
+
+	if (!mnt_is_overmounted(pm)) {
+		pr_info("\tmount has children %s\n", pm->mountpoint);
+
+		fd = get_clean_fd(pm);
+		if (fd < 0)
+			goto err;
+	} else {
+		int pid, status;
+		struct clone_arg ca = {
+			.mi = pm,
+			.fd = &fd
+		};
+
+		pr_info("\tmount is overmounted %s\n", pm->mountpoint);
 
-	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
-	if (mnt_path == NULL) {
 		/*
-		 * We probably can't create a temporary direcotry,
-		 * so we can try to clone the mount namespace, open
-		 * the required mount and destroy this mount namespace
-		 * by calling restore_ns() below in this function.
+		 * We are overmounted - not accessible in a regular way. We
+		 * need to clone "private" copy of mount's mount namespace and
+		 * unmount all covering overmounts in it. We also need to enter
+		 * user namespace owning these mount namespace just before that
+		 * (see explanation in ns_open_mountpoint). Thus we also have
+		 * to create helper process here as entering user namespace is
+		 * irreversible operation.
 		 */
-		if (unshare(CLONE_NEWNS)) {
-			pr_perror("Unable to clone a mount namespace");
-			goto out;
+		pid = clone_noasan(ns_open_mountpoint, CLONE_VFORK | CLONE_VM
+				| CLONE_FILES | CLONE_IO | CLONE_SIGHAND
+				| CLONE_SYSVSEM, &ca);
+		if (pid == -1) {
+			pr_perror("Can't clone helper process");
+			return -1;
 		}
 
-		fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0);
-		if (fd < 0)
-			pr_perror("Can't open directory %s: %d", pm->mountpoint, fd);
-	} else
-		fd = open_detach_mount(mnt_path);
-	if (fd < 0)
-		goto out;
+		errno = 0;
+		if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
+				|| WEXITSTATUS(status)) {
+			pr_err("Can't wait or bad status: errno=%d, status=%d\n",
+				errno, status);
+			return -1;
+		}
+	}
 
 	if (restore_ns(ns_old, &mnt_ns_desc)) {
 		ns_old = -1;
-		goto out;
+		goto err;
 	}
+
 	if (fchdir(cwd_fd)) {
 		pr_perror("Unable to restore cwd");
 		close(cwd_fd);
@@ -1311,9 +1385,9 @@ int open_mountpoint(struct mount_info *pm)
 	close(cwd_fd);
 
 	return __open_mountpoint(pm, fd);
-out:
+err:
 	if (ns_old >= 0)
-		 restore_ns(ns_old, &mnt_ns_desc);
+		restore_ns(ns_old, &mnt_ns_desc);
 	close_safe(&fd);
 	if (fchdir(cwd_fd))
 		pr_perror("Unable to restore cwd");
-- 
2.14.3



More information about the CRIU mailing list