[CRIU] [PATCH v4 3/7] mount: remount ro mounts writable before ghost-file restore

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Thu Dec 13 12:02:52 MSK 2018


We can have ghost-files on readonly mounts, for them we will need to
recreate the file on restore, and we can't do that if mount is readonly,
so the idea is to remount the mount we want to operate on to be writable,
and later after all ghost-files restored return mounts to their proper
state if needed.

There are three exceptions, where we don't remount:
a) Overmounted mounts can't be easily remounted writable, as their
mountpoints are invisible for us.
b) If the mount has readonly superblock - there can be no ghost-files on
such a mount.
c) When we are in host mntns, we should not remount mounts in it, else
if we face errors in between we'll forget to remount back.

We have 3 places where we need to add these remount:
1) create_ghost()
2) clean_one_remap()
3) rfi_remap()

For (1) and (2) we can just remount the mount writable without
remounting it back as they are called in service mntns (the one we save
in mnt_ns_fd), which will be destroyed with all it's mounts at the end.
We mark such mounts as remounted in service mntns - REMOUNTED_RW_SERVICE.

For (3) we need to remount these mounts back to readonly so we mark them
with REMOUNTED_RW and later in remount_readonly_mounts all such mounts
are re-remounted back.

For (3) we also need to enter proper mntns of tmi before remounting.

These solution v3 is better than v2 as for v2 we added additional
remount for all bind-readonly mounts, now we do remounts only for
those having ghost-files restore operations on them. These should be
quiet a rare thing, so ~3 remounts added for each suitable mount is a
relatively small price.

note: Also I thought and tried to implement the complete remove of the
step of remounting back to readonly, but it requires quiet a tricky
playing with usernsd and only removes one remount (of ~3) for already a
rare case so I don't thing it worth the effort.

v2: minor commit message cleanup and remove warn
v4: don't delay, only remount the mounts we explicitly want to write to
just before operating, rename patch accordingly, reuse
do_restore_task_mnt_ns, optimize inefficient ns_remount_readonly_mounts,
and also add another exception.

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 criu/cr-restore.c    |   3 +
 criu/files-reg.c     |  25 +++++++-
 criu/include/mount.h |  16 +++++
 criu/mount.c         | 135 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 176 insertions(+), 3 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 6086d6523..4b3c828bf 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -3263,6 +3263,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 		/* Wait when all tasks restored all files */
 		if (restore_wait_other_tasks())
 			goto err_nv;
+		if (root_ns_mask & CLONE_NEWNS &&
+		    remount_readonly_mounts())
+			goto err_nv;
 	}
 
 	/*
diff --git a/criu/files-reg.c b/criu/files-reg.c
index 3bc23049a..b44581638 100644
--- a/criu/files-reg.c
+++ b/criu/files-reg.c
@@ -12,6 +12,7 @@
 #include <sys/sendfile.h>
 #include <sched.h>
 #include <sys/capability.h>
+#include <sys/mount.h>
 
 #ifndef SEEK_DATA
 #define SEEK_DATA	3
@@ -354,6 +355,7 @@ static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe)
 
 static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
 {
+	struct mount_info *mi;
 	char path[PATH_MAX];
 	int ret, root_len;
 	char *msg;
@@ -372,6 +374,11 @@ static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_im
 
 	snprintf(path + root_len, sizeof(path) - root_len, "%s", gf->remap.rpath);
 	ret = -1;
+
+	mi = lookup_mnt_id(gf->remap.rmnt_id);
+	/* We get here while in service mntns */
+	if (mi && try_remount_writable(mi, false))
+		goto err;
 again:
 	if (S_ISFIFO(gfe->mode)) {
 		if ((ret = mknod(path, gfe->mode, 0)) < 0)
@@ -696,9 +703,10 @@ int prepare_remaps(void)
 
 static int clean_one_remap(struct remap_info *ri)
 {
-	char path[PATH_MAX];
-	int mnt_id, ret, rmntns_root;
 	struct file_remap *remap = ri->rfi->remap;
+	int mnt_id, ret, rmntns_root;
+	struct mount_info *mi;
+	char path[PATH_MAX];
 
 	if (remap->rpath[0] == 0)
 		return 0;
@@ -718,6 +726,13 @@ static int clean_one_remap(struct remap_info *ri)
 		return -1;
 	}
 
+	mi = lookup_mnt_id(mnt_id);
+	/* We get here while in service mntns */
+	if (mi && try_remount_writable(mi, false)) {
+		close(rmntns_root);
+		return -1;
+	}
+
 	pr_info("Unlink remap %s\n", remap->rpath);
 
 	ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0);
@@ -1598,9 +1613,13 @@ static int rfi_remap(struct reg_file_info *rfi, int *level)
 	convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi);
 
 out:
-	pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
 	mntns_root = mntns_get_root_fd(tmi->nsid);
 
+	/* We get here while in task's mntns */
+	if (try_remount_writable(tmi, true))
+		return -1;
+
+	pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
 out_root:
 	*level = make_parent_dirs_if_need(mntns_root, path);
 	if (*level < 0)
diff --git a/criu/include/mount.h b/criu/include/mount.h
index e7d026264..843f3c438 100644
--- a/criu/include/mount.h
+++ b/criu/include/mount.h
@@ -14,6 +14,18 @@ struct ns_id;
 
 #define MNT_UNREACHABLE INT_MIN
 
+/*
+ * We have remounted these mount writable temporary, and we
+ * should return it back to readonly at the end of file restore.
+ */
+#define REMOUNTED_RW 1
+/*
+ * We have remounted these mount writable in service mount namespace,
+ * thus we shouldn't return it back to readonly, as service mntns
+ * will be destroyed anyway.
+ */
+#define REMOUNTED_RW_SERVICE 2
+
 struct mount_info {
 	int			mnt_id;
 	int			parent_mnt_id;
@@ -71,6 +83,7 @@ struct mount_info {
 	struct list_head	postpone;
 
 	int			is_overmounted;
+	int			remounted_rw;
 
 	void			*private;	/* associated filesystem data */
 };
@@ -128,4 +141,7 @@ extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool fo
 
 extern int check_mnt_id(void);
 
+extern int remount_readonly_mounts(void);
+extern int try_remount_writable(struct mount_info *mi, bool ns);
+
 #endif /* __CR_MOUNT_H__ */
diff --git a/criu/mount.c b/criu/mount.c
index 220340335..5801f64a8 100644
--- a/criu/mount.c
+++ b/criu/mount.c
@@ -3682,3 +3682,138 @@ void clean_cr_time_mounts(void)
 }
 
 struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
+
+static int call_helper_process(int (*call)(void *), void *arg)
+{
+	int pid, status;
+
+	pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES |
+			   CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg);
+	if (pid == -1) {
+		pr_perror("Can't clone helper process");
+		return -1;
+	}
+
+	errno = 0;
+	if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
+	    || WEXITSTATUS(status)) {
+		pr_err("Can't wait or bad status: errno=%d, status=%d\n",
+		       errno, status);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int ns_remount_writable(void *arg)
+{
+	struct mount_info *mi = (struct mount_info *)arg;
+	struct ns_id *ns = mi->nsid;
+
+	if (do_restore_task_mnt_ns(ns))
+		return 1;
+	pr_info("Switched to mntns %u:%u/n", ns->id, ns->kid);
+
+	if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND, NULL) == -1)
+		return 1;
+	return 0;
+}
+
+int try_remount_writable(struct mount_info *mi, bool ns) {
+	int remounted = REMOUNTED_RW;
+
+	/* Don't remount if we are in host mntns to be on the safe side */
+	if (!(root_ns_mask & CLONE_NEWNS))
+		return 0;
+
+	if (!ns)
+		remounted = REMOUNTED_RW_SERVICE;
+
+	if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) {
+		if (mnt_is_overmounted(mi)) {
+			pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id);
+			return -1;
+		}
+
+		/* There should be no ghost files on mounts with ro sb */
+		if (mi->sb_flags & MS_RDONLY) {
+			pr_err("The mount %d has readonly sb\n", mi->mnt_id);
+			return -1;
+		}
+
+		pr_info("Remount %d:%s writable\n", mi->mnt_id, mi->mountpoint);
+		if (!ns) {
+			if (mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND, NULL) == -1)
+				goto err;
+		} else {
+			if (call_helper_process(ns_remount_writable, mi))
+				goto err;
+		}
+		mi->remounted_rw |= remounted;
+	}
+
+	return 0;
+err:
+	pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint);
+	return -1;
+}
+
+static int __remount_readonly_mounts(struct ns_id *ns)
+{
+	struct mount_info *mi;
+	bool mntns_set = false;
+
+	for (mi = mntinfo; mi; mi = mi->next) {
+		if (ns && mi->nsid != ns)
+			continue;
+
+		if (!(mi->remounted_rw && REMOUNTED_RW))
+			continue;
+
+		/*
+		 * Lets enter the mount namespace lazily, only if we've found the
+		 * mount which should be remounted readonly. These saves us
+		 * from entering mntns if we have no mounts to remount in it.
+		 */
+		if (ns && !mntns_set) {
+			if (do_restore_task_mnt_ns(ns))
+				return -1;
+			mntns_set = true;
+			pr_info("Switched to mntns %u:%u/n", ns->id, ns->kid);
+		}
+
+		if (mount(NULL, mi->ns_mountpoint, NULL,
+			  MS_REMOUNT | MS_BIND | (mi->flags & (~MS_PROPAGATE)),
+			  NULL)) {
+			pr_perror("Failed to restore %d:%s mount flags %x",
+				  mi->mnt_id, mi->mountpoint, mi->flags);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int ns_remount_readonly_mounts(void *arg)
+{
+	struct ns_id *nsid;
+
+	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
+		if (nsid->nd != &mnt_ns_desc)
+			continue;
+
+		if (__remount_readonly_mounts(nsid))
+			return 1;
+	}
+
+	return 0;
+}
+
+int remount_readonly_mounts(void)
+{
+	/*
+	 * Need a helper process because the root task can share fs via
+	 * CLONE_FS and we would not be able to enter mount namespaces
+	 */
+	return call_helper_process(ns_remount_readonly_mounts, NULL);
+}
-- 
2.17.1



More information about the CRIU mailing list