[CRIU] [PATCH v4 3/7] mount: remount ro mounts writable before ghost-file restore
Andrei Vagin
avagin at gmail.com
Sun Jan 13 10:12:01 MSK 2019
On Thu, Dec 13, 2018 at 12:02:52PM +0300, Pavel Tikhomirov wrote:
> We can have ghost-files on readonly mounts, for them we will need to
> recreate the file on restore, and we can't do that if mount is readonly,
> so the idea is to remount the mount we want to operate on to be writable,
> and later after all ghost-files restored return mounts to their proper
> state if needed.
>
> There are three exceptions, where we don't remount:
> a) Overmounted mounts can't be easily remounted writable, as their
> mountpoints are invisible for us.
> b) If the mount has readonly superblock - there can be no ghost-files on
> such a mount.
> c) When we are in host mntns, we should not remount mounts in it, else
> if we face errors in between we'll forget to remount back.
>
> We have 3 places where we need to add these remount:
> 1) create_ghost()
> 2) clean_one_remap()
> 3) rfi_remap()
>
> For (1) and (2) we can just remount the mount writable without
> remounting it back as they are called in service mntns (the one we save
> in mnt_ns_fd), which will be destroyed with all it's mounts at the end.
> We mark such mounts as remounted in service mntns - REMOUNTED_RW_SERVICE.
>
> For (3) we need to remount these mounts back to readonly so we mark them
> with REMOUNTED_RW and later in remount_readonly_mounts all such mounts
> are re-remounted back.
>
> For (3) we also need to enter proper mntns of tmi before remounting.
>
> These solution v3 is better than v2 as for v2 we added additional
> remount for all bind-readonly mounts, now we do remounts only for
> those having ghost-files restore operations on them. These should be
> quiet a rare thing, so ~3 remounts added for each suitable mount is a
> relatively small price.
>
> note: Also I thought and tried to implement the complete remove of the
> step of remounting back to readonly, but it requires quiet a tricky
> playing with usernsd and only removes one remount (of ~3) for already a
> rare case so I don't thing it worth the effort.
>
> v2: minor commit message cleanup and remove warn
> v4: don't delay, only remount the mounts we explicitly want to write to
> just before operating, rename patch accordingly, reuse
> do_restore_task_mnt_ns, optimize inefficient ns_remount_readonly_mounts,
> and also add another exception.
>
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
> criu/cr-restore.c | 3 +
> criu/files-reg.c | 25 +++++++-
> criu/include/mount.h | 16 +++++
> criu/mount.c | 135 +++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 176 insertions(+), 3 deletions(-)
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 6086d6523..4b3c828bf 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -3263,6 +3263,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> /* Wait when all tasks restored all files */
> if (restore_wait_other_tasks())
> goto err_nv;
> + if (root_ns_mask & CLONE_NEWNS &&
> + remount_readonly_mounts())
> + goto err_nv;
> }
>
> /*
> diff --git a/criu/files-reg.c b/criu/files-reg.c
> index 3bc23049a..b44581638 100644
> --- a/criu/files-reg.c
> +++ b/criu/files-reg.c
> @@ -12,6 +12,7 @@
> #include <sys/sendfile.h>
> #include <sched.h>
> #include <sys/capability.h>
> +#include <sys/mount.h>
>
> #ifndef SEEK_DATA
> #define SEEK_DATA 3
> @@ -354,6 +355,7 @@ static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe)
>
> static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
> {
> + struct mount_info *mi;
> char path[PATH_MAX];
> int ret, root_len;
> char *msg;
> @@ -372,6 +374,11 @@ static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_im
>
> snprintf(path + root_len, sizeof(path) - root_len, "%s", gf->remap.rpath);
> ret = -1;
> +
> + mi = lookup_mnt_id(gf->remap.rmnt_id);
> + /* We get here while in service mntns */
> + if (mi && try_remount_writable(mi, false))
> + goto err;
> again:
> if (S_ISFIFO(gfe->mode)) {
> if ((ret = mknod(path, gfe->mode, 0)) < 0)
> @@ -696,9 +703,10 @@ int prepare_remaps(void)
>
> static int clean_one_remap(struct remap_info *ri)
> {
> - char path[PATH_MAX];
> - int mnt_id, ret, rmntns_root;
> struct file_remap *remap = ri->rfi->remap;
> + int mnt_id, ret, rmntns_root;
> + struct mount_info *mi;
> + char path[PATH_MAX];
>
> if (remap->rpath[0] == 0)
> return 0;
> @@ -718,6 +726,13 @@ static int clean_one_remap(struct remap_info *ri)
> return -1;
> }
>
> + mi = lookup_mnt_id(mnt_id);
> + /* We get here while in service mntns */
> + if (mi && try_remount_writable(mi, false)) {
> + close(rmntns_root);
> + return -1;
> + }
> +
> pr_info("Unlink remap %s\n", remap->rpath);
>
> ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0);
> @@ -1598,9 +1613,13 @@ static int rfi_remap(struct reg_file_info *rfi, int *level)
> convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi);
>
> out:
> - pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
> mntns_root = mntns_get_root_fd(tmi->nsid);
>
> + /* We get here while in task's mntns */
> + if (try_remount_writable(tmi, true))
> + return -1;
> +
> + pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
> out_root:
> *level = make_parent_dirs_if_need(mntns_root, path);
> if (*level < 0)
> diff --git a/criu/include/mount.h b/criu/include/mount.h
> index e7d026264..843f3c438 100644
> --- a/criu/include/mount.h
> +++ b/criu/include/mount.h
> @@ -14,6 +14,18 @@ struct ns_id;
>
> #define MNT_UNREACHABLE INT_MIN
>
> +/*
> + * We have remounted these mount writable temporary, and we
> + * should return it back to readonly at the end of file restore.
> + */
> +#define REMOUNTED_RW 1
> +/*
> + * We have remounted these mount writable in service mount namespace,
> + * thus we shouldn't return it back to readonly, as service mntns
> + * will be destroyed anyway.
> + */
> +#define REMOUNTED_RW_SERVICE 2
> +
> struct mount_info {
> int mnt_id;
> int parent_mnt_id;
> @@ -71,6 +83,7 @@ struct mount_info {
> struct list_head postpone;
>
> int is_overmounted;
> + int remounted_rw;
>
> void *private; /* associated filesystem data */
> };
> @@ -128,4 +141,7 @@ extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool fo
>
> extern int check_mnt_id(void);
>
> +extern int remount_readonly_mounts(void);
> +extern int try_remount_writable(struct mount_info *mi, bool ns);
> +
> #endif /* __CR_MOUNT_H__ */
> diff --git a/criu/mount.c b/criu/mount.c
> index 220340335..5801f64a8 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -3682,3 +3682,138 @@ void clean_cr_time_mounts(void)
> }
>
> struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
> +
> +static int call_helper_process(int (*call)(void *), void *arg)
> +{
> + int pid, status;
> +
> + pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES |
> + CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg);
> + if (pid == -1) {
> + pr_perror("Can't clone helper process");
> + return -1;
> + }
> +
> + errno = 0;
> + if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
> + || WEXITSTATUS(status)) {
> + pr_err("Can't wait or bad status: errno=%d, status=%d\n",
> + errno, status);
> + return -1;
> + }
Can we split this code on two if-s:
if (waitpid(pid, &status, __WALL) != pid) {
....
}
if (status) {
....
}
> +
> + return 0;
> +}
> +
> +static int ns_remount_writable(void *arg)
> +{
> + struct mount_info *mi = (struct mount_info *)arg;
> + struct ns_id *ns = mi->nsid;
> +
> + if (do_restore_task_mnt_ns(ns))
> + return 1;
> + pr_info("Switched to mntns %u:%u/n", ns->id, ns->kid);
pr_debug?
> +
> + if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND, NULL) == -1)
we need to print a error message here
> + return 1;
> + return 0;
> +}
> +
> +int try_remount_writable(struct mount_info *mi, bool ns) {
The brace should be on the next line ^^^^
> + int remounted = REMOUNTED_RW;
> +
> + /* Don't remount if we are in host mntns to be on the safe side */
> + if (!(root_ns_mask & CLONE_NEWNS))
> + return 0;
> +
> + if (!ns)
> + remounted = REMOUNTED_RW_SERVICE;
> +
> + if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) {
> + if (mnt_is_overmounted(mi)) {
> + pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id);
> + return -1;
> + }
> +
> + /* There should be no ghost files on mounts with ro sb */
> + if (mi->sb_flags & MS_RDONLY) {
> + pr_err("The mount %d has readonly sb\n", mi->mnt_id);
> + return -1;
> + }
> +
> + pr_info("Remount %d:%s writable\n", mi->mnt_id, mi->mountpoint);
> + if (!ns) {
> + if (mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND, NULL) == -1)
> + goto err;
It will drop all mi->flags... Are you sure that it is always good? I
don't know, but this looks suspicious.
> + } else {
> + if (call_helper_process(ns_remount_writable, mi))
> + goto err;
> + }
> + mi->remounted_rw |= remounted;
> + }
> +
> + return 0;
> +err:
> + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint);
does call_helper_process return a right errno?
> + return -1;
> +}
> +
> +static int __remount_readonly_mounts(struct ns_id *ns)
> +{
> + struct mount_info *mi;
> + bool mntns_set = false;
> +
> + for (mi = mntinfo; mi; mi = mi->next) {
> + if (ns && mi->nsid != ns)
> + continue;
> +
> + if (!(mi->remounted_rw && REMOUNTED_RW))
> + continue;
> +
> + /*
> + * Lets enter the mount namespace lazily, only if we've found the
> + * mount which should be remounted readonly. These saves us
> + * from entering mntns if we have no mounts to remount in it.
> + */
> + if (ns && !mntns_set) {
> + if (do_restore_task_mnt_ns(ns))
> + return -1;
> + mntns_set = true;
> + pr_info("Switched to mntns %u:%u/n", ns->id, ns->kid);
> + }
> +
> + if (mount(NULL, mi->ns_mountpoint, NULL,
> + MS_REMOUNT | MS_BIND | (mi->flags & (~MS_PROPAGATE)),
> + NULL)) {
> + pr_perror("Failed to restore %d:%s mount flags %x",
> + mi->mnt_id, mi->mountpoint, mi->flags);
> + return -1;
> + }
> + }
> +
> + return 0;
> +}
> +
> +static int ns_remount_readonly_mounts(void *arg)
> +{
> + struct ns_id *nsid;
> +
> + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
> + if (nsid->nd != &mnt_ns_desc)
> + continue;
> +
> + if (__remount_readonly_mounts(nsid))
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> +int remount_readonly_mounts(void)
> +{
> + /*
> + * Need a helper process because the root task can share fs via
> + * CLONE_FS and we would not be able to enter mount namespaces
> + */
> + return call_helper_process(ns_remount_readonly_mounts, NULL);
> +}
> --
> 2.17.1
>
More information about the CRIU
mailing list