[CRIU] [PATCH v4 3/7] mount: remount ro mounts writable before ghost-file restore

Andrei Vagin avagin at gmail.com
Sun Jan 13 10:12:01 MSK 2019


On Thu, Dec 13, 2018 at 12:02:52PM +0300, Pavel Tikhomirov wrote:
> We can have ghost-files on readonly mounts, for them we will need to
> recreate the file on restore, and we can't do that if mount is readonly,
> so the idea is to remount the mount we want to operate on to be writable,
> and later after all ghost-files restored return mounts to their proper
> state if needed.
> 
> There are three exceptions, where we don't remount:
> a) Overmounted mounts can't be easily remounted writable, as their
> mountpoints are invisible for us.
> b) If the mount has readonly superblock - there can be no ghost-files on
> such a mount.
> c) When we are in host mntns, we should not remount mounts in it, else
> if we face errors in between we'll forget to remount back.
> 
> We have 3 places where we need to add these remount:
> 1) create_ghost()
> 2) clean_one_remap()
> 3) rfi_remap()
> 
> For (1) and (2) we can just remount the mount writable without
> remounting it back as they are called in service mntns (the one we save
> in mnt_ns_fd), which will be destroyed with all it's mounts at the end.
> We mark such mounts as remounted in service mntns - REMOUNTED_RW_SERVICE.
> 
> For (3) we need to remount these mounts back to readonly so we mark them
> with REMOUNTED_RW and later in remount_readonly_mounts all such mounts
> are re-remounted back.
> 
> For (3) we also need to enter proper mntns of tmi before remounting.
> 
> These solution v3 is better than v2 as for v2 we added additional
> remount for all bind-readonly mounts, now we do remounts only for
> those having ghost-files restore operations on them. These should be
> quiet a rare thing, so ~3 remounts added for each suitable mount is a
> relatively small price.
> 
> note: Also I thought and tried to implement the complete remove of the
> step of remounting back to readonly, but it requires quiet a tricky
> playing with usernsd and only removes one remount (of ~3) for already a
> rare case so I don't thing it worth the effort.
> 
> v2: minor commit message cleanup and remove warn
> v4: don't delay, only remount the mounts we explicitly want to write to
> just before operating, rename patch accordingly, reuse
> do_restore_task_mnt_ns, optimize inefficient ns_remount_readonly_mounts,
> and also add another exception.
> 
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> ---
>  criu/cr-restore.c    |   3 +
>  criu/files-reg.c     |  25 +++++++-
>  criu/include/mount.h |  16 +++++
>  criu/mount.c         | 135 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 176 insertions(+), 3 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 6086d6523..4b3c828bf 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -3263,6 +3263,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  		/* Wait when all tasks restored all files */
>  		if (restore_wait_other_tasks())
>  			goto err_nv;
> +		if (root_ns_mask & CLONE_NEWNS &&
> +		    remount_readonly_mounts())
> +			goto err_nv;
>  	}
>  
>  	/*
> diff --git a/criu/files-reg.c b/criu/files-reg.c
> index 3bc23049a..b44581638 100644
> --- a/criu/files-reg.c
> +++ b/criu/files-reg.c
> @@ -12,6 +12,7 @@
>  #include <sys/sendfile.h>
>  #include <sched.h>
>  #include <sys/capability.h>
> +#include <sys/mount.h>
>  
>  #ifndef SEEK_DATA
>  #define SEEK_DATA	3
> @@ -354,6 +355,7 @@ static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe)
>  
>  static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
>  {
> +	struct mount_info *mi;
>  	char path[PATH_MAX];
>  	int ret, root_len;
>  	char *msg;
> @@ -372,6 +374,11 @@ static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_im
>  
>  	snprintf(path + root_len, sizeof(path) - root_len, "%s", gf->remap.rpath);
>  	ret = -1;
> +
> +	mi = lookup_mnt_id(gf->remap.rmnt_id);
> +	/* We get here while in service mntns */
> +	if (mi && try_remount_writable(mi, false))
> +		goto err;
>  again:
>  	if (S_ISFIFO(gfe->mode)) {
>  		if ((ret = mknod(path, gfe->mode, 0)) < 0)
> @@ -696,9 +703,10 @@ int prepare_remaps(void)
>  
>  static int clean_one_remap(struct remap_info *ri)
>  {
> -	char path[PATH_MAX];
> -	int mnt_id, ret, rmntns_root;
>  	struct file_remap *remap = ri->rfi->remap;
> +	int mnt_id, ret, rmntns_root;
> +	struct mount_info *mi;
> +	char path[PATH_MAX];
>  
>  	if (remap->rpath[0] == 0)
>  		return 0;
> @@ -718,6 +726,13 @@ static int clean_one_remap(struct remap_info *ri)
>  		return -1;
>  	}
>  
> +	mi = lookup_mnt_id(mnt_id);
> +	/* We get here while in service mntns */
> +	if (mi && try_remount_writable(mi, false)) {
> +		close(rmntns_root);
> +		return -1;
> +	}
> +
>  	pr_info("Unlink remap %s\n", remap->rpath);
>  
>  	ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0);
> @@ -1598,9 +1613,13 @@ static int rfi_remap(struct reg_file_info *rfi, int *level)
>  	convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi);
>  
>  out:
> -	pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
>  	mntns_root = mntns_get_root_fd(tmi->nsid);
>  
> +	/* We get here while in task's mntns */
> +	if (try_remount_writable(tmi, true))
> +		return -1;
> +
> +	pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
>  out_root:
>  	*level = make_parent_dirs_if_need(mntns_root, path);
>  	if (*level < 0)
> diff --git a/criu/include/mount.h b/criu/include/mount.h
> index e7d026264..843f3c438 100644
> --- a/criu/include/mount.h
> +++ b/criu/include/mount.h
> @@ -14,6 +14,18 @@ struct ns_id;
>  
>  #define MNT_UNREACHABLE INT_MIN
>  
> +/*
> + * We have remounted these mount writable temporary, and we
> + * should return it back to readonly at the end of file restore.
> + */
> +#define REMOUNTED_RW 1
> +/*
> + * We have remounted these mount writable in service mount namespace,
> + * thus we shouldn't return it back to readonly, as service mntns
> + * will be destroyed anyway.
> + */
> +#define REMOUNTED_RW_SERVICE 2
> +
>  struct mount_info {
>  	int			mnt_id;
>  	int			parent_mnt_id;
> @@ -71,6 +83,7 @@ struct mount_info {
>  	struct list_head	postpone;
>  
>  	int			is_overmounted;
> +	int			remounted_rw;
>  
>  	void			*private;	/* associated filesystem data */
>  };
> @@ -128,4 +141,7 @@ extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool fo
>  
>  extern int check_mnt_id(void);
>  
> +extern int remount_readonly_mounts(void);
> +extern int try_remount_writable(struct mount_info *mi, bool ns);
> +
>  #endif /* __CR_MOUNT_H__ */
> diff --git a/criu/mount.c b/criu/mount.c
> index 220340335..5801f64a8 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -3682,3 +3682,138 @@ void clean_cr_time_mounts(void)
>  }
>  
>  struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
> +
> +static int call_helper_process(int (*call)(void *), void *arg)
> +{
> +	int pid, status;
> +
> +	pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES |
> +			   CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg);
> +	if (pid == -1) {
> +		pr_perror("Can't clone helper process");
> +		return -1;
> +	}
> +
> +	errno = 0;
> +	if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
> +	    || WEXITSTATUS(status)) {
> +		pr_err("Can't wait or bad status: errno=%d, status=%d\n",
> +		       errno, status);
> +		return -1;
> +	}

Can we split this code on two if-s:

	if (waitpid(pid, &status, __WALL) != pid) {
		....
	}

	if (status) {
		....
	}

> +
> +	return 0;
> +}
> +
> +static int ns_remount_writable(void *arg)
> +{
> +	struct mount_info *mi = (struct mount_info *)arg;
> +	struct ns_id *ns = mi->nsid;
> +
> +	if (do_restore_task_mnt_ns(ns))
> +		return 1;
> +	pr_info("Switched to mntns %u:%u/n", ns->id, ns->kid);

pr_debug?

> +
> +	if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND, NULL) == -1)

we need to print a error message here

> +		return 1;
> +	return 0;
> +}
> +
> +int try_remount_writable(struct mount_info *mi, bool ns) {

The brace should be on the next line                       ^^^^

> +	int remounted = REMOUNTED_RW;
> +
> +	/* Don't remount if we are in host mntns to be on the safe side */
> +	if (!(root_ns_mask & CLONE_NEWNS))
> +		return 0;
> +
> +	if (!ns)
> +		remounted = REMOUNTED_RW_SERVICE;
> +
> +	if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) {
> +		if (mnt_is_overmounted(mi)) {
> +			pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id);
> +			return -1;
> +		}
> +
> +		/* There should be no ghost files on mounts with ro sb */
> +		if (mi->sb_flags & MS_RDONLY) {
> +			pr_err("The mount %d has readonly sb\n", mi->mnt_id);
> +			return -1;
> +		}
> +
> +		pr_info("Remount %d:%s writable\n", mi->mnt_id, mi->mountpoint);
> +		if (!ns) {
> +			if (mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND, NULL) == -1)
> +				goto err;

It will drop all mi->flags... Are you sure that it is always good? I
don't know, but this looks suspicious.

> +		} else {
> +			if (call_helper_process(ns_remount_writable, mi))
> +				goto err;
> +		}
> +		mi->remounted_rw |= remounted;
> +	}
> +
> +	return 0;
> +err:
> +	pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint);

does call_helper_process return a right errno?

> +	return -1;
> +}
> +
> +static int __remount_readonly_mounts(struct ns_id *ns)
> +{
> +	struct mount_info *mi;
> +	bool mntns_set = false;
> +
> +	for (mi = mntinfo; mi; mi = mi->next) {
> +		if (ns && mi->nsid != ns)
> +			continue;
> +
> +		if (!(mi->remounted_rw && REMOUNTED_RW))
> +			continue;
> +
> +		/*
> +		 * Lets enter the mount namespace lazily, only if we've found the
> +		 * mount which should be remounted readonly. These saves us
> +		 * from entering mntns if we have no mounts to remount in it.
> +		 */
> +		if (ns && !mntns_set) {
> +			if (do_restore_task_mnt_ns(ns))
> +				return -1;
> +			mntns_set = true;
> +			pr_info("Switched to mntns %u:%u/n", ns->id, ns->kid);
> +		}
> +
> +		if (mount(NULL, mi->ns_mountpoint, NULL,
> +			  MS_REMOUNT | MS_BIND | (mi->flags & (~MS_PROPAGATE)),
> +			  NULL)) {
> +			pr_perror("Failed to restore %d:%s mount flags %x",
> +				  mi->mnt_id, mi->mountpoint, mi->flags);
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int ns_remount_readonly_mounts(void *arg)
> +{
> +	struct ns_id *nsid;
> +
> +	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
> +		if (nsid->nd != &mnt_ns_desc)
> +			continue;
> +
> +		if (__remount_readonly_mounts(nsid))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int remount_readonly_mounts(void)
> +{
> +	/*
> +	 * Need a helper process because the root task can share fs via
> +	 * CLONE_FS and we would not be able to enter mount namespaces
> +	 */
> +	return call_helper_process(ns_remount_readonly_mounts, NULL);
> +}
> -- 
> 2.17.1
> 


More information about the CRIU mailing list