[CRIU] [PATCH 3/3] mount: open a root directory for mount namespaces via the root task

Pavel Emelyanov xemul at virtuozzo.com
Tue Aug 2 05:08:17 PDT 2016


On 07/27/2016 09:21 PM, Andrey Vagin wrote:
> From: Andrew Vagin <avagin at virtuozzo.com>
> 
> We use openat() syscall to open files in various mount namespaces.
> Recently we found that if a path contains absolute symlinks, openat()
> syscall resolves them relative to the current root, but they have to be
> resolved relative to the root of the target namespace, so we need to
> change root before restoring a file descriptor.

OK, what's the exact problem we're having now? Which openat() is called
on a path with a symlink?

> But for that we need to rework how we open the root directory
> for a specified mount namespace. Currently we open /proc/pid/root for
> one of the processes from this namespace. If that process changes its
> root directory, this approach will not work.
> 
> In this patch we open all namespaces in the root task and then any
> process will be able to open one of these descriptors via /proc/pid/fd.
> 
> Signed-off-by: Andrew Vagin <avagin at virtuozzo.com>
> ---
>  criu/cr-restore.c | 32 ++++++++++++++------------------
>  criu/mount.c      | 48 ++++++++++++++++++++++++++++++++----------------
>  criu/pstree.c     |  1 +
>  3 files changed, 47 insertions(+), 34 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 3437b76..1de2682 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1469,16 +1469,6 @@ static int restore_task_with_children(void *_arg)
>  
>  	restore_pgid();
>  
> -	if (current->parent == NULL) {
> -		/*
> -		 * Wait when all tasks passed the CR_STATE_FORKING stage.
> -		 * It means that all tasks entered into their namespaces.
> -		 */
> -		futex_wait_while_gt(&task_entries->nr_in_progress, 1);
> -
> -		fini_restore_mntns();
> -	}
> -
>  	if (restore_finish_stage(CR_STATE_FORKING) < 0)
>  		goto err;
>  
> @@ -2907,6 +2897,20 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  		goto err_nv;
>  
>  	/*
> +	 * Make root and cwd restore _that_ late not to break any
> +	 * attempts to open files by paths above (e.g. /proc).
> +	 */
> +	if (restore_fs(current))
> +		goto err;
> +
> +	if (current->parent == NULL) {
> +		/* Wait when all tasks restored all files */
> +		futex_wait_while_gt(&task_entries->nr_in_progress,
> +						current->nr_threads);
> +		fini_restore_mntns();
> +	}
> +
> +	/*
>  	 * We're about to search for free VM area and inject the restorer blob
>  	 * into it. No irrelevent mmaps/mremaps beyond this point, otherwise
>  	 * this unwanted mapping might get overlapped by the restorer.
> @@ -3151,14 +3155,6 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  	task_args->nr_threads		= current->nr_threads;
>  	task_args->thread_args		= thread_args;
>  
> -	/*
> -	 * Make root and cwd restore _that_ late not to break any
> -	 * attempts to open files by paths above (e.g. /proc).
> -	 */
> -
> -	if (restore_fs(current))
> -		goto err;
> -
>  	close_image_dir();
>  	close_proc();
>  	close_service_fd(ROOT_FD_OFF);
> diff --git a/criu/mount.c b/criu/mount.c
> index 7c280c0..5a167b1 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -3160,9 +3160,7 @@ void fini_restore_mntns(void)
>  		if (nsid->nd != &mnt_ns_desc)
>  			continue;
>  		close_safe(&nsid->mnt.ns_fd);
> -		if (nsid->type != NS_ROOT)
> -			close_safe(&nsid->mnt.root_fd);
> -		nsid->ns_populated = true;
> +		close_safe(&nsid->mnt.root_fd);
>  	}
>  }
>  
> @@ -3350,6 +3348,34 @@ void cleanup_mnt_ns(void)
>  		pr_perror("Can't remove the directory %s", mnt_roots);
>  }
>  
> +static int open_mnt_ns(struct ns_id *nsid, struct rst_info *rst)
> +{
> +	int fd, tfd;
> +
> +	/* Pin one with a file descriptor */
> +	fd = open_proc(PROC_SELF, "ns/mnt");
> +	if (fd < 0)
> +		return -1;
> +	tfd = reopen_as_unused_fd(fd, rst);
> +	if (tfd < 0) {
> +		close(fd);
> +		return -1;
> +	}
> +	nsid->mnt.ns_fd = tfd;
> +
> +	fd = open_proc(PROC_SELF, "root");
> +	if (fd < 0)
> +		return -1;
> +	tfd = reopen_as_unused_fd(fd, rst);
> +	if (tfd < 0) {
> +		close(fd);
> +		return -1;
> +	}
> +	nsid->mnt.root_fd = tfd;
> +
> +	return 0;
> +}
> +
>  int prepare_mnt_ns(void)
>  {
>  	int ret = -1, rst = -1;
> @@ -3483,12 +3509,8 @@ ns_created:
>  		if (nsid->nd != &mnt_ns_desc)
>  			continue;
>  		if (nsid->type == NS_ROOT) {
> -			/* Pin one with a file descriptor */
> -			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> -			if (nsid->mnt.ns_fd < 0)
> +			if (open_mnt_ns(nsid, rsti(root_item)))
>  				goto err;
> -			/* we set ns_populated so we don't need to open root_fd */
> -			nsid->ns_populated = true;
>  			continue;
>  		}
>  
> @@ -3504,14 +3526,7 @@ ns_created:
>  		if (cr_pivot_root(path))
>  			goto err;
>  
> -		/* Pin one with a file descriptor */
> -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> -		if (nsid->mnt.ns_fd < 0)
> -			goto err;
> -
> -		/* root_fd is used to restore file mappings */
> -		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
> -		if (nsid->mnt.root_fd < 0)
> +		if (open_mnt_ns(nsid, rsti(root_item)))
>  			goto err;
>  
>  		/* And return back to regain the access to the roots yard */
> @@ -3628,6 +3643,7 @@ int mntns_get_root_fd(struct ns_id *mntns)
>  		fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
>  		if (fd < 0)
>  			return -1;
> +		close_pid_proc();
>  
>  		return mntns_set_root_fd(mntns->ns_pid, fd);
>  	}
> diff --git a/criu/pstree.c b/criu/pstree.c
> index af89dbb..94b8021 100644
> --- a/criu/pstree.c
> +++ b/criu/pstree.c
> @@ -943,6 +943,7 @@ static int prepare_pstree_for_unshare(void)
>  		fake_root->threads->virt = INIT_PID;
>  		fake_root->ids = root_ids;
>  		rsti(fake_root)->clone_flags = opts.unshare_flags | rsti(root_item)->clone_flags;
> +		INIT_LIST_HEAD(&rsti(fake_root)->used);
>  
>  		rsti(fake_root)->helper_cb = do_fake_init;
>  
> 



More information about the CRIU mailing list