[CRIU] [PATCH 3/3] mount: open a root directory for mount namesapces via the root task

Andrew Vagin avagin at virtuozzo.com
Tue Aug 2 11:04:19 PDT 2016


On Tue, Aug 02, 2016 at 03:08:17PM +0300, Pavel Emelyanov wrote:
> On 07/27/2016 09:21 PM, Andrey Vagin wrote:
> > From: Andrew Vagin <avagin at virtuozzo.com>
> > 
> > We use openat() syscall to open files in various mount namespaces.
> > Recently we found that if a path contains absolute symlinks, openat()
> > syscall resolves them relative to the current root, but they has to be
> > resolved relative to the root of the target namespace, so we need to
> > change root before restoring a file descriprot.
> 
> OK, what's the exact problem we're having now? Which openat() is called
> on a path with symlink?

When we restore a file, we use openat() to open it:

fd = openat(ns_root_fd, rfi->path, flags);

but rfi->path can containe absolute symlinks and in this case, openat()
will be not enogh and we need to make chroot to resolve symlinks
correctly.

fchroot(ns_root_fd)
open(rfi->path, flags);


> 
> > But for that we need to rework a method how we open a root directory
> > for a specified mount namespace. Currently we open /proc/pid/root for
> > one of processes from this namespace. If a process will change a root
> > directory, this way will not work.
> > 
> > In this patch we open all namespaces in the root task and then any
> > process will be able to open one of these descriptors via /proc/pid/fd.
> > 
> > Signed-off-by: Andrew Vagin <avagin at virtuozzo.com>
> > ---
> >  criu/cr-restore.c | 32 ++++++++++++++------------------
> >  criu/mount.c      | 48 ++++++++++++++++++++++++++++++++----------------
> >  criu/pstree.c     |  1 +
> >  3 files changed, 47 insertions(+), 34 deletions(-)
> > 
> > diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> > index 3437b76..1de2682 100644
> > --- a/criu/cr-restore.c
> > +++ b/criu/cr-restore.c
> > @@ -1469,16 +1469,6 @@ static int restore_task_with_children(void *_arg)
> >  
> >  	restore_pgid();
> >  
> > -	if (current->parent == NULL) {
> > -		/*
> > -		 * Wait when all tasks passed the CR_STATE_FORKING stage.
> > -		 * It means that all tasks entered into their namespaces.
> > -		 */
> > -		futex_wait_while_gt(&task_entries->nr_in_progress, 1);
> > -
> > -		fini_restore_mntns();
> > -	}
> > -
> >  	if (restore_finish_stage(CR_STATE_FORKING) < 0)
> >  		goto err;
> >  
> > @@ -2907,6 +2897,20 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> >  		goto err_nv;
> >  
> >  	/*
> > +	 * Make root and cwd restore _that_ late not to break any
> > +	 * attempts to open files by paths above (e.g. /proc).
> > +	 */
> > +	if (restore_fs(current))
> > +		goto err;
> > +
> > +	if (current->parent == NULL) {
> > +		/* Wait when all tasks restored all files */
> > +		futex_wait_while_gt(&task_entries->nr_in_progress,
> > +						current->nr_threads);
> > +		fini_restore_mntns();
> > +	}
> > +
> > +	/*
> >  	 * We're about to search for free VM area and inject the restorer blob
> >  	 * into it. No irrelevent mmaps/mremaps beyond this point, otherwise
> >  	 * this unwanted mapping might get overlapped by the restorer.
> > @@ -3151,14 +3155,6 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> >  	task_args->nr_threads		= current->nr_threads;
> >  	task_args->thread_args		= thread_args;
> >  
> > -	/*
> > -	 * Make root and cwd restore _that_ late not to break any
> > -	 * attempts to open files by paths above (e.g. /proc).
> > -	 */
> > -
> > -	if (restore_fs(current))
> > -		goto err;
> > -
> >  	close_image_dir();
> >  	close_proc();
> >  	close_service_fd(ROOT_FD_OFF);
> > diff --git a/criu/mount.c b/criu/mount.c
> > index 7c280c0..5a167b1 100644
> > --- a/criu/mount.c
> > +++ b/criu/mount.c
> > @@ -3160,9 +3160,7 @@ void fini_restore_mntns(void)
> >  		if (nsid->nd != &mnt_ns_desc)
> >  			continue;
> >  		close_safe(&nsid->mnt.ns_fd);
> > -		if (nsid->type != NS_ROOT)
> > -			close_safe(&nsid->mnt.root_fd);
> > -		nsid->ns_populated = true;
> > +		close_safe(&nsid->mnt.root_fd);
> >  	}
> >  }
> >  
> > @@ -3350,6 +3348,34 @@ void cleanup_mnt_ns(void)
> >  		pr_perror("Can't remove the directory %s", mnt_roots);
> >  }
> >  
> > +static int open_mnt_ns(struct ns_id *nsid, struct rst_info *rst)
> > +{
> > +	int fd, tfd;
> > +
> > +	/* Pin one with a file descriptor */
> > +	fd = open_proc(PROC_SELF, "ns/mnt");
> > +	if (fd < 0)
> > +		return -1;
> > +	tfd = reopen_as_unused_fd(fd, rst);
> > +	if (tfd < 0) {
> > +		close(fd);
> > +		return -1;
> > +	}
> > +	nsid->mnt.ns_fd = tfd;
> > +
> > +	fd = open_proc(PROC_SELF, "root");
> > +	if (fd < 0)
> > +		return -1;
> > +	tfd = reopen_as_unused_fd(fd, rst);
> > +	if (tfd < 0) {
> > +		close(fd);
> > +		return -1;
> > +	}
> > +	nsid->mnt.root_fd = tfd;
> > +
> > +	return 0;
> > +}
> > +
> >  int prepare_mnt_ns(void)
> >  {
> >  	int ret = -1, rst = -1;
> > @@ -3483,12 +3509,8 @@ ns_created:
> >  		if (nsid->nd != &mnt_ns_desc)
> >  			continue;
> >  		if (nsid->type == NS_ROOT) {
> > -			/* Pin one with a file descriptor */
> > -			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> > -			if (nsid->mnt.ns_fd < 0)
> > +			if (open_mnt_ns(nsid, rsti(root_item)))
> >  				goto err;
> > -			/* we set ns_populated so we don't need to open root_fd */
> > -			nsid->ns_populated = true;
> >  			continue;
> >  		}
> >  
> > @@ -3504,14 +3526,7 @@ ns_created:
> >  		if (cr_pivot_root(path))
> >  			goto err;
> >  
> > -		/* Pin one with a file descriptor */
> > -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> > -		if (nsid->mnt.ns_fd < 0)
> > -			goto err;
> > -
> > -		/* root_fd is used to restore file mappings */
> > -		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
> > -		if (nsid->mnt.root_fd < 0)
> > +		if (open_mnt_ns(nsid, rsti(root_item)))
> >  			goto err;
> >  
> >  		/* And return back to regain the access to the roots yard */
> > @@ -3628,6 +3643,7 @@ int mntns_get_root_fd(struct ns_id *mntns)
> >  		fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
> >  		if (fd < 0)
> >  			return -1;
> > +		close_pid_proc();
> >  
> >  		return mntns_set_root_fd(mntns->ns_pid, fd);
> >  	}
> > diff --git a/criu/pstree.c b/criu/pstree.c
> > index af89dbb..94b8021 100644
> > --- a/criu/pstree.c
> > +++ b/criu/pstree.c
> > @@ -943,6 +943,7 @@ static int prepare_pstree_for_unshare(void)
> >  		fake_root->threads->virt = INIT_PID;
> >  		fake_root->ids = root_ids;
> >  		rsti(fake_root)->clone_flags = opts.unshare_flags | rsti(root_item)->clone_flags;
> > +		INIT_LIST_HEAD(&rsti(fake_root)->used);
> >  
> >  		rsti(fake_root)->helper_cb = do_fake_init;
> >  
> > 
> 


More information about the CRIU mailing list