[CRIU] [PATCH 3/3] mount: open a root directory for mount namesapces via the root task
Pavel Emelyanov
xemul at virtuozzo.com
Tue Aug 2 05:08:17 PDT 2016
On 07/27/2016 09:21 PM, Andrey Vagin wrote:
> From: Andrew Vagin <avagin at virtuozzo.com>
>
> We use openat() syscall to open files in various mount namespaces.
> Recently we found that if a path contains absolute symlinks, openat()
> syscall resolves them relative to the current root, but they has to be
> resolved relative to the root of the target namespace, so we need to
> change root before restoring a file descriprot.
OK, what's the exact problem we're having now? Which openat() is called
on a path with symlink?
> But for that we need to rework a method how we open a root directory
> for a specified mount namespace. Currently we open /proc/pid/root for
> one of processes from this namespace. If a process will change a root
> directory, this way will not work.
>
> In this patch we open all namespaces in the root task and then any
> process will be able to open one of these descriptors via /proc/pid/fd.
>
> Signed-off-by: Andrew Vagin <avagin at virtuozzo.com>
> ---
> criu/cr-restore.c | 32 ++++++++++++++------------------
> criu/mount.c | 48 ++++++++++++++++++++++++++++++++----------------
> criu/pstree.c | 1 +
> 3 files changed, 47 insertions(+), 34 deletions(-)
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 3437b76..1de2682 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1469,16 +1469,6 @@ static int restore_task_with_children(void *_arg)
>
> restore_pgid();
>
> - if (current->parent == NULL) {
> - /*
> - * Wait when all tasks passed the CR_STATE_FORKING stage.
> - * It means that all tasks entered into their namespaces.
> - */
> - futex_wait_while_gt(&task_entries->nr_in_progress, 1);
> -
> - fini_restore_mntns();
> - }
> -
> if (restore_finish_stage(CR_STATE_FORKING) < 0)
> goto err;
>
> @@ -2907,6 +2897,20 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> goto err_nv;
>
> /*
> + * Make root and cwd restore _that_ late not to break any
> + * attempts to open files by paths above (e.g. /proc).
> + */
> + if (restore_fs(current))
> + goto err;
> +
> + if (current->parent == NULL) {
> + /* Wait when all tasks restored all files */
> + futex_wait_while_gt(&task_entries->nr_in_progress,
> + current->nr_threads);
> + fini_restore_mntns();
> + }
> +
> + /*
> * We're about to search for free VM area and inject the restorer blob
> * into it. No irrelevent mmaps/mremaps beyond this point, otherwise
> * this unwanted mapping might get overlapped by the restorer.
> @@ -3151,14 +3155,6 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> task_args->nr_threads = current->nr_threads;
> task_args->thread_args = thread_args;
>
> - /*
> - * Make root and cwd restore _that_ late not to break any
> - * attempts to open files by paths above (e.g. /proc).
> - */
> -
> - if (restore_fs(current))
> - goto err;
> -
> close_image_dir();
> close_proc();
> close_service_fd(ROOT_FD_OFF);
> diff --git a/criu/mount.c b/criu/mount.c
> index 7c280c0..5a167b1 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -3160,9 +3160,7 @@ void fini_restore_mntns(void)
> if (nsid->nd != &mnt_ns_desc)
> continue;
> close_safe(&nsid->mnt.ns_fd);
> - if (nsid->type != NS_ROOT)
> - close_safe(&nsid->mnt.root_fd);
> - nsid->ns_populated = true;
> + close_safe(&nsid->mnt.root_fd);
> }
> }
>
> @@ -3350,6 +3348,34 @@ void cleanup_mnt_ns(void)
> pr_perror("Can't remove the directory %s", mnt_roots);
> }
>
> +static int open_mnt_ns(struct ns_id *nsid, struct rst_info *rst)
> +{
> + int fd, tfd;
> +
> + /* Pin one with a file descriptor */
> + fd = open_proc(PROC_SELF, "ns/mnt");
> + if (fd < 0)
> + return -1;
> + tfd = reopen_as_unused_fd(fd, rst);
> + if (tfd < 0) {
> + close(fd);
> + return -1;
> + }
> + nsid->mnt.ns_fd = tfd;
> +
> + fd = open_proc(PROC_SELF, "root");
> + if (fd < 0)
> + return -1;
> + tfd = reopen_as_unused_fd(fd, rst);
> + if (tfd < 0) {
> + close(fd);
> + return -1;
> + }
> + nsid->mnt.root_fd = tfd;
> +
> + return 0;
> +}
> +
> int prepare_mnt_ns(void)
> {
> int ret = -1, rst = -1;
> @@ -3483,12 +3509,8 @@ ns_created:
> if (nsid->nd != &mnt_ns_desc)
> continue;
> if (nsid->type == NS_ROOT) {
> - /* Pin one with a file descriptor */
> - nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> - if (nsid->mnt.ns_fd < 0)
> + if (open_mnt_ns(nsid, rsti(root_item)))
> goto err;
> - /* we set ns_populated so we don't need to open root_fd */
> - nsid->ns_populated = true;
> continue;
> }
>
> @@ -3504,14 +3526,7 @@ ns_created:
> if (cr_pivot_root(path))
> goto err;
>
> - /* Pin one with a file descriptor */
> - nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> - if (nsid->mnt.ns_fd < 0)
> - goto err;
> -
> - /* root_fd is used to restore file mappings */
> - nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
> - if (nsid->mnt.root_fd < 0)
> + if (open_mnt_ns(nsid, rsti(root_item)))
> goto err;
>
> /* And return back to regain the access to the roots yard */
> @@ -3628,6 +3643,7 @@ int mntns_get_root_fd(struct ns_id *mntns)
> fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
> if (fd < 0)
> return -1;
> + close_pid_proc();
>
> return mntns_set_root_fd(mntns->ns_pid, fd);
> }
> diff --git a/criu/pstree.c b/criu/pstree.c
> index af89dbb..94b8021 100644
> --- a/criu/pstree.c
> +++ b/criu/pstree.c
> @@ -943,6 +943,7 @@ static int prepare_pstree_for_unshare(void)
> fake_root->threads->virt = INIT_PID;
> fake_root->ids = root_ids;
> rsti(fake_root)->clone_flags = opts.unshare_flags | rsti(root_item)->clone_flags;
> + INIT_LIST_HEAD(&rsti(fake_root)->used);
>
> rsti(fake_root)->helper_cb = do_fake_init;
>
>
More information about the CRIU
mailing list