[CRIU] [PATCH v2 27/36] ns: Generate user_ns tree
Kirill Tkhai
ktkhai at virtuozzo.com
Fri Feb 3 08:15:36 PST 2017
Create the user namespace hierarchy from the criu main task.
Open the namespaces' fds in it, so that they are visible to everybody as
/proc/[criu pid]/fd/[ns_fd].
Why we do it this way.
1)User namespaces are not correlated with the task
hierarchy. A parent task may have a user namespace
of a deeper level than its child task. So, we
can't restore the user namespaces just by
passing CLONE_NEWUSER in fork_with_pid().
2)We create namespaces from the criu main task to store
the open namespaces' fds. If we used root_item instead,
all open files would be cloned to children, and children
would have to close unnecessary file descriptors, which
is just a waste of time.
3)CLONE_FS tasks will require that user_ns is set at the
moment of clone(), so we have to restore the target user_ns
in the vicinity of create_children_and_session().
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
criu/cr-restore.c | 3 +
criu/include/namespaces.h | 1
criu/namespaces.c | 153 +++++++++++++++++++++++++++++++++++++++++++++
criu/pstree.c | 6 +-
4 files changed, 161 insertions(+), 2 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index ab05ebfd1..07a966154 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1818,7 +1818,8 @@ static int restore_root_task(struct pstree_item *init)
* uid_map and gid_map must be filled from a parent user namespace.
* prepare_userns_creds() must be called after filling mappings.
*/
- if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
+ if ((root_ns_mask & CLONE_NEWUSER) &&
+ (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
goto out_kill;
pr_info("Wait until namespaces are created\n");
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index 546de7c5d..bf8b90eba 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -168,6 +168,7 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
extern int collect_user_namespaces(bool for_dump);
extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
+extern int create_ns_hierarhy(void);
extern int stop_usernsd(void);
extern uid_t userns_uid(uid_t uid);
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 6151219d8..fd390c938 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -30,6 +30,7 @@
#include "protobuf.h"
#include "util.h"
#include "images/ns.pb-c.h"
+#include "common/scm.h"
static struct ns_desc *ns_desc_array[] = {
&net_ns_desc,
@@ -2151,5 +2152,157 @@ int prepare_namespace_before_tasks(void)
return -1;
}
+/*
+ * States published through the futex shared between a namespace helper
+ * and the task that cloned it. The numeric order matters: waiters use
+ * futex_wait_while_lt(), so every later state also releases them.
+ */
+enum {
+	/* Child has stored its real pid in ns_id->owner.pid */
+	NS__CREATED		= 1,
+	/* Parent has finished prepare_userns() for the child */
+	NS__MAPS_POPULATED	= 2,
+	/* Child has created its whole subtree successfully */
+	NS__RESTORED		= 3,
+	/* Not used by the code in this patch */
+	NS__EXIT_HELPER		= 4,
+	/* Child failed somewhere on its way */
+	NS__ERROR		= 5,
+};
+
+/* Argument handed to create_user_ns_hierarhy_fn() through clone(). */
+struct ns_arg {
+	/* Namespace whose children the callee has to create */
+	struct ns_id *me;
+	/* Futex shared with the task that cloned us; NULL for the root helper */
+	futex_t *futex;
+	/* Pid stored as the owner of every created namespace (criu's pid) */
+	pid_t pid;
+};
+
+/*
+ * Create the user namespaces nested below in_arg->me, one helper per child.
+ *
+ * @in_arg: pointer to a struct ns_arg.
+ *
+ * When in_arg->futex is set, the caller is itself a freshly cloned
+ * CLONE_NEWUSER helper. It first publishes its real pid and an fd for
+ * /proc/self/ns/user in me->owner, then waits for the parent to populate
+ * the uid/gid maps before switching creds. The root helper is called with
+ * a NULL futex and skips that handshake.
+ *
+ * Returns 0 on success and 1 on failure; the value is used as the exit
+ * status of the helper.
+ */
+static int create_user_ns_hierarhy_fn(void *in_arg)
+{
+	char stack[128] __stack_aligned__;
+	struct ns_arg arg, *p_arg = in_arg;
+	futex_t *p_futex, *futex = NULL;
+	int status, fd, ret = -1;
+	struct ns_id *me, *child;
+	pid_t helper, real_pid;
+
+	p_futex = p_arg->futex;
+	me = p_arg->me;
+
+	if (p_futex) {
+		/* Temporary set ns owner to me to allow parent restore user_ns maps */
+		me->owner.pid = get_self_real_pid();
+		if (me->owner.pid < 0) {
+			pr_err("Can't self pid\n");
+			goto out;
+		}
+		futex_set_and_wake(p_futex, NS__CREATED);
+
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			/* open() sets errno, so report it along with the message */
+			pr_perror("Can't get self user ns");
+			goto out;
+		}
+		/*
+		 * As we are cloned with CLONE_FILES,
+		 * parent task will see this fd too.
+		 */
+		me->owner.fd = fd;
+
+		futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
+		if (prepare_userns_creds()) {
+			pr_err("Can't prepare creds\n");
+			goto out;
+		}
+	}
+
+	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (futex == MAP_FAILED) {
+		pr_perror("Failed to mmap futex");
+		/* Nothing was mapped: don't let the exit path munmap() MAP_FAILED */
+		futex = NULL;
+		goto out;
+	}
+	arg.futex = futex;
+	arg.pid = p_arg->pid;
+
+	/* Children are created one at a time, so a single futex is reused */
+	list_for_each_entry(child, &me->children, siblings) {
+		arg.me = child;
+		futex_init(futex);
+
+		/*
+		 * The stack grows down: pass the address just past the end of
+		 * the buffer, which also keeps the aligned start of the array
+		 * meaningful (stack + 127 is neither the top nor aligned).
+		 */
+		helper = clone(create_user_ns_hierarhy_fn, stack + sizeof(stack),
+			       CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);
+		if (helper < 0) {
+			pr_perror("Can't clone");
+			goto out;
+		}
+		futex_wait_while_lt(futex, NS__CREATED);
+		/* Get child real pid */
+		real_pid = child->owner.pid;
+		if (prepare_userns(real_pid, child->user.e) < 0) {
+			pr_err("Can't prepare child user_ns\n");
+			goto out;
+		}
+		/* Set ns owner to criu's virt pid */
+		child->owner.pid = p_arg->pid;
+		futex_set_and_wake(futex, NS__MAPS_POPULATED);
+
+		if (waitpid(helper, &status, 0) < 0) {
+			pr_perror("Child process waiting");
+			goto out;
+		}
+		/*
+		 * WEXITSTATUS() is only meaningful for a normal exit: a helper
+		 * killed by a signal must not be taken for a success.
+		 */
+		if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+			pr_err("Child process failed: status=%#x\n", status);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	if (p_futex)
+		futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
+	if (futex)
+		munmap(futex, sizeof(*futex));
+	return ret ? 1 : 0;
+}
+
+/*
+ * Entry point of the helper cloned by create_ns_hierarhy().
+ *
+ * @ppid: pid of the task that cloned us, passed as an integer cast to a
+ *        pointer; it becomes the owner pid of every created namespace.
+ *
+ * Enters the user namespace of root_item, switches creds and then creates
+ * the nested namespaces starting from root_user_ns. Never returns: the
+ * helper always terminates with exit(), a non-zero code meaning failure.
+ */
+static int do_create_ns_hierarhy(void *ppid)
+{
+	struct ns_arg arg;
+	char buf[128];
+	int proc_fd, ns_fd;
+
+	arg.me = root_user_ns;
+	/* No futex: the root helper has no parent handshake to perform */
+	arg.futex = NULL;
+	arg.pid = (pid_t)(long)ppid;
+
+	proc_fd = get_service_fd(CR_PROC_FD_OFF);
+	if (proc_fd < 0)
+		exit(4);
+
+	snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
+	ns_fd = openat(proc_fd, buf, O_RDONLY);
+	if (ns_fd < 0) {
+		pr_perror("Can't open %s", buf);
+		exit(5);
+	}
+	if (setns(ns_fd, CLONE_NEWUSER) < 0) {
+		pr_perror("Can't setns()");
+		close(ns_fd);
+		exit(6);
+	}
+	/*
+	 * We are cloned with CLONE_FILES, so an fd left open here would
+	 * stay in the fd table shared with the caller after we exit.
+	 */
+	close(ns_fd);
+
+	if (prepare_userns_creds() < 0) {
+		pr_err("Can't prepare creds\n");
+		exit(7);
+	}
+	exit(create_user_ns_hierarhy_fn(&arg));
+}
+
+/*
+ * Create the nested user namespaces from a helper process that shares
+ * the caller's fd table (CLONE_FILES), so that fds the helper chain
+ * leaves open stay visible in the caller.
+ *
+ * Does nothing when root_ns_mask has no CLONE_NEWUSER.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int create_ns_hierarhy(void)
+{
+	char stack[128] __stack_aligned__;
+	int status;
+	pid_t pid;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return 0;
+
+	/*
+	 * The stack grows down: pass the address just past the end of the
+	 * buffer (stack + 127 is neither the top nor aligned).
+	 */
+	pid = clone(do_create_ns_hierarhy, stack + sizeof(stack),
+		    CLONE_FILES | SIGCHLD, (void *)(long)getpid());
+	if (pid < 0) {
+		pr_perror("Can't clone()");
+		return -1;
+	}
+
+	if (waitpid(pid, &status, 0) < 0) {
+		pr_perror("Can't wait for ns hierarhy helper");
+		return -1;
+	}
+	/*
+	 * WEXITSTATUS() is only meaningful for a normal exit: a helper
+	 * killed by a signal must not be taken for a success.
+	 */
+	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+		pr_err("Can't create ns hierarhy: status=%#x\n", status);
+		return -1;
+	}
+	return 0;
+}
+
struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
diff --git a/criu/pstree.c b/criu/pstree.c
index 1ba762b80..d2d7339bc 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -873,8 +873,12 @@ static int prepare_pstree_kobj_ids(void)
* be born in a fresh new mount namespace
* which will be populated with all other
* namespaces' entries.
+ *
+ * User namespaces are created in create_ns_hierarhy()
+ * before the tasks, as their hierarchy does not correlate
+ * with the task hierarchy in any way.
*/
- rsti(item)->clone_flags &= ~CLONE_NEWNS;
+ rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
cflags &= CLONE_ALLNS;
More information about the CRIU
mailing list