[CRIU] [PATCH v2 27/36] ns: Generate user_ns tree
Kirill Tkhai
ktkhai at virtuozzo.com
Tue Feb 7 00:05:01 PST 2017
On 07.02.2017 01:28, Andrei Vagin wrote:
> On Mon, Feb 06, 2017 at 12:13:47PM +0300, Kirill Tkhai wrote:
>> On 03.02.2017 23:43, Andrei Vagin wrote:
>>> On Fri, Feb 03, 2017 at 07:15:36PM +0300, Kirill Tkhai wrote:
>>>> Create the user namespace hierarchy from the criu main task.
>>>> Open the namespaces' fds in it, so that everybody can see them as
>>>> /proc/[criu pid]/fd/[ns_fd].
>>>>
>>>> Why we do it this way.
>>>> 1)User namespaces are not correlated with the task
>>>> hierarchy. A parent task may be in a user namespace
>>>> of a deeper level than that of a child task. So, we
>>>> can't restore the user namespaces just by
>>>> passing CLONE_NEWUSER in fork_with_pid().
>>>>
>>>> 2)We create namespaces from the criu main task to store
>>>> the open namespaces' fds. If we used root_item instead,
>>>> all the open files would be cloned to the children, and the
>>>> children would have to close unnecessary file descriptors,
>>>> which is just a waste of time.
>>>>
>>>> 3)CLONE_FS tasks will require that user_ns is already set at
>>>> the moment of clone(), so we have to restore the target user_ns
>>>> in the vicinity of create_children_and_session().
>>>>
>>>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>>>> ---
>>>> criu/cr-restore.c | 3 +
>>>> criu/include/namespaces.h | 1
>>>> criu/namespaces.c | 153 +++++++++++++++++++++++++++++++++++++++++++++
>>>> criu/pstree.c | 6 +-
>>>> 4 files changed, 161 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>>> index ab05ebfd1..07a966154 100644
>>>> --- a/criu/cr-restore.c
>>>> +++ b/criu/cr-restore.c
>>>> @@ -1818,7 +1818,8 @@ static int restore_root_task(struct pstree_item *init)
>>>> * uid_map and gid_map must be filled from a parent user namespace.
>>>> * prepare_userns_creds() must be called after filling mappings.
>>>> */
>>>> - if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
>>>> + if ((root_ns_mask & CLONE_NEWUSER) &&
>>>> + (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
>>>> goto out_kill;
>>>>
>>>> pr_info("Wait until namespaces are created\n");
>>>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>>>> index 546de7c5d..bf8b90eba 100644
>>>> --- a/criu/include/namespaces.h
>>>> +++ b/criu/include/namespaces.h
>>>> @@ -168,6 +168,7 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
>>>>
>>>> extern int collect_user_namespaces(bool for_dump);
>>>> extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
>>>> +extern int create_ns_hierarhy(void);
>>>> extern int stop_usernsd(void);
>>>>
>>>> extern uid_t userns_uid(uid_t uid);
>>>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>>>> index 6151219d8..fd390c938 100644
>>>> --- a/criu/namespaces.c
>>>> +++ b/criu/namespaces.c
>>>> @@ -30,6 +30,7 @@
>>>> #include "protobuf.h"
>>>> #include "util.h"
>>>> #include "images/ns.pb-c.h"
>>>> +#include "common/scm.h"
>>>>
>>>> static struct ns_desc *ns_desc_array[] = {
>>>> &net_ns_desc,
>>>> @@ -2151,5 +2152,157 @@ int prepare_namespace_before_tasks(void)
>>>> return -1;
>>>> }
>>>>
>>>> +enum {
>>>> + NS__CREATED = 1,
>>>> + NS__MAPS_POPULATED,
>>>> + NS__RESTORED,
>>>> + NS__EXIT_HELPER,
>>>> + NS__ERROR,
>>>> +};
>>>> +
>>>> +struct ns_arg {
>>>> + struct ns_id *me;
>>>> + futex_t *futex;
>>>> + pid_t pid;
>>>> +};
>>>> +
>>>> +static int create_user_ns_hierarhy_fn(void *in_arg)
>>>> +{
>>>> + char stack[128] __stack_aligned__;
>>>> + struct ns_arg arg, *p_arg = in_arg;
>>>> + futex_t *p_futex, *futex = NULL;
>>>> + int status, fd, ret = -1;
>>>> + struct ns_id *me, *child;
>>>> + pid_t pid = -1;
>>>> +
>>>> + p_futex = p_arg->futex;
>>>> + me = p_arg->me;
>>>> +
>>>> + if (p_futex) {
>>>> + /* Temporary set ns owner to me to allow parent restore user_ns maps */
>>>> + me->owner.pid = get_self_real_pid();
>>>> + if (me->owner.pid < 0) {
>>>> + pr_err("Can't self pid\n");
>>>> + goto out;
>>>> + }
>>>> + futex_set_and_wake(p_futex, NS__CREATED);
>>>> +
>>>> + fd = open("/proc/self/ns/user", O_RDONLY);
>>>> + if (fd < 0) {
>>>> + pr_err("Can't get self user ns");
>>>> + goto out;
>>>> + }
>>>> + /*
>>>> + * As we are cloned with CLONE_FILES,
>>>> + * parent task will see this fd too.
>>>> + */
>>>> + me->owner.fd = fd;
>>>> +
>>>> + futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
>>>> + if (prepare_userns_creds()) {
>>>> + pr_err("Can't prepare creds\n");
>>>> + goto out;
>>>> + }
>>>> + }
>>>> +
>>>> + futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>>>> + if (futex == MAP_FAILED) {
>>>> + pr_perror("Failed to mmap futex");
>>>> + goto out;
>>>> + }
>>>> + arg.futex = futex;
>>>> + arg.pid = p_arg->pid;
>>>> +
>>>> + list_for_each_entry(child, &me->children, siblings) {
>>>> + arg.me = child;
>>>> + futex_init(futex);
>>>> +
>>>> + pid = clone(create_user_ns_hierarhy_fn, stack + 127, CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);
>>>
>>> The stack has to be aligned. I think stack + 128 should be used. You have to
>>> guarantee that arg will be placed after the stack; please take a look at
>>> "struct cr_clone_arg". I think we need to do something similar here.
>>
>> I saw it and was even inspired by that code... Strange. Ok, 128 bytes.
>
> What is exactly strange here?
What's strange is that I saw that code and was inspired by it, but somehow I ended up with 127 bytes.
>>
>>>> + if (pid < 0) {
>>>> + pr_perror("Can't clone");
>>>> + goto out;
>>>> + }
>>>> + futex_wait_while_lt(futex, NS__CREATED);
>>>> + /* Get child real pid */
>>>> + pid = child->owner.pid;
>>>> + if (prepare_userns(pid, child->user.e) < 0) {
>>>> + pr_err("Can't prepare child user_ns\n");
>>>> + goto out;
>>>> + }
>>>> + /* Set ns owner to criu's virt pid */
>>>> + child->owner.pid = p_arg->pid;
>>>> + futex_set_and_wake(futex, NS__MAPS_POPULATED);
>>>> +
>>>> + errno = 0;
>>>> + if (wait(&status) < 0 || WEXITSTATUS(status)) {
>>>
>>> If a process was killed, WEXITSTATUS(status) will be 0.
>>
>> Good point, thanks.
>>
>>> status = -1;
>>> if (waitpid(pid, &status, 0) < 0 || status) {
>>>
>>>> + pr_perror("Child process waiting: %d\n", WEXITSTATUS(status));
>>>> + goto out;
>>>> + }
>>>> + }
>>>> +
>>>> + ret = 0;
>>>> +out:
>>>> + if (p_futex)
>>>> + futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
>>>> + if (futex)
>>>> + munmap(futex, sizeof(*futex));
>>>> + return ret ? 1 : 0;
>>>> +}
>>>> +
>>>> +static int do_create_ns_hierarhy(void *ppid)
>>>> +{
>>>> + struct ns_arg arg;
>>>> + char buf[128];
>>>> + int fd;
>>>> +
>>>> + arg.me = root_user_ns;
>>>> + arg.futex = NULL;
>>>> + arg.pid = (pid_t)(long)ppid;
>>>> +
>>>> + fd = get_service_fd(CR_PROC_FD_OFF);
>>>> + if (fd < 0)
>>>> + exit(4);
>>>> +
>>>> + snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
>>>> + fd = openat(fd, buf, O_RDONLY);
>>>> + if (fd < 0) {
>>>> + pr_perror("Can't open %s", buf);
>>>> + exit(5);
>>>> + }
>>>> + if (setns(fd, CLONE_NEWUSER) < 0) {
>>>> + pr_perror("Can't setns()");
>>>> + exit(6);
>>>> + }
>>>> + if (prepare_userns_creds() < 0) {
>>>> + pr_err("Can't prepare creds\n");
>>>> + exit(7);
>>>> + }
>>>> + exit(create_user_ns_hierarhy_fn(&arg));
>>>> +}
>>>> +
>>>> +int create_ns_hierarhy(void)
>>>> +{
>>>> + char stack[128] __stack_aligned__;
>>>> + int status;
>>>> + pid_t pid;
>>>> +
>>>> + if (!(root_ns_mask & CLONE_NEWUSER))
>>>> + return 0;
>>>> +
>>>> + pid = clone(do_create_ns_hierarhy, stack + 127, CLONE_FILES | SIGCHLD, (void *)(long)getpid());
>>>> + if (pid < 0) {
>>>> + pr_perror("Can't clone()");
>>>> + return -1;
>>>> + }
>>>> +
>>>> + errno = 0;
>>>> + if (waitpid(pid, &status, 0) < 0 || WEXITSTATUS(status)) {
>>>> + pr_err("Can't create ns hierarhy: errno=%d, status=%d\n",
>>>> + errno, WEXITSTATUS(status));
>>>> + return -1;
>>>> + }
>>>> + return 0;
>>>> +}
>>>> +
>>>> struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>>>> struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>>>> diff --git a/criu/pstree.c b/criu/pstree.c
>>>> index 1ba762b80..d2d7339bc 100644
>>>> --- a/criu/pstree.c
>>>> +++ b/criu/pstree.c
>>>> @@ -873,8 +873,12 @@ static int prepare_pstree_kobj_ids(void)
>>>> * be born in a fresh new mount namespace
>>>> * which will be populated with all other
>>>> * namespaces' entries.
>>>> + *
>>>> + * User namespaces are created in create_ns_hierarhy()
>>>> + * before the tasks, as their hierarchy does not correlate
>>>> + * with the task hierarchy in any way.
>>>> */
>>>> - rsti(item)->clone_flags &= ~CLONE_NEWNS;
>>>> + rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
>>>>
>>>> cflags &= CLONE_ALLNS;
>>>>
>>>>
>>>> _______________________________________________
>>>> CRIU mailing list
>>>> CRIU at openvz.org
>>>> https://lists.openvz.org/mailman/listinfo/criu
More information about the CRIU
mailing list