[CRIU] [PATCH v2 36/36] ns: Allow nested user namespaces
Andrei Vagin
avagin at virtuozzo.com
Tue Feb 7 08:31:31 PST 2017
On Tue, Feb 07, 2017 at 11:14:07AM +0300, Kirill Tkhai wrote:
> On 07.02.2017 03:51, Andrei Vagin wrote:
> > On Fri, Feb 03, 2017 at 07:16:55PM +0300, Kirill Tkhai wrote:
> >> Everything is prepared for nested user namespace support.
> >> The only remaining thing we need to do is to enter the parent
> >> of the dumped user namespace before the dump.
> >> We use CLONE_VM for child tasks, so they can populate
> >> user_ns maps in the parent's memory without any tricks.
> >>
> >> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> >> ---
> >> criu/include/namespaces.h | 2 +
> >> criu/namespaces.c | 65 ++++++++++++++++++++++++++++++++++++++++++---
> >> 2 files changed, 62 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> >> index 67a96d6b3..c7fc0a05e 100644
> >> --- a/criu/include/namespaces.h
> >> +++ b/criu/include/namespaces.h
> >> @@ -39,7 +39,7 @@
> >> #define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
> >>
> >> /* Nested namespaces are supported only for these types */
> >> -#define CLONE_SUBNS (CLONE_NEWNS)
> >> +#define CLONE_SUBNS (CLONE_NEWNS | CLONE_NEWUSER)
> >> #define EXTRA_SIZE 20
> >>
> >> #ifndef NSIO
> >> diff --git a/criu/namespaces.c b/criu/namespaces.c
> >> index 0bbd57e96..5cf46ed8f 100644
> >> --- a/criu/namespaces.c
> >> +++ b/criu/namespaces.c
> >> @@ -31,6 +31,7 @@
> >> #include "util.h"
> >> #include "images/ns.pb-c.h"
> >> #include "common/scm.h"
> >> +#include "proc_parse.h"
> >>
> >> static struct ns_desc *ns_desc_array[] = {
> >> &net_ns_desc,
> >> @@ -970,11 +971,29 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
> >> return -1;
> >> }
> >>
> >> -static int dump_user_ns(struct ns_id *ns);
> >> +static int __dump_user_ns(struct ns_id *ns);
> >> +
> >> +static int dump_user_ns(void *arg)
> >> +{
> >> + struct ns_id *ns = arg;
> >> +
> >> + if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
> >> + pr_err("Can't enter user namespace\n");
> >> + return -1;
> >> + }
> >> +
> >> + return __dump_user_ns(ns);
> >> +}
> >>
> >> int collect_user_ns(struct ns_id *ns, void *oarg)
> >> {
> >> + int status, stack_size;
> >> + struct ns_id *p_ns;
> >> + pid_t pid = -1;
> >> UsernsEntry *e;
> >> + char *stack;
> >> +
> >> + p_ns = ns->parent;
> >>
> >> e = xmalloc(sizeof(*e));
> >> if (!e)
> >> @@ -990,8 +1009,43 @@ int collect_user_ns(struct ns_id *ns, void *oarg)
> >> * mappings, which are used for convirting local id-s to
> >> * userns id-s (userns_uid(), userns_gid())
> >> */
> >> - if (dump_user_ns(ns))
> >> - return -1;
> >> + if (p_ns) {
> >> + /*
> >> + * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
> >> + * we need to enter its parent ns. As entered to user_ns
> >> + * task has no a way back, we create a child for that.
> >> + * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
> >> + * to NS_CRIU. We use CLONE_VM to make child share our memory,
> >> + * and to allow us see allocated maps, he do. Child's open_proc()
> >> + * may do changes about CRIU's internal files states in memory,
> >> + * so pass CLONE_FILES to reflect that.
> >> + */
> >> + stack_size = PAGE_SIZE;
> >> + stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);
> >
> > Maybe it is easier to allocate a big enough stack without MAP_GROWSDOWN.
> > In this case we will not need to read /proc/self/maps to get its size,
> > we will not need to care about guard pages, etc.
>
> I don't think so. We call several nested functions there, and they are used
> in generic code. It would be impractical to walk the whole function
> call tree every time we modify one of these functions, just to determine
> whether places like this need a larger stack allocation. A "big enough"
> stack is not something we can quantify.
In kernel space we live with a fixed stack. POSIX threads use fixed
stacks. Why is it a problem for us? We can allocate a few megabytes; the
real memory is allocated only when someone touches it.
>
> >> + if (stack == MAP_FAILED) {
> >> + pr_perror("Can't allocate stack");
> >> + return -1;
> >> + }
> >> + pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
> >> + if (pid == -1) {
> >> + pr_perror("Can't clone");
> >> + return -1;
> >> + }
> >> + if (waitpid(pid, &status, 0) != pid) {
> >> + pr_perror("Unable to wait the %d process", pid);
> >> + return -1;
> >> + }
> >> + if (status) {
> >> + pr_err("Can't dump nested user_ns\n");
> >> + return -1;
> >> + }
> >> + stack_size = find_vma_size((unsigned long *)&stack);
> >> + munmap(stack, stack_size);
> >> + return 0;
> >> + } else {
> >> + if (__dump_user_ns(ns))
> >> + return -1;
> >> + }
> >>
> >> return 0;
> >> }
> >> @@ -1030,6 +1084,9 @@ static int check_user_ns(struct ns_id *ns)
> >> int status;
> >> pid_t chld;
> >>
> >> + if (ns->type != NS_ROOT)
> >> + return 0;
> >> +
> >> chld = fork();
> >> if (chld == -1) {
> >> pr_perror("Unable to fork a process");
> >> @@ -1125,7 +1182,7 @@ static int check_user_ns(struct ns_id *ns)
> >> return 0;
> >> }
> >>
> >> -static int dump_user_ns(struct ns_id *ns)
> >> +static int __dump_user_ns(struct ns_id *ns)
> >> {
> >> int ret, exit_code = -1;
> >> pid_t pid = ns->ns_pid;
> >>
> >> _______________________________________________
> >> CRIU mailing list
> >> CRIU at openvz.org
> >> https://lists.openvz.org/mailman/listinfo/criu
More information about the CRIU
mailing list