[CRIU] [PATCH v2 36/36] ns: Allow nested user namespaces

Kirill Tkhai ktkhai at virtuozzo.com
Wed Feb 8 02:57:04 PST 2017


On 07.02.2017 19:31, Andrei Vagin wrote:
> On Tue, Feb 07, 2017 at 11:14:07AM +0300, Kirill Tkhai wrote:
>> On 07.02.2017 03:51, Andrei Vagin wrote:
>>> On Fri, Feb 03, 2017 at 07:16:55PM +0300, Kirill Tkhai wrote:
>>>> Everything is prepared for nested user namespaces support.
>>>> The only thing, we should do more, is to enter to dumped
>>>> user namespace's parent before the dump.
>>>> We use CLONE_VM for child tasks, so they may populate
>>>> user_ns maps in parent memory without any tricks.
>>>>
>>>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>>>> ---
>>>>  criu/include/namespaces.h |    2 +
>>>>  criu/namespaces.c         |   65 ++++++++++++++++++++++++++++++++++++++++++---
>>>>  2 files changed, 62 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>>>> index 67a96d6b3..c7fc0a05e 100644
>>>> --- a/criu/include/namespaces.h
>>>> +++ b/criu/include/namespaces.h
>>>> @@ -39,7 +39,7 @@
>>>>  #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
>>>>  
>>>>  /* Nested namespaces are supported only for these types */
>>>> -#define CLONE_SUBNS	(CLONE_NEWNS)
>>>> +#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWUSER)
>>>>  #define EXTRA_SIZE	20
>>>>  
>>>>  #ifndef NSIO
>>>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>>>> index 0bbd57e96..5cf46ed8f 100644
>>>> --- a/criu/namespaces.c
>>>> +++ b/criu/namespaces.c
>>>> @@ -31,6 +31,7 @@
>>>>  #include "util.h"
>>>>  #include "images/ns.pb-c.h"
>>>>  #include "common/scm.h"
>>>> +#include "proc_parse.h"
>>>>  
>>>>  static struct ns_desc *ns_desc_array[] = {
>>>>  	&net_ns_desc,
>>>> @@ -970,11 +971,29 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
>>>>  	return -1;
>>>>  }
>>>>  
>>>> -static int dump_user_ns(struct ns_id *ns);
>>>> +static int __dump_user_ns(struct ns_id *ns);
>>>> +
>>>> +static int dump_user_ns(void *arg)
>>>> +{
>>>> +	struct ns_id *ns = arg;
>>>> +
>>>> +	if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
>>>> +		pr_err("Can't enter user namespace\n");
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	return __dump_user_ns(ns);
>>>> +}
>>>>  
>>>>  int collect_user_ns(struct ns_id *ns, void *oarg)
>>>>  {
>>>> +	int status, stack_size;
>>>> +	struct ns_id *p_ns;
>>>> +	pid_t pid = -1;
>>>>  	UsernsEntry *e;
>>>> +	char *stack;
>>>> +
>>>> +	p_ns = ns->parent;
>>>>  
>>>>  	e = xmalloc(sizeof(*e));
>>>>  	if (!e)
>>>> @@ -990,8 +1009,43 @@ int collect_user_ns(struct ns_id *ns, void *oarg)
>>>>  	 * mappings, which are used for convirting local id-s to
>>>>  	 * userns id-s (userns_uid(), userns_gid())
>>>>  	 */
>>>> -	if (dump_user_ns(ns))
>>>> -		return -1;
>>>> +	if (p_ns) {
>>>> +		/*
>>>> +		 * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
>>>> +		 * we need to enter its parent ns. As entered to user_ns
>>>> +		 * task has no a way back, we create a child for that.
>>>> +		 * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
>>>> +		 * to NS_CRIU. We use CLONE_VM to make child share our memory,
>>>> +		 * and to allow us see allocated maps, he do. Child's open_proc()
>>>> +		 * may do changes about CRIU's internal files states in memory,
>>>> +		 * so pass CLONE_FILES to reflect that.
>>>> +		 */
>>>> +		stack_size = PAGE_SIZE;
>>>> +		stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);
>>>
>>> Maybe it is easier to allocate a big enough stack without MAP_GROWSDOWN.
>>> In this case we will not need to read /proc/self/maps to get its size,
>>> we will not need to care about guard pages, etc.
>>
>> I don't think so. We call several nested functions there, and they are used
>> in generic code. It wouldn't be useful to have to check over all function
>> call tree to determine, if some places like this should enlarge their
>> stack allocation or not, when we modify these function. Big enough stack
>> is uncountable concept.
> 
> In a kernel space we live with a fixed stack. Posix threads use fixed
> stacks. Why is it a problem for us? We can allocate a few megabytes, the
> real memory is allocated only when someone touch it.

Ok, sounds reasonable.
 
>>  
>>>> +		if (stack == MAP_FAILED) {
>>>> +			pr_perror("Can't allocate stack");
>>>> +			return -1;
>>>> +		}
>>>> +		pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
>>>> +		if (pid == -1) {
>>>> +			pr_perror("Can't clone");
>>>> +			return -1;
>>>> +		}
>>>> +		if (waitpid(pid, &status, 0) != pid) {
>>>> +			pr_perror("Unable to wait the %d process", pid);
>>>> +			return -1;
>>>> +		}
>>>> +		if (status) {
>>>> +			pr_err("Can't dump nested user_ns\n");
>>>> +			return -1;
>>>> +		}
>>>> +		stack_size = find_vma_size((unsigned long *)&stack);
>>>> +		munmap(stack, stack_size);
>>>> +		return 0;
>>>> +	} else {
>>>> +		if (__dump_user_ns(ns))
>>>> +			return -1;
>>>> +	}
>>>>  
>>>>  	return 0;
>>>>  }
>>>> @@ -1030,6 +1084,9 @@ static int check_user_ns(struct ns_id *ns)
>>>>  	int status;
>>>>  	pid_t chld;
>>>>  
>>>> +	if (ns->type != NS_ROOT)
>>>> +		return 0;
>>>> +
>>>>  	chld = fork();
>>>>  	if (chld == -1) {
>>>>  		pr_perror("Unable to fork a process");
>>>> @@ -1125,7 +1182,7 @@ static int check_user_ns(struct ns_id *ns)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> -static int dump_user_ns(struct ns_id *ns)
>>>> +static int __dump_user_ns(struct ns_id *ns)
>>>>  {
>>>>  	int ret, exit_code = -1;
>>>>  	pid_t pid = ns->ns_pid;
>>>>
>>>> _______________________________________________
>>>> CRIU mailing list
>>>> CRIU at openvz.org
>>>> https://lists.openvz.org/mailman/listinfo/criu


More information about the CRIU mailing list