[Devel] [PATCH 4/6] user namespace support for upstream containers

Mon Mar 11 12:06:23 PDT 2013

On 03/11/2013 04:01 AM, Glauber Costa wrote:
> This patch allows the execution of unprivileged containers running on top
> of an upstream Linux Kernel. We will run at whatever UID is found in the
> configuration file.
>
> Signed-off-by: Glauber Costa <glommer at parallels.com>
> ---
>   include/types.h    |   1 +
>   src/lib/env.c      |  16 +++++++++
>   src/lib/hooks_ct.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>   3 files changed, 117 insertions(+), 3 deletions(-)
>
> diff --git a/include/types.h b/include/types.h
> index ceecb93..54eb1f4 100644
> --- a/include/types.h
> +++ b/include/types.h
> @@ -95,6 +95,7 @@ typedef struct vps_handler {
>   	int vzfd;	/**< /dev/vzctl file descriptor. */
>   	int stdfd;
>   	int can_join_pidns; /* can't enter otherwise */
> +	int can_join_userns; /* can't run non privileged otherwise */
>   	int (*is_run)(struct vps_handler *h, envid_t veid);
>   	int (*enter)(struct vps_handler *h, envid_t veid, const char *root, int flags);
>   	int (*destroy)(struct vps_handler *h, envid_t veid);
> diff --git a/src/lib/env.c b/src/lib/env.c
> index 2da848d..75e2dee 100644
> --- a/src/lib/env.c
> +++ b/src/lib/env.c
> @@ -280,6 +280,22 @@ int exec_container_init(struct arg_start *arg,
>   	if (read(arg->wait_p, &ret, sizeof(ret)) == 0)
>   		return -1;
>   
> +	/*
> +	 * If we are running on upstream Linux Kernel, we will arrive here as
> +	 * the default unprivileged user. This needs to be setup by the
> +	 * container thread (pid 1), but only after the mapping is established.
> +	 *
> +	 * Since the mapping can only ever be established by a process that has
> +	 * CAP_SETUID in the parent namespace, this has to be done from the
> +	 * process who called clone, not by the cloned children. We need some sort
> +	 * of synchronization to make sure the mappings are already in place, so
> +	 * we do it after the read of wait_p above.
> +	 */
> +	if (!is_vz_kernel(arg->h) && arg->h->can_join_userns) {
> +		setuid(0);
> +		setgid(0);
> +	}
> +
>   	if ((fd = open("/dev/null", O_RDWR)) != -1) {
>   		dup2(fd, 0);
>   		dup2(fd, 1);
> diff --git a/src/lib/hooks_ct.c b/src/lib/hooks_ct.c
> index 29d7eea..1d2493f 100644
> --- a/src/lib/hooks_ct.c
> +++ b/src/lib/hooks_ct.c
> @@ -141,7 +141,12 @@ static int _env_create(void *data)
>   	if ((ret = ct_chroot(arg->res->fs.root)))
>   		return ret;
>   
> -	if ((ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
> +	/*
> +	 * If we are using the user namespace, we will have the full capability
> +	 * set in the target namespace. So we don't need any of that.
> +	 */
> +	if (!arg->h->can_join_userns &&
> +		(ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
>   		return ret;
>   
>   	fill_container_param(arg, &create_param);
> @@ -153,6 +158,44 @@ static int _env_create(void *data)
>   	return exec_container_init(arg, &create_param);
>   }
>   
> +static int write_uid_gid_mapping(vps_handler *h, unsigned long uid, unsigned long gid, pid_t pid)
> +{
> +	char buf[64];
> +	char umap[64], gmap[64];
> +	int fdu, fdg, ret;
> +
> +	snprintf(umap, 64, "0 %ld 100000", uid);
> +	snprintf(gmap, 64, "0 %ld 100000", gid);

1. What is the magic number 100000?

2. Please always use sizeof() in places like this (e.g. sizeof(umap) instead of a hard-coded 64).

> +
> +	snprintf(buf, 64, "/proc/%d/uid_map", pid);
> +	fdu = open(buf, O_WRONLY);
> +	if (fdu < 0) {
> +		perror("opening");
> +		return -1;
> +	}
> +
> +	snprintf(buf, 64, "/proc/%d/gid_map", pid);
> +	fdg = open(buf, O_WRONLY);
> +	if (fdg < 0) {
> +		perror("opening");
> +		return -1;
> +	}
> +
> +	ret = write(fdu, umap, sizeof(umap));
> +	if (ret < 0) {
> +		perror("writing");
> +		return -1;
> +	}
> +
> +	ret = write(fdg, gmap, sizeof(gmap));
> +	if (ret < 0) {
> +		perror("writing");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
>   static int ct_env_create(struct arg_start *arg)
>   {
>   
> @@ -190,16 +233,29 @@ static int ct_env_create(struct arg_start *arg)
>   	 * Belong in the setup phase
>   	 */
>   	clone_flags = SIGCHLD;
> -	/* FIXME: USERNS is still work in progress */
>   	clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC;
>   	clone_flags |= CLONE_NEWNET|CLONE_NEWNS;
>   
> +	if (arg->h->can_join_userns)
> +		clone_flags |= CLONE_NEWUSER;
> +	else
> +		logger(-1, 0, "WARNING: Running container unprivileged. USER_NS not supported");
> +
>   	ret = clone(_env_create, child_stack, clone_flags, arg);
>   	if (ret  < 0) {
>   		logger(-1, errno, "Unable to clone");
>   		/* FIXME: remove ourselves from container first */
>   		destroy_container(arg->veid);
>   		return VZ_RESOURCE_ERROR;
> +	} else if (arg->h->can_join_userns) {
> +		/*
> +		 * Now we need to write to the mapping file. It has to be us,
> +		 * since CAP_SETUID is required in the parent namespace. vzctl
> +		 * is run as root, so we should have it. But our cloned kid
> +		 * will start as the overflow uid 65534 in the new namespace.
> +		 */
> +		write_uid_gid_mapping(arg->h, arg->res->misc.local_uid,
> +					arg->res->misc.local_gid, ret);

Do we want error checking/reporting here?

>   	}
>   
>   	snprintf(procpath, STR_SIZE, "/proc/%d/ns/net", ret);
> @@ -221,6 +277,7 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
>   	pid_t task_pid;
>   	int ret = VZ_RESOURCE_ERROR;
>   	bool joined_mnt_ns = false;
> +	int fd;
>   
>   	if (!h->can_join_pidns) {
>   		logger(-1, 0, "Kernel lacks setns for pid namespace");
> @@ -245,18 +302,45 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
>   		return VZ_RESOURCE_ERROR;
>   	}
>   
> +	/*
> +	 * Because all namespaces are associated with an owner userns,
> +	 * and capabilities may be needed for issuing setns syscalls into
> +	 * some key target namespaces (like the mount namespace), we will
> +	 * first enter the user namespace if it is available. Only then we
> +	 * scan all others and join them as they appear
> +	 */
> +	if (h->can_join_userns) {
> +		if (snprintf(path, sizeof(path), "/proc/%d/ns/user", task_pid) < 0)
> +			goto out;
> +
> +		if ((fd = open(path, O_RDONLY)) < 0)
> +			goto out;
> +
> +		if (setns(fd, CLONE_NEWUSER)) {
> +			logger(-1, errno, "Failed to set context for user namespace");
> +			goto out;
> +		}
> +		setuid(0);
> +		setgid(0);
> +	}
> +
>   	ret = VZ_RESOURCE_ERROR;
>   	while ((ep = readdir (dp))) {
> -		int fd;
>   		if (!strcmp(ep->d_name, "."))
>   			continue;
>   		if (!strcmp(ep->d_name, ".."))
>   			continue;
>   
> +		/* already joined */
> +		if ((!strcmp(ep->d_name, "user")))
> +			continue;
> +
>   		if (snprintf(path, sizeof(path), "/proc/%d/ns/%s", task_pid, ep->d_name) < 0)
>   			goto out;
> +
>   		if ((fd = open(path, O_RDONLY)) < 0)
>   			goto out;
> +
>   		if (setns(fd, 0))
>   			logger(-1, errno, "Failed to set context for %s", ep->d_name);
>   
> @@ -562,6 +646,7 @@ int ct_do_open(vps_handler *h)
>   {
>   	int ret;
>   	char path[STR_SIZE];
> +	char upath[STR_SIZE];
>   	struct stat st;
>   
>   	ret = container_init();
> @@ -578,6 +663,9 @@ int ct_do_open(vps_handler *h)
>   	if (snprintf(path, sizeof(path), "/proc/%d/ns/pid", getpid()) < 0)
>   		return VZ_RESOURCE_ERROR;
>   
> +	if (snprintf(upath, sizeof(upath), "/proc/%d/ns/user", getpid()) < 0)
> +		return VZ_RESOURCE_ERROR;
> +
>   	ret = mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
>   
>   	if (ret && (errno != EEXIST)) {
> @@ -586,6 +674,15 @@ int ct_do_open(vps_handler *h)
>   	}
>   
>   	h->can_join_pidns = !stat(path, &st);
> +	/*
> +	 * Being able to join the user namespace is a good indication that the
> +	 * user namespace is complete. For a long time, the user namespace
> +	 * existed, but was far from being feature complete.  When
> +	 * running in such a kernel, joining the user namespace will just
> +	 * cripple our container, since we won't be able to do anything. It is
> +	 * only good for people who are okay running containers as root
> +	 */
> +	h->can_join_userns = !stat(upath, &st);
>   	h->is_run = ct_is_run;
>   	h->enter = ct_enter;
>   	h->destroy = ct_destroy;