[Devel] [PATCH v3 4/9] user namespace support for upstream containers

Mon May 13 20:02:16 PDT 2013

Oh my, four cases of whitespace-at-eol which I had to fixed manually.

Wanted to apply it nevertheless and fix some things myself, but then the 
list grew out too big.

Plus please see inline.

On 04/29/2013 10:16 PM, Glauber Costa wrote:
> From: Glauber Costa <glommer at parallels.com>
>
> This patch allows the execution of unprivileged containers running ontop
> of an upstream Linux Kernel. We will run at whatever UID is found in the
> configuration file (so far empty, thus disabled).
>
> Signed-off-by: Glauber Costa <glommer at parallels.com>
> ---
>   include/env.h      |   1 +
>   include/types.h    |   1 +
>   src/lib/hooks_ct.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   3 files changed, 214 insertions(+), 4 deletions(-)
>
> diff --git a/include/env.h b/include/env.h
> index 06729f6..4fef438 100644
> --- a/include/env.h
> +++ b/include/env.h
> @@ -116,6 +116,7 @@ struct arg_start {
>   	vps_handler *h;
>   	void *data;
>   	env_create_FN fn;
> +	int userns_p; /* while running in userns, there's extra sync needed */
>   };
>   
>   struct env_create_param3;
> diff --git a/include/types.h b/include/types.h
> index a4bce73..fa511aa 100644
> --- a/include/types.h
> +++ b/include/types.h
> @@ -95,6 +95,7 @@ typedef struct vps_handler {
>   	int vzfd;	/**< /dev/vzctl file descriptor. */
>   	int stdfd;
>   	int can_join_pidns; /* can't enter otherwise */
> +	int can_join_userns; /* can't run non privileged otherwise */
>   	int (*is_run)(struct vps_handler *h, envid_t veid);
>   	int (*enter)(struct vps_handler *h, envid_t veid, const char *root, int flags);
>   	int (*destroy)(struct vps_handler *h, envid_t veid);
> diff --git a/src/lib/hooks_ct.c b/src/lib/hooks_ct.c
> index f24d91e..b0cb359 100644
> --- a/src/lib/hooks_ct.c
> +++ b/src/lib/hooks_ct.c
> @@ -66,6 +66,8 @@ static int sys_setns(int fd, int nstype)
>   # define MS_PRIVATE (1 << 18)
>   #endif
>   
> +#define UID_GID_RANGE 100000 /* how many users per container */
> +
>   /* This function is there in GLIBC, but not in headers */
>   extern int pivot_root(const char * new_root, const char * put_old);
>   
> @@ -138,10 +140,41 @@ static int _env_create(void *data)
>   	struct env_create_param3 create_param;
>   	int ret;
>   
> -	if ((ret = ct_chroot(arg->res->fs.root)))
> +	if ((arg->userns_p != -1) && (read(arg->userns_p, &ret, sizeof(ret)) == 0))

1 I'd suggest comparing with sizeof(ret)
2 We can't print any errors here, can we? If we can I'd rather do it.

> +		return -1;

You're supposed to return VZ_* exit codes from here.

> +
> +	ret = ct_chroot(arg->res->fs.root);
> +	close(arg->userns_p);
> +	/* Probably means chroot failed */
> +	if (ret)
>   		return ret;
>   
> -	if ((ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
> +	if (arg->h->can_join_userns) {
> +		int fd;
> +		setuid(0);
> +		setgid(0);
> +		/*
> +		 * We need the special flag "newinstance". This is a requirement
> +		 * of the userns-aware implementation of devpts as of Linux 3.9.
> +		 * Because of that special requirement, we do it here rather than
> +		 * later.
> +		 */
> +		mount("devpts", "/dev/pts", "devpts", 0, "newinstance");

What if mount point does not exist, or is not a directory?

I think it makes sense to
1. try creating it, ignoring EEXIST
2. check return from mount()

> +		/* /dev/ptmx, if it even exists, would refer to the root ptmx.
> +		 * We don't want that, we want our newly created instance to contain
> +		 * all ptys. So we bind mount the root device here
> +		 */
> +		fd = open("/dev/ptmx", O_RDWR|O_CREAT, 0);

You do open() to create the /dev/ptmx file in case it does not exist, right?
Why RDWR then?

> +		close(fd);
> +		mount("/dev/pts/ptmx", "/dev/ptmx", "", MS_BIND, 0);

Same, check return from mount(). It's better to fail earlier
than try to debug why sshd or vzctl enter doesn't work later.

> +	}
> +
> +	/*
> +	 * If we are using the user namespace, we will have the full capability
> +	 * set in the target namespace. So we don't need any of that.
> +	 */
> +	if (!arg->h->can_join_userns &&
> +		(ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
>   		return ret;
>   
>   	fill_container_param(arg, &create_param);
> @@ -224,6 +257,88 @@ static int ct_setlimits(vps_handler *h, envid_t veid, struct ub_struct *ub)
>   }
>   #undef add_value
>   
> +static int write_uid_gid_mapping(vps_handler *h, unsigned long uid, unsigned long gid, pid_t pid)
> +{
> +	char buf[STR_SIZE];
> +	char map[STR_SIZE];
> +	int fd;
> +	int len;
> +	int ret;
> +
> +	len = snprintf(map, sizeof(map), "0 %ld %d", uid, UID_GID_RANGE);
> +	snprintf(buf, sizeof(buf), "/proc/%d/uid_map", pid);
> +	if ((fd = open(buf, O_WRONLY)) < 0)
> +		goto out;
> +
> +	if ((write(fd, map, len) < 0))
> +		goto out;
> +
> +	close(fd);
> +
> +	len = snprintf(map, sizeof(map), "0 %ld %d", gid, UID_GID_RANGE);
> +	snprintf(buf, sizeof(map), "/proc/%d/gid_map", pid);
> +	if ((fd = open(buf, O_WRONLY)) < 0)
> +		goto out;
> +
> +	if ((write(fd, map, len) < 0))
> +		goto out;
> +	ret = 0;
> +out:
> +	if (fd > 0)
> +		close(fd);
> +	return ret;
> +}
> +
> +/*
> + * Those devices should exist in the container, and be valid device nodes with
> + * user access permission. But we need to be absolutely sure this is the case,
> + * so we will provide our own versions. That could actually happen since some
> + * distributions may come with emptied /dev's, waiting for udev to populate them.
> + * That won't happen, we do it ourselves.
> + */
> +static void create_devices(vps_handler *h, envid_t veid, const char *root)
> +{
> +	unsigned int i;
> +	char *devices[] = {
> +		"/dev/null",
> +		"/dev/zero",
> +		"/dev/random",
> +		"/dev/urandom",
> +	};
> +
> +	/*
> +	 * We will tolerate errors, and keep the container running, because it is
> +	 * likely we will be able to boot it to a barely functional state. But
> +	 * be vocal about it
> +	 */
> +	for (i = 0; i < ARRAY_SIZE(devices); i++) {
> +		char ct_devname[STR_SIZE];
> +		int ret;
> +
> +		ret = snprintf(ct_devname, sizeof(ct_devname), "%s/%s", root, devices[i]);
> +		if (ret < 0) {
> +			logger(-1, errno, "Could not allocate device string\n");

we don't need \n for logger() calls.

> +			continue;
> +		}
> +
> +		/*
> +		 * No need to be crazy about file flags. When we bind mount, the
> +		 * source permissions will be inherited.
> +		 */
> +		ret = open(ct_devname, O_RDWR|O_CREAT, 0);
> +		if (ret < 0) {
> +			logger(-1, errno, "Could not touch device %s\n", devices[i]);

ditto
> +			continue;
> +		}
> +		close(ret);
> +
> +		ret = mount(devices[i], ct_devname, "", MS_BIND, 0);
> +		if (ret < 0)
> +			logger(-1, errno, "Could not touch device %s\n", devices[i]);

1 ditto

2 s/touch/bind mount/

> +	}
> +
> +}
> +
>   static int ct_env_create(struct arg_start *arg)
>   {
>   
> @@ -233,6 +348,8 @@ static int ct_env_create(struct arg_start *arg)
>   	int ret;
>   	char procpath[STR_SIZE];
>   	char ctpath[STR_SIZE];
> +	int userns_p[2];
> +	int err;
>   
>   	stack_size = get_pagesize();
>   	if (stack_size < 0)
> @@ -272,17 +389,58 @@ static int ct_env_create(struct arg_start *arg)
>   	 * Belong in the setup phase
>   	 */
>   	clone_flags = SIGCHLD;
> -	/* FIXME: USERNS is still work in progress */
>   	clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC;
>   	clone_flags |= CLONE_NEWNET|CLONE_NEWNS;
>   
> +	if (!arg->h->can_join_userns) {
> +		logger(-1, 0, "WARNING: Running container unprivileged. USER_NS not supported");
> +
> +		userns_p[0] = userns_p[1] = -1;
> +	} else {
> +		clone_flags |= CLONE_NEWUSER;
> +		if (pipe(userns_p) < 0) {
> +			logger(-1, errno, "Can not create userns pipe");
> +			return VZ_RESOURCE_ERROR;
> +		}
> +	}
> +	arg->userns_p = userns_p[0];
> +
> +	if (arg->h->can_join_userns) {
> +		create_devices(arg->h, arg->veid, arg->res->fs.root);
> +	}
> +
>   	ret = clone(_env_create, child_stack, clone_flags, arg);
>   	if (ret  < 0) {
>   		logger(-1, errno, "Unable to clone");
>   		/* FIXME: remove ourselves from container first */
>   		destroy_container(arg->veid);
>   		return VZ_RESOURCE_ERROR;
> +	} else if (arg->h->can_join_userns) {

You don't need "else" here, do you?

> +		/*
> +		 * Now we need to write to the mapping file. It has to be us,
> +		 * since CAP_SETUID is required in the parent namespace. vzctl
> +		 * is run as root, so we should have it. But our cloned kid
> +		 * will start as the overflow uid 65534 in the new namespace.
> +		 */
> +		if (write_uid_gid_mapping(arg->h, *arg->res->misc.local_uid,
> +					  *arg->res->misc.local_gid, ret))
> +			return VZ_RESOURCE_ERROR;

Do we need something else here? Maybe destroy_container(arg->veid) and 
kill the child?
> +
> +		/*
> +		 * Nothing should proceed userns wide until we have the
> +		 * mapping.  That creates many non-determisnitic behaviors

Don't be an antimisnite, I'm sure you're not. Please correct your spelling.

> +		 * since some runs will execute with the mapping already done,
> +		 * while others with the mapping off. This is particularly
> +		 * important for setuid, for instance. It will categorically
> +		 * fail if called before a mapping is in place.
> +		 */
> +		if ((userns_p[1] != -1) &&
> +			write(userns_p[1], &err, sizeof(err)) != sizeof(err)) {
> +			logger(-1, errno, "Unable to read from userns pipe");

s/read from/write to/

and if write() fails we do not close fd

> +			return VZ_RESOURCE_ERROR;
> +		}
>   	}
> +	close(userns_p[1]); /* other end closed by child */

Are you sure? This is what clone(2) man page says:

               If CLONE_FILES is not set, the child process inherits a 
copy of all file descriptors opened in the calling process at the time 
of clone().   (The
               duplicated  file  descriptors  in  the  child refer to 
the same open file descriptions (see open(2)) as the corresponding file 
descriptors in the
               calling process.)  Subsequent operations that open or 
close file descriptors, or change file descriptor flags, performed by  
either  the  calling
               process or the child process do not affect the other process.

Essentially you have two sets of fds after clone(), and you need to 
close all them separately. Usually we do close the end we don't need 
right after clone() and the other one after use.

Same in child (i.e. _env_create()) -- we should close one end right at 
the start, and the other end after using it. That, I guess, means, that 
you have to pass both fds to the child -- the other one just for the 
sake of closing.

>   
>   	snprintf(procpath, STR_SIZE, "/proc/%d/ns/net", ret);
>   	ret = symlink(procpath, ctpath);
> @@ -304,6 +462,7 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
>   	int ret = VZ_RESOURCE_ERROR;
>   	int err;
>   	bool joined_mnt_ns = false;
> +	int fd;
>   
>   	if (!h->can_join_pidns) {
>   		logger(-1, 0, "Kernel lacks setns for pid namespace");
> @@ -328,17 +487,47 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
>   		goto out;
>   	}
>   
> +	/*
> +	 * Because all namespaces are associated with an owner userns,
> +	 * and capabilities may be needed for issuing setns syscalls into
> +	 * some key target namespaces (like the mount namespace), we will
> +	 * first enter the user namespace if it is available. Only then we
> +	 * scan all others and join them as they appear
> +	 */
> +	if (h->can_join_userns) {
> +		if (snprintf(path, sizeof(path), "/proc/%d/ns/user", task_pid) < 0)
> +			goto out;
> +
> +		if ((fd = open(path, O_RDONLY)) < 0)
> +			goto out;
> +
> +		if (setns(fd, CLONE_NEWUSER)) {
> +			logger(-1, errno, "Failed to set context for user namespace");
> +			close(fd);
> +			goto out;
> +		}
> +		close(fd);
> +		setuid(0);
> +		setgid(0);
> +	}
> +
> +	ret = VZ_RESOURCE_ERROR;
>   	while ((ep = readdir (dp))) {
> -		int fd;
>   		if (!strcmp(ep->d_name, "."))
>   			continue;
>   		if (!strcmp(ep->d_name, ".."))
>   			continue;
>   
> +		/* already joined */
> +		if ((!strcmp(ep->d_name, "user")))
> +			continue;
> +
>   		if (snprintf(path, sizeof(path), "/proc/%d/ns/%s", task_pid, ep->d_name) < 0)
>   			goto out;
> +
>   		if ((fd = open(path, O_RDONLY)) < 0)
>   			goto out;
> +
>   		if (setns(fd, 0))
>   			logger(-1, errno, "Failed to set context for %s", ep->d_name);
>   		close(fd);
> @@ -576,7 +765,9 @@ int ct_do_open(vps_handler *h, vps_param *param)
>   {
>   	int ret;
>   	char path[STR_SIZE];
> +	char upath[STR_SIZE];
>   	struct stat st;
> +	unsigned long *local_uid = param->res.misc.local_uid;
>   
>   	ret = container_init();
>   	if (ret) {
> @@ -592,6 +783,9 @@ int ct_do_open(vps_handler *h, vps_param *param)
>   	if (snprintf(path, sizeof(path), "/proc/%d/ns/pid", getpid()) < 0)
>   		return VZ_RESOURCE_ERROR;
>   
> +	if (snprintf(upath, sizeof(upath), "/proc/%d/ns/user", getpid()) < 0)
> +		return VZ_RESOURCE_ERROR;
> +
>   	ret = mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
>   
>   	if (ret && (errno != EEXIST)) {
> @@ -601,6 +795,20 @@ int ct_do_open(vps_handler *h, vps_param *param)
>   	}
>   
>   	h->can_join_pidns = !stat(path, &st);
> +	/*
> +	 * Being able to join the user namespace is a good indication that the
> +	 * user namespace is complete. For a long time, the user namespace
> +	 * existed, but were far away from being feature complete.  When
> +	 * running in such a kernel, joining the user namespace will just
> +	 * cripple our container, since we won't be able to do anything. It is
> +	 * only good for people who are okay running containers as root.
> +	 *
> +	 * It is not enough, however, for user namespaces to be present in the
> +	 * kernel. The container must have been setup to use it (we need the
> +	 * mapped user to own the files, etc. So we also need to find suitable
> +	 * configuration in the config files.
> +	 */
> +	h->can_join_userns = !stat(upath, &st) && local_uid;
>   	h->is_run = ct_is_run;
>   	h->enter = ct_enter;
>   	h->destroy = ct_destroy;