[Devel] [PATCH v3 4/9] user namespace support for upstream containers

Mon Apr 29 22:16:26 PDT 2013

From: Glauber Costa <glommer at parallels.com>

This patch allows the execution of unprivileged containers running on top
of an upstream Linux kernel. We will run at whatever UID is found in the
configuration file (so far empty, thus disabled).

Signed-off-by: Glauber Costa <glommer at parallels.com>
---
 include/env.h      |   1 +
 include/types.h    |   1 +
 src/lib/hooks_ct.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 214 insertions(+), 4 deletions(-)

diff --git a/include/env.h b/include/env.h
index 06729f6..4fef438 100644
--- a/include/env.h
+++ b/include/env.h
@@ -116,6 +116,7 @@ struct arg_start {
 	vps_handler *h;
 	void *data;
 	env_create_FN fn;
+	int userns_p; /* while running in userns, there's extra sync needed */
 };
 
 struct env_create_param3;
diff --git a/include/types.h b/include/types.h
index a4bce73..fa511aa 100644
--- a/include/types.h
+++ b/include/types.h
@@ -95,6 +95,7 @@ typedef struct vps_handler {
 	int vzfd;	/**< /dev/vzctl file descriptor. */
 	int stdfd;
 	int can_join_pidns; /* can't enter otherwise */
+	int can_join_userns; /* can't run non privileged otherwise */
 	int (*is_run)(struct vps_handler *h, envid_t veid);
 	int (*enter)(struct vps_handler *h, envid_t veid, const char *root, int flags);
 	int (*destroy)(struct vps_handler *h, envid_t veid);
diff --git a/src/lib/hooks_ct.c b/src/lib/hooks_ct.c
index f24d91e..b0cb359 100644
--- a/src/lib/hooks_ct.c
+++ b/src/lib/hooks_ct.c
@@ -66,6 +66,8 @@ static int sys_setns(int fd, int nstype)
 # define MS_PRIVATE (1 << 18)
 #endif
 
+#define UID_GID_RANGE 100000 /* how many users per container */
+
 /* This function is there in GLIBC, but not in headers */
 extern int pivot_root(const char * new_root, const char * put_old);
 
@@ -138,10 +140,41 @@ static int _env_create(void *data)
 	struct env_create_param3 create_param;
 	int ret;
 
-	if ((ret = ct_chroot(arg->res->fs.root)))
+	if ((arg->userns_p != -1) && (read(arg->userns_p, &ret, sizeof(ret)) == 0))
+		return -1;
+
+	ret = ct_chroot(arg->res->fs.root);
+	close(arg->userns_p);
+	/* Probably means chroot failed */
+	if (ret)
 		return ret;
 
-	if ((ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
+	if (arg->h->can_join_userns) {
+		int fd;
+		setuid(0);
+		setgid(0);
+		/* 
+		 * We need the special flag "newinstance". This is a requirement
+		 * of the userns-aware implementation of devpts as of Linux 3.9.
+		 * Because of that special requirement, we do it here rather than
+		 * later.
+		 */
+		mount("devpts", "/dev/pts", "devpts", 0, "newinstance");
+		/* /dev/ptmx, if it even exists, would refer to the root ptmx.
+		 * We don't want that, we want our newly created instance to contain
+		 * all ptys. So we bind mount the root device here
+		 */
+		fd = open("/dev/ptmx", O_RDWR|O_CREAT, 0);
+		close(fd);
+		mount("/dev/pts/ptmx", "/dev/ptmx", "", MS_BIND, 0);
+	}
+
+	/*
+	 * If we are using the user namespace, we will have the full capability
+	 * set in the target namespace. So we don't need any of that.
+	 */
+	if (!arg->h->can_join_userns &&
+		(ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
 		return ret;
 
 	fill_container_param(arg, &create_param);
@@ -224,6 +257,88 @@ static int ct_setlimits(vps_handler *h, envid_t veid, struct ub_struct *ub)
 }
 #undef add_value
 
+/* Write "0 <id> UID_GID_RANGE" to /proc/<pid>/uid_map and gid_map; returns 0, or -1 on any failure. */
+static int write_uid_gid_mapping(vps_handler *h, unsigned long uid, unsigned long gid, pid_t pid)
+{
+	char buf[STR_SIZE];
+	char map[STR_SIZE];
+	int fd, len;
+	int ret = -1; /* pessimistic default so every "goto out" reports failure */
+
+	len = snprintf(map, sizeof(map), "0 %lu %d", uid, UID_GID_RANGE);
+	snprintf(buf, sizeof(buf), "/proc/%d/uid_map", pid);
+	if ((fd = open(buf, O_WRONLY)) < 0)
+		goto out;
+
+	if (write(fd, map, len) != len) /* treat a short write as a failure too */
+		goto out;
+
+	close(fd);
+
+	len = snprintf(map, sizeof(map), "0 %lu %d", gid, UID_GID_RANGE);
+	snprintf(buf, sizeof(buf), "/proc/%d/gid_map", pid);
+	if ((fd = open(buf, O_WRONLY)) < 0)
+		goto out;
+
+	if (write(fd, map, len) != len)
+		goto out;
+	ret = 0;
+out:
+	if (fd >= 0) /* 0 is a valid descriptor; only a failed open() leaves fd negative */
+		close(fd);
+	return ret;
+}
+
+/*
+ * Those devices should exist in the container, and be valid device nodes with
+ * user access permission. But we need to be absolutely sure this is the case,
+ * so we will provide our own versions. They could actually be missing, since some
+ * distributions ship an emptied /dev and wait for udev to populate it.
+ * We do not rely on that happening; we do it ourselves.
+ */
+static void create_devices(vps_handler *h, envid_t veid, const char *root)
+{
+	unsigned int i;
+	static const char *const devices[] = {
+		"/dev/null",
+		"/dev/zero",
+		"/dev/random",
+		"/dev/urandom",
+	};
+
+	/*
+	 * We will tolerate errors, and keep the container running, because it is
+	 * likely we will be able to boot it to a barely functional state. But
+	 * be vocal about it
+	 */
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		char ct_devname[STR_SIZE];
+		int ret;
+
+		ret = snprintf(ct_devname, sizeof(ct_devname), "%s/%s", root, devices[i]);
+		if (ret < 0 || (size_t)ret >= sizeof(ct_devname)) {
+			logger(-1, ret < 0 ? errno : ENAMETOOLONG, "Could not allocate device string\n");
+			continue;
+		}
+
+		/*
+		 * No need to be crazy about file flags. When we bind mount, the
+		 * source permissions will be inherited.
+		 */
+		ret = open(ct_devname, O_RDWR|O_CREAT, 0);
+		if (ret < 0) {
+			logger(-1, errno, "Could not touch device %s\n", devices[i]);
+			continue;
+		}
+		close(ret);
+
+		/* Bind the host's device node over the placeholder file just created. */
+		ret = mount(devices[i], ct_devname, "", MS_BIND, 0);
+		if (ret < 0)
+			logger(-1, errno, "Could not bind mount device %s\n", devices[i]);
+	}
+}
+
 static int ct_env_create(struct arg_start *arg)
 {
 
@@ -233,6 +348,8 @@ static int ct_env_create(struct arg_start *arg)
 	int ret;
 	char procpath[STR_SIZE];
 	char ctpath[STR_SIZE];
+	int userns_p[2];
+	int err;
 
 	stack_size = get_pagesize();
 	if (stack_size < 0)
@@ -272,17 +389,58 @@ static int ct_env_create(struct arg_start *arg)
 	 * Belong in the setup phase
 	 */
 	clone_flags = SIGCHLD;
-	/* FIXME: USERNS is still work in progress */
 	clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC;
 	clone_flags |= CLONE_NEWNET|CLONE_NEWNS;
 
+	if (!arg->h->can_join_userns) {
+		logger(-1, 0, "WARNING: Running container unprivileged. USER_NS not supported");
+
+		userns_p[0] = userns_p[1] = -1;
+	} else {
+		clone_flags |= CLONE_NEWUSER;
+		if (pipe(userns_p) < 0) {
+			logger(-1, errno, "Can not create userns pipe");
+			return VZ_RESOURCE_ERROR;
+		}
+	}
+	arg->userns_p = userns_p[0];
+
+	if (arg->h->can_join_userns) {
+		create_devices(arg->h, arg->veid, arg->res->fs.root);
+	}
+
 	ret = clone(_env_create, child_stack, clone_flags, arg);
 	if (ret  < 0) {
 		logger(-1, errno, "Unable to clone");
 		/* FIXME: remove ourselves from container first */
 		destroy_container(arg->veid);
 		return VZ_RESOURCE_ERROR;
+	} else if (arg->h->can_join_userns) {
+		/*
+		 * Now we need to write to the mapping file. It has to be us,
+		 * since CAP_SETUID is required in the parent namespace. vzctl
+		 * is run as root, so we should have it. But our cloned kid
+		 * will start as the overflow uid 65534 in the new namespace.
+		 */
+		if (write_uid_gid_mapping(arg->h, *arg->res->misc.local_uid,
+					  *arg->res->misc.local_gid, ret))
+			return VZ_RESOURCE_ERROR;
+
+		/*
+		 * Nothing should proceed userns wide until we have the
+		 * mapping.  Otherwise behavior becomes non-deterministic,
+		 * since some runs will execute with the mapping already done,
+		 * while others with the mapping off. This is particularly
+		 * important for setuid, for instance. It will categorically
+		 * fail if called before a mapping is in place.
+		 */
+		if ((userns_p[1] != -1) &&
+			write(userns_p[1], &err, sizeof(err)) != sizeof(err)) {
+			logger(-1, errno, "Unable to read from userns pipe");
+			return VZ_RESOURCE_ERROR;
+		}
 	}
+	close(userns_p[1]); /* other end closed by child */
 
 	snprintf(procpath, STR_SIZE, "/proc/%d/ns/net", ret);
 	ret = symlink(procpath, ctpath);
@@ -304,6 +462,7 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
 	int ret = VZ_RESOURCE_ERROR;
 	int err;
 	bool joined_mnt_ns = false;
+	int fd;
 
 	if (!h->can_join_pidns) {
 		logger(-1, 0, "Kernel lacks setns for pid namespace");
@@ -328,17 +487,47 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
 		goto out;
 	}
 
+	/*
+	 * Because all namespaces are associated with an owner userns,
+	 * and capabilities may be needed for issuing setns syscalls into
+	 * some key target namespaces (like the mount namespace), we will
+	 * first enter the user namespace if it is available. Only then we 
+	 * scan all others and join them as they appear
+	 */
+	if (h->can_join_userns) {
+		if (snprintf(path, sizeof(path), "/proc/%d/ns/user", task_pid) < 0)
+			goto out;
+
+		if ((fd = open(path, O_RDONLY)) < 0)
+			goto out;
+
+		if (setns(fd, CLONE_NEWUSER)) {
+			logger(-1, errno, "Failed to set context for user namespace");
+			close(fd);
+			goto out;
+		}
+		close(fd);
+		setuid(0);
+		setgid(0);
+	}
+
+	ret = VZ_RESOURCE_ERROR;
 	while ((ep = readdir (dp))) {
-		int fd;
 		if (!strcmp(ep->d_name, "."))
 			continue;
 		if (!strcmp(ep->d_name, ".."))
 			continue;
 
+		/* already joined */
+		if ((!strcmp(ep->d_name, "user"))) 
+			continue;
+
 		if (snprintf(path, sizeof(path), "/proc/%d/ns/%s", task_pid, ep->d_name) < 0)
 			goto out;
+
 		if ((fd = open(path, O_RDONLY)) < 0)
 			goto out;
+
 		if (setns(fd, 0))
 			logger(-1, errno, "Failed to set context for %s", ep->d_name);
 		close(fd);
@@ -576,7 +765,9 @@ int ct_do_open(vps_handler *h, vps_param *param)
 {
 	int ret;
 	char path[STR_SIZE];
+	char upath[STR_SIZE];
 	struct stat st;
+	unsigned long *local_uid = param->res.misc.local_uid;
 
 	ret = container_init();
 	if (ret) {
@@ -592,6 +783,9 @@ int ct_do_open(vps_handler *h, vps_param *param)
 	if (snprintf(path, sizeof(path), "/proc/%d/ns/pid", getpid()) < 0)
 		return VZ_RESOURCE_ERROR;
 
+	if (snprintf(upath, sizeof(upath), "/proc/%d/ns/user", getpid()) < 0)
+		return VZ_RESOURCE_ERROR;
+
 	ret = mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
 
 	if (ret && (errno != EEXIST)) {
@@ -601,6 +795,20 @@ int ct_do_open(vps_handler *h, vps_param *param)
 	}
 
 	h->can_join_pidns = !stat(path, &st);
+	/*
+	 * Being able to join the user namespace is a good indication that the
+	 * user namespace is complete. For a long time, the user namespace
+	 * existed, but was far from being feature complete.  When
+	 * running in such a kernel, joining the user namespace will just
+	 * cripple our container, since we won't be able to do anything. It is
+	 * only good for people who are okay running containers as root.
+	 *
+	 * It is not enough, however, for user namespaces to be present in the
+	 * kernel. The container must have been set up to use it (we need the
+	 * mapped user to own the files, etc.). So we also need to find suitable
+	 * configuration in the config files.
+	 */
+	h->can_join_userns = !stat(upath, &st) && local_uid;
 	h->is_run = ct_is_run;
 	h->enter = ct_enter;
 	h->destroy = ct_destroy;
-- 
1.7.11.7