[Devel] [PATCH v2 3/8] user namespace support for upstream containers
Kir Kolyshkin
kir at openvz.org
Mon Apr 15 18:12:42 PDT 2013
On 03/22/2013 03:48 AM, Glauber Costa wrote:
> This patch allows the execution of unprivileged containers running on top
> of an upstream Linux Kernel. We will run at whatever UID is found in the
> configuration file.
>
> Signed-off-by: Glauber Costa <glommer at parallels.com>
> ---
> include/env.h | 1 +
> include/types.h | 1 +
> src/lib/hooks_ct.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 191 insertions(+), 5 deletions(-)
>
> diff --git a/include/env.h b/include/env.h
> index 1628bbf..d41df2e 100644
> --- a/include/env.h
> +++ b/include/env.h
> @@ -116,6 +116,7 @@ struct arg_start {
> vps_handler *h;
> void *data;
> env_create_FN fn;
> + int userns_p; /* while running in userns, there's extra sync needed */
> };
>
> struct env_create_param3;
> diff --git a/include/types.h b/include/types.h
> index ceecb93..54eb1f4 100644
> --- a/include/types.h
> +++ b/include/types.h
> @@ -95,6 +95,7 @@ typedef struct vps_handler {
> int vzfd; /**< /dev/vzctl file descriptor. */
> int stdfd;
> int can_join_pidns; /* can't enter otherwise */
> + int can_join_userns; /* can't run non privileged otherwise */
> int (*is_run)(struct vps_handler *h, envid_t veid);
> int (*enter)(struct vps_handler *h, envid_t veid, const char *root, int flags);
> int (*destroy)(struct vps_handler *h, envid_t veid);
> diff --git a/src/lib/hooks_ct.c b/src/lib/hooks_ct.c
> index 29d7eea..6bd27c1 100644
> --- a/src/lib/hooks_ct.c
> +++ b/src/lib/hooks_ct.c
> @@ -66,6 +66,8 @@ static int sys_setns(int fd, int nstype)
> # define MS_PRIVATE (1 << 18)
> #endif
>
> +#define UID_GID_RANGE 100000 /* how many users per container */
> +
> /* This function is there in GLIBC, but not in headers */
> extern int pivot_root(const char * new_root, const char * put_old);
>
> @@ -138,10 +140,39 @@ static int _env_create(void *data)
> struct env_create_param3 create_param;
> int ret;
>
> - if ((ret = ct_chroot(arg->res->fs.root)))
> + if ((arg->userns_p != -1) && (read(arg->userns_p, &ret, sizeof(ret)) == 0))
> + return -1;
> +
> + ret = ct_chroot(arg->res->fs.root);
> + close(arg->userns_p);
> + /* Probably means chroot failed */
> + if (ret)
> return ret;
>
> - if ((ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
> + if (arg->h->can_join_userns) {
> + setuid(0);
> + setgid(0);
> + /*
> + * We need the special flag "newinstance". This is a requirement
> + * of the userns-aware implementation of devpts as of Linux 3.9.
> + * Because of that special requirement, we do it here rather than
> + * later.
> + */
> + mount("devpts", "/dev/pts", "devpts", 0, "newinstance");
> + /* /dev/ptmx, if it even exists, would refer to the root ptmx.
> + * We don't want that, we want our newly created instance to contain
> + * all ptys. So we bind mount the root device here
> + */
> + open("/dev/ptmx", O_RDWR|O_CREAT, 0);
> + mount("/dev/pts/ptmx", "/dev/ptmx", "", MS_BIND, 0);
> + }
> +
> + /*
> + * If we are using the user namespace, we will have the full capability
> + * set in the target namespace. So we don't need any of that.
> + */
> + if (!arg->h->can_join_userns &&
> + (ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
> return ret;
>
> fill_container_param(arg, &create_param);
> @@ -153,6 +184,79 @@ static int _env_create(void *data)
> return exec_container_init(arg, &create_param);
> }
>
> +static int write_uid_gid_mapping(vps_handler *h, unsigned long uid, unsigned long gid, pid_t pid)
> +{
> + char buf[STR_SIZE];
> + char map[STR_SIZE];
> + int fd;
> +
> + snprintf(map, sizeof(map), "0 %ld %d", uid, UID_GID_RANGE);
> + snprintf(buf, sizeof(buf), "/proc/%d/uid_map", pid);
> + if ((fd = open(buf, O_WRONLY)) < 0)
> + return -1;
> +
> + if ((write(fd, map, sizeof(map)) < 0))
> + return -1;
You write the whole buffer (STR_SIZE bytes) here, while just strlen(map)
(or the value returned by snprintf) should be enough.
> +
> + snprintf(map, sizeof(map), "0 %ld %d", gid, UID_GID_RANGE);
> + snprintf(buf, sizeof(map), "/proc/%d/gid_map", pid);
> + if ((fd = open(buf, O_WRONLY)) < 0)
> + return -1;
> +
> + if ((write(fd, map, sizeof(map)) < 0))
> + return -1;
Ditto: write only strlen(map) bytes here, not the whole buffer.
> +
1. Close the files opened (neither descriptor is closed, on success or on
the error paths).
2. You can reuse write_val() from src/lib/env.c, or maybe use
fopen/fprintf/fclose(). Up to you.
> + return 0;
> +}
> +
> +/*
> + * Those devices should exist in the container, and be valid device nodes with
> + * user access permission. But we need to be absolutely sure this is the case,
> + * so we will provide our own versions. That could actually happen since some
> + * distributions may come with emptied /dev's, waiting for udev to populate them.
> + * That won't happen, we do it ourselves.
> + */
> +static void create_devices(vps_handler *h, envid_t veid, const char *root)
> +{
> + unsigned int i;
> + char *devices[] = {
> + "/dev/null",
> + "/dev/zero",
> + "/dev/random",
> + "/dev/urandom",
> + };
> +
> + /*
> + * We will tolerate errors, and keep the container running, because it is
> + * likely we will be able to boot it to a barely functional state. But
> + * be vocal about it
> + */
> + for (i = 0; i < ARRAY_SIZE(devices); i++) {
> + char ct_devname[STR_SIZE];
> + int ret;
> +
> + ret = snprintf(ct_devname, sizeof(ct_devname), "%s/%s", root, devices[i]);
> + if (ret < 0) {
> + logger(-1, errno, "Could not allocate device string\n");
> + continue;
> + }
> +
> + /*
> + * No need to be crazy about file flags. When we bind mount, the
> + * source permissions will be inherited.
> + */
> + ret = open(ct_devname, O_RDWR|O_CREAT, 0);
> + if (ret < 0) {
> + logger(-1, errno, "Could not touch device %s\n", devices[i]);
> + continue;
> + }
> + ret = mount(devices[i], ct_devname, "", MS_BIND, 0);
> + if (ret < 0)
> + logger(-1, errno, "Could not touch device %s\n", devices[i]);
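You need to close() the descriptor returned by open() above. Note that ret
is overwritten by the mount() call, so the fd has to be kept in a separate
variable.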
> + }
> +
> +}
> +
> static int ct_env_create(struct arg_start *arg)
> {
>
> @@ -162,7 +266,8 @@ static int ct_env_create(struct arg_start *arg)
> int ret;
> char procpath[STR_SIZE];
> char ctpath[STR_SIZE];
> -
> + int userns_p[2];
> + int err;
>
> /* non-fatal */
> if ((ret = ct_destroy(arg->h, arg->veid)))
> @@ -190,16 +295,54 @@ static int ct_env_create(struct arg_start *arg)
> * Belong in the setup phase
> */
> clone_flags = SIGCHLD;
> - /* FIXME: USERNS is still work in progress */
> clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC;
> clone_flags |= CLONE_NEWNET|CLONE_NEWNS;
>
> + if (!arg->h->can_join_userns) {
> + logger(-1, 0, "WARNING: Running container unprivileged. USER_NS not supported");
> +
> + userns_p[0] = userns_p[1] = -1;
> + } else {
> + clone_flags |= CLONE_NEWUSER;
> + if (pipe(userns_p) < 0) {
> + logger(-1, errno, "Can not create userns pipe");
> + return VZ_RESOURCE_ERROR;
> + }
> + }
> + arg->userns_p = userns_p[0];
It would be good to have an explicit close() here.
> +
> + create_devices(arg->h, arg->veid, arg->res->fs.root);
> +
> ret = clone(_env_create, child_stack, clone_flags, arg);
> if (ret < 0) {
> logger(-1, errno, "Unable to clone");
> /* FIXME: remove ourselves from container first */
> destroy_container(arg->veid);
> return VZ_RESOURCE_ERROR;
> + } else if (arg->h->can_join_userns) {
> + /*
> + * Now we need to write to the mapping file. It has to be us,
> + * since CAP_SETUID is required in the parent namespace. vzctl
> + * is run as root, so we should have it. But our cloned kid
> + * will start as the overflow uid 65534 in the new namespace.
> + */
> + if (write_uid_gid_mapping(arg->h, *arg->res->misc.local_uid,
> + *arg->res->misc.local_gid, ret))
> + return VZ_RESOURCE_ERROR;
> +
> + /*
> + * Nothing should proceed userns wide until we have the
> + * mapping. That creates many non-determisnitic behaviors
> + * since some runs will execute with the mapping already done,
> + * while others with the mapping off. This is particularly
> + * important for setuid, for instance. It will categorically
> + * fail if called before a mapping is in place.
> + */
> + if ((userns_p[1] != -1) &&
> + write(userns_p[1], &err, sizeof(err)) != sizeof(err)) {
> + logger(-1, errno, "Unable to read from userns pipe");
> + return -1;
Return one of the VZ_* error codes here (VZ_SOME_ERROR_CODE, e.g.
VZ_RESOURCE_ERROR as used just above), not -1.
> + }
> }
>
> snprintf(procpath, STR_SIZE, "/proc/%d/ns/net", ret);
> @@ -221,6 +364,7 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
> pid_t task_pid;
> int ret = VZ_RESOURCE_ERROR;
> bool joined_mnt_ns = false;
> + int fd;
>
> if (!h->can_join_pidns) {
> logger(-1, 0, "Kernel lacks setns for pid namespace");
> @@ -245,18 +389,45 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags)
> return VZ_RESOURCE_ERROR;
> }
>
> + /*
> + * Because all namespaces are associated with an owner userns,
> + * and capabilities may be needed for issuing setns syscalls into
> + * some key target namespaces (like the mount namespace), we will
> + * first enter the user namespace if it is available. Only then we
> + * scan all others and join them as they appear
> + */
> + if (h->can_join_userns) {
> + if (snprintf(path, sizeof(path), "/proc/%d/ns/user", task_pid) < 0)
> + goto out;
> +
> + if ((fd = open(path, O_RDONLY)) < 0)
> + goto out;
> +
> + if (setns(fd, CLONE_NEWUSER)) {
> + logger(-1, errno, "Failed to set context for user namespace");
> + goto out;
> + }
You need to close(fd) here once setns() is done (and on the error path too).
> + setuid(0);
> + setgid(0);
> + }
> +
> ret = VZ_RESOURCE_ERROR;
> while ((ep = readdir (dp))) {
> - int fd;
> if (!strcmp(ep->d_name, "."))
> continue;
> if (!strcmp(ep->d_name, ".."))
> continue;
>
> + /* already joined */
> + if ((!strcmp(ep->d_name, "user")))
> + continue;
> +
> if (snprintf(path, sizeof(path), "/proc/%d/ns/%s", task_pid, ep->d_name) < 0)
> goto out;
> +
> if ((fd = open(path, O_RDONLY)) < 0)
> goto out;
> +
> if (setns(fd, 0))
> logger(-1, errno, "Failed to set context for %s", ep->d_name);
>
> @@ -562,6 +733,7 @@ int ct_do_open(vps_handler *h)
> {
> int ret;
> char path[STR_SIZE];
> + char upath[STR_SIZE];
> struct stat st;
>
> ret = container_init();
> @@ -578,6 +750,9 @@ int ct_do_open(vps_handler *h)
> if (snprintf(path, sizeof(path), "/proc/%d/ns/pid", getpid()) < 0)
> return VZ_RESOURCE_ERROR;
>
> + if (snprintf(upath, sizeof(upath), "/proc/%d/ns/user", getpid()) < 0)
> + return VZ_RESOURCE_ERROR;
> +
> ret = mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
>
> if (ret && (errno != EEXIST)) {
> @@ -586,6 +761,15 @@ int ct_do_open(vps_handler *h)
> }
>
> h->can_join_pidns = !stat(path, &st);
> + /*
> + * Being able to join the user namespace is a good indication that the
> + * user namespace is complete. For a long time, the user namespace
> + * existed, but were far away from being feature complete. When
> + * running in such a kernel, joining the user namespace will just
> + * cripple our container, since we won't be able to do anything. It is
> + * only good for people who are okay running containers as root
> + */
> + h->can_join_userns = !stat(upath, &st);
> h->is_run = ct_is_run;
> h->enter = ct_enter;
> h->destroy = ct_destroy;
More information about the Devel
mailing list