[CRIU] [PATCH 3/3] sysctl: move sysctl calls to usernsd
Tycho Andersen
tycho.andersen at canonical.com
Wed Sep 16 20:02:38 PDT 2015
When in a userns, tasks can't write to certain sysctl files:
(00.009653) 1: Error (sysctl.c:142): Can't open sysctl kernel/hostname: Permission denied
Mostly for my own education in what is required to port something to be
userns restorable, I ported the sysctl stuff. A potential concern for this
patch is that copying structures with pointers around is kind of gory. I
did it ad-hoc here, but it may be worth inventing some mechanisms to make
it easier, although I'm not sure what exactly that would look like
(potentially re-using some of the protobuf bits; I'll investigate this more
if it looks helpful when doing the cgroup user namespaces port?).
Another issue is that there is not a great way to return non-fd stuff in
memory right now from userns_call; one of the little hacks in this code
would be "simplified" if we invented a way to do this.
v2: coalesce the individual struct sysctl_req requests into one big
sysctl_userns_req that is in a contiguous region of memory so that we
can pass it via userns_call. Hopefully nobody finds my little ascii
diagram too offensive :)
v3: use the fork/setns trick to change the sysctl values in the right ns for
IPC/UTS nses; see inline comment for details
Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
include/namespaces.h | 2 +-
include/sysctl.h | 2 +-
include/util.h | 1 +
ipc_ns.c | 12 +-
kerndat.c | 4 +-
net.c | 2 +-
sysctl.c | 328 +++++++++++++++++++++++++++++++++++++++++++++------
test/zdtm.sh | 2 -
util.c | 20 ++++
uts_ns.c | 2 +-
10 files changed, 322 insertions(+), 53 deletions(-)
diff --git a/include/namespaces.h b/include/namespaces.h
index 00e0630..59385f4 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -95,7 +95,7 @@ typedef int (*uns_call_t)(void *arg, int fd, pid_t pid);
*/
#define UNS_FDOUT 0x2
-#define MAX_UNSFD_MSG_SIZE 256
+#define MAX_UNSFD_MSG_SIZE 4096
/*
* When we're restoring inside user namespace, some things are
diff --git a/include/sysctl.h b/include/sysctl.h
index 610fbf6..b949a40 100644
--- a/include/sysctl.h
+++ b/include/sysctl.h
@@ -8,7 +8,7 @@ struct sysctl_req {
int flags;
};
-extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op);
+extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns);
enum {
CTL_READ,
diff --git a/include/util.h b/include/util.h
index f2300a9..eeb0fb7 100644
--- a/include/util.h
+++ b/include/util.h
@@ -261,4 +261,5 @@ void split(char *str, char token, char ***out, int *n);
int fd_has_data(int lfd);
+const char *ns_to_string(unsigned int ns);
#endif /* __CR_UTIL_H__ */
diff --git a/ipc_ns.c b/ipc_ns.c
index c6e2eeb..37a9626 100644
--- a/ipc_ns.c
+++ b/ipc_ns.c
@@ -181,7 +181,7 @@ static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *ms
{ "kernel/msgmax", &msgmax, CTL_U32 },
};
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC);
if (ret < 0) {
pr_err("Failed to read max IPC message size\n");
goto err;
@@ -313,7 +313,7 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op)
int ret;
- ret = sysctl_op(req, ARRAY_SIZE(req), op);
+ ret = sysctl_op(req, ARRAY_SIZE(req), op, CLONE_NEWIPC);
if (ret)
return ret;
@@ -322,7 +322,7 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op)
return 0;
}
- return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op);
+ return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op, CLONE_NEWIPC);
}
/*
@@ -555,7 +555,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem)
};
struct semid_ds semid;
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
if (ret < 0) {
pr_err("Failed to set desired IPC sem ID\n");
return ret;
@@ -691,7 +691,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq)
};
struct msqid_ds msqid;
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
if (ret < 0) {
pr_err("Failed to set desired IPC msg ID\n");
return ret;
@@ -802,7 +802,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
};
struct shmid_ds shmid;
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
if (ret < 0) {
pr_err("Failed to set desired IPC shm ID\n");
return ret;
diff --git a/kerndat.c b/kerndat.c
index e57c6fd..d6e54f8 100644
--- a/kerndat.c
+++ b/kerndat.c
@@ -221,7 +221,7 @@ static int tcp_read_sysctl_limits(void)
* Lets figure out which exactly amount of memory is
* availabe for send/read queues on restore.
*/
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
if (ret) {
pr_warn("TCP mem sysctls are not available. Using defaults.\n");
goto out;
@@ -268,7 +268,7 @@ static int get_last_cap(void)
{ "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 },
};
- return sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
+ return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
}
static bool kerndat_has_memfd_create(void)
diff --git a/net.c b/net.c
index 082ccb6..f5017c7 100644
--- a/net.c
+++ b/net.c
@@ -120,7 +120,7 @@ static int ipv4_conf_op(char *tgt, int *conf, int op, NetnsEntry **netns)
ri++;
}
- ret = sysctl_op(req, ri, op);
+ ret = sysctl_op(req, ri, op, CLONE_NEWNET);
if (ret < 0) {
pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
return -1;
diff --git a/sysctl.c b/sysctl.c
index b059140..2a46f66 100644
--- a/sysctl.c
+++ b/sysctl.c
@@ -3,11 +3,25 @@
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
#include "asm/types.h"
+#include "namespaces.h"
#include "sysctl.h"
#include "util.h"
+/* These are the namespaces we know how to restore in various ways.
+ */
+#define KNOWN_NS_MASK (CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC)
+
+struct sysctl_userns_req {
+ int op;
+ unsigned int ns;
+ size_t nr_req;
+ struct sysctl_req *reqs;
+};
+
#define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op) \
do { \
if (__op == CTL_READ) \
@@ -126,67 +140,303 @@ err:
return ret;
}
-static int __sysctl_op(int dir, struct sysctl_req *req, int op)
+static int sysctl_userns_arg_size(int type)
{
- int fd, ret = -1, nr = 1, flags;
-
- if (op == CTL_READ)
- flags = O_RDONLY;
- else
- flags = O_WRONLY;
-
- fd = openat(dir, req->name, flags);
- if (fd < 0) {
- if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
- return 0;
- pr_perror("Can't open sysctl %s", req->name);
- return -1;
- }
-
- switch (CTL_TYPE(req->type)) {
+ switch(CTL_TYPE(type)) {
case __CTL_U32A:
- nr = CTL_LEN(req->type);
+ return sizeof(u32) * CTL_LEN(type);
case CTL_U32:
- __SYSCTL_OP(ret, fd, req, u32, nr, op);
- break;
+ return sizeof(u32);
case CTL_32:
- __SYSCTL_OP(ret, fd, req, s32, nr, op);
- break;
+ return sizeof(s32);
case __CTL_U64A:
- nr = CTL_LEN(req->type);
+ return sizeof(u64) * CTL_LEN(type);
case CTL_U64:
- __SYSCTL_OP(ret, fd, req, u64, nr, op);
- break;
+ return sizeof(u64);
case __CTL_STR:
- nr = CTL_LEN(req->type);
- __SYSCTL_OP(ret, fd, req, char, nr, op);
- break;
+ return sizeof(char) * CTL_LEN(type) + 1;
+ default:
+ pr_err("unknown arg type %d\n", type);
+
+ /* Ensure overflow to cause an error */
+ return MAX_UNSFD_MSG_SIZE;
}
+}
- close_safe(&fd);
+/*
+ * In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
+ * we encode each array of sysctl_reqs into one contiguous region of memory so
+ * it can be passed via userns_call if necessary. It looks like this:
+ *
+ * struct sysctl_userns_req    struct sysctl_req         name         arg
+ * ---------------------------------------------------------------------------
+ * |  op  |  nr_req  |  reqs  | <fields> | name | arg | "the name" | "the arg" ...
+ * ---------------------------------------------------------------------------
+ *                        |____^             |______|__^            ^
+ *                                                  |_______________|
+ */
+static int do_sysctl_op(int *fds, struct sysctl_userns_req *userns_req)
+{
+ int i, op = userns_req->op;
+ struct sysctl_req *req;
- return ret;
+ req = userns_req->reqs;
+
+ for (i = 0; i < userns_req->nr_req; i++) {
+ int arg_len = sysctl_userns_arg_size(req->type);
+ int name_len = strlen((char *) &req[1]) + 1;
+ int total_len = sizeof(*req) + arg_len + name_len;
+ int nr = 1, ret = -1;
+ int fd = fds[i];
+
+ switch (CTL_TYPE(req->type)) {
+ case __CTL_U32A:
+ nr = CTL_LEN(req->type);
+ case CTL_U32:
+ __SYSCTL_OP(ret, fd, req, u32, nr, op);
+ break;
+ case CTL_32:
+ __SYSCTL_OP(ret, fd, req, s32, nr, op);
+ break;
+ case __CTL_U64A:
+ nr = CTL_LEN(req->type);
+ case CTL_U64:
+ __SYSCTL_OP(ret, fd, req, u64, nr, op);
+ break;
+ case __CTL_STR:
+ nr = CTL_LEN(req->type);
+ __SYSCTL_OP(ret, fd, req, char, nr, op);
+ break;
+ }
+
+ if (ret < 0)
+ return ret;
+
+ req = (struct sysctl_req *) (((char *) req) + total_len);
+ }
+
+ return 0;
}
-int sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
+static int __sysctl_op(void *arg, int unused, pid_t pid)
{
- int ret = 0;
- int dir = -1;
+ int fd, ret = -1, flags, dir, i;
+ struct sysctl_userns_req *userns_req = arg;
+ int op = userns_req->op;
+ struct sysctl_req *req;
+ int *fds = NULL;
- dir = open("/proc/sys", O_RDONLY);
+ // fix up the pointer
+ req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
+
+ /* In the case of user namespaces, unprivileged users cannot write to
+ * some files in /proc/sys (e.g. kernel/hostname), so we need to proxy
+ * requests through usernsd. However, unprivileged users with
+ * CAP_NET_ADMIN can write to sysctl files under net/. So, the way we
+ * restore sysctl files depends on how they behave under their
+ * namespace. For files under net/ it is easy, since the task still has
+ * CAP_NET_ADMIN in its user namespace, so we just write to it from the
+ * process directly.
+ *
+ * For files in the IPC/UTS namespaces, it is more complicated.
+ * Unprivileged users cannot even open these files, so they must be
+ * opened by usernsd. However, the value in the kernel is changed for
+ * the IPC/UTS namespace of the task that write()s to the open sysctl
+ * file (not the one that opened it). So, we must set the value from
+ * inside the usernsd caller's namespace. To do that, usernsd:
+ *
+ * 1. opens the sysctl files
+ * 2. forks a task, which
+ * 3. setns()es to the UTS/IPC namespace of the caller
+ * 4. write()s to the files and exits
+ */
+ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
if (dir < 0) {
pr_perror("Can't open sysctl dir");
return -1;
}
- while (nr_req--) {
- ret = __sysctl_op(dir, req, op);
- if (ret < 0)
- break;
- req++;
+ fds = xmalloc(sizeof(int) * userns_req->nr_req);
+ if (!fds)
+ goto out;
+
+ memset(fds, -1, sizeof(int) * userns_req->nr_req);
+
+ for (i = 0; i < userns_req->nr_req; i++) {
+ int arg_len = sysctl_userns_arg_size(req->type);
+ int name_len = strlen((char *) &req[1]) + 1;
+ int total_len = sizeof(*req) + arg_len + name_len;
+
+ /* fix up the pointers */
+ req->name = (char *) &req[1];
+ req->arg = req->name + name_len;
+
+ if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+ pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
+ goto out;
+ }
+
+ if (op == CTL_READ)
+ flags = O_RDONLY;
+ else
+ flags = O_WRONLY;
+
+ fd = openat(dir, req->name, flags);
+ if (fd < 0) {
+ if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
+ continue;
+ pr_perror("Can't open sysctl %s", req->name);
+ goto out;
+ }
+
+ fds[i] = fd;
+
+ req = (struct sysctl_req *) (((char *) req) + total_len);
+ }
+
+ /* Now, if we're trying to read stuff or if we're in the same pid, we
+ * can just do things directly in this process, since usernsd didn't
+ * call us (or we're reading and need to return the values).
+ *
+ * Otherwise, let's fork a new task as described above.
+ */
+ if (pid == getpid() || op == CTL_READ) {
+ ret = do_sysctl_op(fds, userns_req);
+ } else {
+ pid_t worker;
+ int status;
+
+ worker = fork();
+ if (worker < 0)
+ goto out;
+
+ if (!worker) {
+ int nsfd;
+ const char *nsname = ns_to_string(userns_req->ns);
+
+ BUG_ON(!nsname);
+ nsfd = open_proc(pid, "ns/%s", nsname);
+ if (nsfd < 0) {
+ pr_perror("failed to open pid %d's ns %s", pid, nsname);
+ exit(1);
+ }
+
+ if (setns(nsfd, 0) < 0) {
+ pr_perror("failed to setns to %d's ns %s", pid, nsname);
+ exit(1);
+ }
+
+ exit(do_sysctl_op(fds, userns_req));
+ }
+
+ if (waitpid(worker, &status, 0) != worker) {
+ pr_err("worker didn't die?");
+ kill(worker, SIGKILL);
+ goto out;
+ }
+
+ if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+ pr_err("worker failed: %d\n", status);
+ goto out;
+ }
+
+ ret = 0;
+ }
+
+out:
+ if (fds) {
+ for (i = 0; i < userns_req->nr_req; i++) {
+ if (fds[i] < 0)
+ break;
+ close_safe(&fds[i]);
+ }
+
+ xfree(fds);
}
close_safe(&dir);
return ret;
}
+
+int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
+{
+ int ret = 0, i;
+ struct sysctl_userns_req *userns_req;
+ struct sysctl_req *cur;
+
+ if (nr_req == 0)
+ return 0;
+
+ if (ns & !KNOWN_NS_MASK) {
+ pr_err("don't know how to restore some namespaces in %u\n", ns);
+ return -1;
+ }
+
+ userns_req = alloca(MAX_UNSFD_MSG_SIZE);
+ userns_req->op = op;
+ userns_req->nr_req = nr_req;
+ userns_req->ns = ns;
+ userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
+
+ cur = userns_req->reqs;
+ for (i = 0; i < nr_req; i++) {
+ int arg_len = sysctl_userns_arg_size(req[i].type);
+ int name_len = strlen(req[i].name) + 1;
+ int total_len = sizeof(*cur) + arg_len + name_len;
+
+ if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+ pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
+ return -1;
+ }
+
+ /* copy over the non-pointer fields */
+ cur->type = req[i].type;
+ cur->flags = req[i].flags;
+
+ cur->name = (char *) &cur[1];
+ strcpy(cur->name, req[i].name);
+
+ cur->arg = cur->name + name_len;
+ memcpy(cur->arg, req[i].arg, arg_len);
+
+ cur = (struct sysctl_req *) (((char *) cur) + total_len);
+ }
+
+ /* Net namespaces can be restored without usernsd, since anything with
+ * CAP_NET_ADMIN in its namespace can write to net/ sysctls. The other
+ * namespaces we allow to restore (IPC and UTS) must be restored via
+ * usernsd.
+ */
+ if (ns & CLONE_NEWNET)
+ ret = __sysctl_op(userns_req, -1, getpid());
+ else
+ ret = userns_call(__sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, -1);
+
+ if (ret < 0)
+ return -1;
+
+ if (op != CTL_READ)
+ return 0;
+
+ /*
+ * Here, we use a little hack: since we only read in dump mode when
+ * usernsd is not active, we know the above call happened in this
+ * address space, so we can just copy the value read back out. If there
+ * was an API to return stuff via userns_call(), that would be
+ * preferable.
+ */
+ cur = userns_req->reqs;
+ for (i = 0; i < nr_req; i++) {
+ int arg_len = sysctl_userns_arg_size(cur->type);
+ int name_len = strlen((char *) &cur[1]) + 1;
+ int total_len = sizeof(*cur) + arg_len + name_len;
+ void *arg = ((void *) &cur[1]) + name_len;
+
+ memcpy(req[i].arg, arg, arg_len);
+
+ cur = (struct sysctl_req *) (((char *) cur) + total_len);
+ }
+
+ return 0;
+}
diff --git a/test/zdtm.sh b/test/zdtm.sh
index f12156a..e4e1f00 100755
--- a/test/zdtm.sh
+++ b/test/zdtm.sh
@@ -215,8 +215,6 @@ generate_test_list()
"
TEST_MNTNS="
- ns/static/mnt_ext_auto
- ns/static/mnt_ext_master
ns/static/mntns_open
ns/static/mntns_link_remap
ns/static/mntns_link_ghost
diff --git a/util.c b/util.c
index b916eca..b3cce7b 100644
--- a/util.c
+++ b/util.c
@@ -845,3 +845,23 @@ int fd_has_data(int lfd)
return ret;
}
+
+const char *ns_to_string(unsigned int ns)
+{
+ switch (ns) {
+ case CLONE_NEWIPC:
+ return "ipc";
+ case CLONE_NEWNS:
+ return "mnt";
+ case CLONE_NEWNET:
+ return "net";
+ case CLONE_NEWPID:
+ return "pid";
+ case CLONE_NEWUSER:
+ return "user";
+ case CLONE_NEWUTS:
+ return "uts";
+ default:
+ return NULL;
+ }
+}
diff --git a/uts_ns.c b/uts_ns.c
index e2c1f68..ca6bbdc 100644
--- a/uts_ns.c
+++ b/uts_ns.c
@@ -61,7 +61,7 @@ int prepare_utsns(int pid)
req[1].arg = ue->domainname;
req[1].type = CTL_STR(strlen(ue->domainname));
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS);
utsns_entry__free_unpacked(ue, NULL);
out:
close_image(img);
--
2.1.4
More information about the CRIU
mailing list