[CRIU] [PATCH] sysctl: move sysctl calls to usernsd

Tycho Andersen tycho.andersen at canonical.com
Thu Oct 1 20:48:52 PDT 2015


When in a userns, tasks can't write to certain sysctl files:

(00.009653)      1: Error (sysctl.c:142): Can't open sysctl kernel/hostname: Permission denied

See inline comments for details on affected namespaces.

Mostly for my own education in what is required to port something to be
userns restorable, I ported the sysctl stuff. A potential concern for this
patch is that copying structures with pointers around is kind of gory. I
did it ad-hoc here, but it may be worth inventing some mechanisms to make
it easier, although I'm not sure what exactly that would look like
(potentially re-using some of the protobuf bits; I'll investigate this more
if it looks helpful when doing the cgroup user namespaces port?).

Another issue is that there is not a great way to return non-fd stuff in
memory right now from userns_call; one of the little hacks in this code
would be "simplified" if we invented a way to do this.

v2: coalesce the individual struct sysctl_req requests into one big
    sysctl_userns_req that is in a contiguous region of memory so that we
    can pass it via userns_call. Hopefully nobody finds my little ASCII
    diagram too offensive :)
v3: use the fork/setns trick to change the sysctl values in the right ns for
    IPC/UTS namespaces; see inline comment for details
v4: only use sysctl_userns_req when actually doing a userns_call.

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 include/namespaces.h |   2 +-
 include/sysctl.h     |   2 +-
 include/util.h       |   1 +
 ipc_ns.c             |  12 +--
 kerndat.c            |   4 +-
 net.c                |   2 +-
 sysctl.c             | 297 +++++++++++++++++++++++++++++++++++++++++++++++----
 util.c               |  20 ++++
 uts_ns.c             |   2 +-
 9 files changed, 307 insertions(+), 35 deletions(-)

diff --git a/include/namespaces.h b/include/namespaces.h
index 0e70a7b..8809acb 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -103,7 +103,7 @@ typedef int (*uns_call_t)(void *arg, int fd, pid_t pid);
  */
 #define UNS_FDOUT	0x2
 
-#define MAX_UNSFD_MSG_SIZE 256
+#define MAX_UNSFD_MSG_SIZE 4096
 
 /*
  * When we're restoring inside user namespace, some things are
diff --git a/include/sysctl.h b/include/sysctl.h
index 610fbf6..b949a40 100644
--- a/include/sysctl.h
+++ b/include/sysctl.h
@@ -8,7 +8,7 @@ struct sysctl_req {
 	int	flags;
 };
 
-extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op);
+extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns);
 
 enum {
 	CTL_READ,
diff --git a/include/util.h b/include/util.h
index 21a21b8..e815117 100644
--- a/include/util.h
+++ b/include/util.h
@@ -263,4 +263,5 @@ int fd_has_data(int lfd);
 
 int make_yard(char *path);
 
+const char *ns_to_string(unsigned int ns);
 #endif /* __CR_UTIL_H__ */
diff --git a/ipc_ns.c b/ipc_ns.c
index c6e2eeb..37a9626 100644
--- a/ipc_ns.c
+++ b/ipc_ns.c
@@ -181,7 +181,7 @@ static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *ms
 		{ "kernel/msgmax", &msgmax, CTL_U32 },
 	};
 
-	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC);
 	if (ret < 0) {
 		pr_err("Failed to read max IPC message size\n");
 		goto err;
@@ -313,7 +313,7 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op)
 
 	int ret;
 
-	ret = sysctl_op(req, ARRAY_SIZE(req), op);
+	ret = sysctl_op(req, ARRAY_SIZE(req), op, CLONE_NEWIPC);
 	if (ret)
 		return ret;
 
@@ -322,7 +322,7 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op)
 		return 0;
 	}
 
-	return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op);
+	return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op, CLONE_NEWIPC);
 }
 
 /*
@@ -555,7 +555,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem)
 	};
 	struct semid_ds semid;
 
-	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
 	if (ret < 0) {
 		pr_err("Failed to set desired IPC sem ID\n");
 		return ret;
@@ -691,7 +691,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq)
 	};
 	struct msqid_ds msqid;
 
-	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
 	if (ret < 0) {
 		pr_err("Failed to set desired IPC msg ID\n");
 		return ret;
@@ -802,7 +802,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
 	};
 	struct shmid_ds shmid;
 
-	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
 	if (ret < 0) {
 		pr_err("Failed to set desired IPC shm ID\n");
 		return ret;
diff --git a/kerndat.c b/kerndat.c
index e57c6fd..d6e54f8 100644
--- a/kerndat.c
+++ b/kerndat.c
@@ -221,7 +221,7 @@ static int tcp_read_sysctl_limits(void)
 	 * Lets figure out which exactly amount of memory is
 	 * availabe for send/read queues on restore.
 	 */
-	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
 	if (ret) {
 		pr_warn("TCP mem sysctls are not available. Using defaults.\n");
 		goto out;
@@ -268,7 +268,7 @@ static int get_last_cap(void)
 		{ "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 },
 	};
 
-	return sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
+	return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
 }
 
 static bool kerndat_has_memfd_create(void)
diff --git a/net.c b/net.c
index f525c48..79bc4d2 100644
--- a/net.c
+++ b/net.c
@@ -127,7 +127,7 @@ static int ipv4_conf_op(char *tgt, int *conf, int n, int op, NetnsEntry **netns)
 		ri++;
 	}
 
-	ret = sysctl_op(req, ri, op);
+	ret = sysctl_op(req, ri, op, CLONE_NEWNET);
 	if (ret < 0) {
 		pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
 		return -1;
diff --git a/sysctl.c b/sysctl.c
index b059140..e9fdf40 100644
--- a/sysctl.c
+++ b/sysctl.c
@@ -3,11 +3,25 @@
 #include <ctype.h>
 #include <string.h>
 #include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
 
 #include "asm/types.h"
+#include "namespaces.h"
 #include "sysctl.h"
 #include "util.h"
 
+/* These are the namespaces we know how to restore in various ways.
+ */
+#define KNOWN_NS_MASK (CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC)
+
+struct sysctl_userns_req {
+	int			op;
+	unsigned int		ns;
+	size_t			nr_req;
+	struct sysctl_req	*reqs;
+};
+
 #define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op)		\
 do {									\
 	if (__op == CTL_READ)						\
@@ -126,22 +140,32 @@ err:
 	return ret;
 }
 
-static int __sysctl_op(int dir, struct sysctl_req *req, int op)
+static int sysctl_userns_arg_size(int type)
 {
-	int fd, ret = -1, nr = 1, flags;
-
-	if (op == CTL_READ)
-		flags = O_RDONLY;
-	else
-		flags = O_WRONLY;
-
-	fd = openat(dir, req->name, flags);
-	if (fd < 0) {
-		if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
-			return 0;
-		pr_perror("Can't open sysctl %s", req->name);
-		return -1;
+	switch(CTL_TYPE(type)) {
+	case __CTL_U32A:
+		return sizeof(u32) * CTL_LEN(type);
+	case CTL_U32:
+		return sizeof(u32);
+	case CTL_32:
+		return sizeof(s32);
+	case __CTL_U64A:
+		return sizeof(u64) * CTL_LEN(type);
+	case CTL_U64:
+		return sizeof(u64);
+	case __CTL_STR:
+		return sizeof(char) * CTL_LEN(type) + 1;
+	default:
+		pr_err("unknown arg type %d\n", type);
+
+		/* Ensure overflow to cause an error */
+		return MAX_UNSFD_MSG_SIZE;
 	}
+}
+
+static int do_sysctl_op(int fd, struct sysctl_req *req, int op)
+{
+	int ret = -1, nr = 1;
 
 	switch (CTL_TYPE(req->type)) {
 	case __CTL_U32A:
@@ -163,30 +187,257 @@ static int __sysctl_op(int dir, struct sysctl_req *req, int op)
 		break;
 	}
 
-	close_safe(&fd);
+	return ret;
+}
+
+static int __userns_sysctl_op(void *arg, int unused, pid_t pid)
+{
+	int fd, ret = -1, dir, i, status, *fds = NULL;
+	struct sysctl_userns_req *userns_req = arg;
+	int op = userns_req->op;
+	struct sysctl_req *req, **reqs = NULL;
+	pid_t worker;
+
+	// fix up the pointer
+	req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
+
+	/* For files in the IPC/UTS namespaces, restoring is more complicated
+	 * than for net. Unprivileged users cannot even open these files, so
+	 * they must be opened by usernsd. However, the value in the kernel is
+	 * changed for the IPC/UTS namespace that write()s to the open sysctl
+	 * file (not who opened it). So, we must set the value from inside the
+	 * usernsd caller's namespace. We:
+	 *
+	 * 1. unsd opens the sysctl files
+	 * 2. forks a task
+	 * 3. setns()es to the UTS/IPC namespace of the caller
+	 * 4. write()s to the files and exits
+	 */
+	dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
+	if (dir < 0) {
+		pr_perror("Can't open sysctl dir");
+		return -1;
+	}
+
+	fds = xmalloc(sizeof(int) * userns_req->nr_req);
+	if (!fds)
+		goto out;
+
+	reqs = xmalloc(sizeof(struct sysctl_req) * userns_req->nr_req);
+	if (!fds)
+		goto out;
+
+	memset(fds, -1, sizeof(int) * userns_req->nr_req);
+
+	for (i = 0; i < userns_req->nr_req; i++)  {
+		int arg_len = sysctl_userns_arg_size(req->type);
+		int name_len = strlen((char *) &req[1]) + 1;
+		int total_len = sizeof(*req) + arg_len + name_len;
+		int flags;
+
+		/* fix up the pointers */
+		req->name = (char *) &req[1];
+		req->arg = req->name + name_len;
+
+		if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+			pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
+			goto out;
+		}
+
+		if (op == CTL_READ)
+			flags = O_RDONLY;
+		else
+			flags = O_WRONLY;
+
+		fd = openat(dir, req->name, flags);
+		if (fd < 0) {
+			if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
+				continue;
+			pr_perror("Can't open sysctl %s", req->name);
+			goto out;
+		}
+
+		/* save a pointer to the req, so we don't need to recompute its
+		 * location
+		 */
+		reqs[i] = req;
+		fds[i] = fd;
+
+		req = (struct sysctl_req *) (((char *) req) + total_len);
+	}
+
+	worker = fork();
+	if (worker < 0)
+		goto out;
+
+	if (!worker) {
+		int nsfd;
+		const char *nsname = ns_to_string(userns_req->ns);
+
+		BUG_ON(!nsname);
+		nsfd = open_proc(pid, "ns/%s", nsname);
+		if (nsfd < 0) {
+			pr_perror("failed to open pid %d's ns %s", pid, nsname);
+			exit(1);
+		}
+
+		if (setns(nsfd, 0) < 0) {
+			pr_perror("failed to setns to %d's ns %s", pid, nsname);
+			exit(1);
+		}
+
+		close(nsfd);
+
+		for (i = 0; i < userns_req->nr_req; i++) {
+			if (do_sysctl_op(fds[i], reqs[i], op) < 0)
+				exit(1);
+		}
+
+		exit(0);
+	}
+
+	if (waitpid(worker, &status, 0) != worker) {
+		pr_err("worker didn't die?");
+		kill(worker, SIGKILL);
+		goto out;
+	}
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+		pr_err("worker failed: %d\n", status);
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	if (fds) {
+		for (i = 0; i < userns_req->nr_req; i++) {
+			if (fds[i] < 0)
+				break;
+			close_safe(&fds[i]);
+		}
+
+		xfree(fds);
+	}
+
+	if (reqs)
+		xfree(reqs);
+
+	close_safe(&dir);
 
 	return ret;
 }
 
-int sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
+static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
 {
-	int ret = 0;
-	int dir = -1;
+	int dir, ret = -1;;
 
-	dir = open("/proc/sys", O_RDONLY);
+	dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
 	if (dir < 0) {
 		pr_perror("Can't open sysctl dir");
 		return -1;
 	}
 
 	while (nr_req--) {
-		ret = __sysctl_op(dir, req, op);
-		if (ret < 0)
-			break;
+		int fd, flags;
+
+		if (op == CTL_READ)
+			flags = O_RDONLY;
+		else
+			flags = O_WRONLY;
+
+		fd = openat(dir, req->name, flags);
+		if (fd < 0) {
+			if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
+				continue;
+			pr_perror("Can't open sysctl %s", req->name);
+			goto out;
+		}
+
+		ret = do_sysctl_op(fd, req, op);
+		close(fd);
 		req++;
 	}
 
-	close_safe(&dir);
+	ret = 0;
 
+out:
+	close(dir);
 	return ret;
 }
+
+int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
+{
+	int i;
+	struct sysctl_userns_req *userns_req;
+	struct sysctl_req *cur;
+
+	if (nr_req == 0)
+		return 0;
+
+	if (ns & !KNOWN_NS_MASK) {
+		pr_err("don't know how to restore some namespaces in %u\n", ns);
+		return -1;
+	}
+
+	/* The way sysctl files behave on open/write depends on the namespace
+	 * they correspond to. If we don't want to interact with something in a
+	 * namespace (e.g. kernel/cap_last_cap is global), we can do this from
+	 * the current process. Similarly, if we're accessing net namespaces,
+	 * we can just do the operation from our current process, since
+	 * anything with CAP_NET_ADMIN can write to the net/ sysctls, and we
+	 * still have that even when restoring in a user ns.
+	 *
+	 * For IPC/UTS, we restore them as described above.
+	 *
+	 * For read operations, we need to copy the values back to return.
+	 * Fortunately, we only do read on dump (or global reads on restore),
+	 * so we can do those in process as well.
+	 */
+	if (!ns || ns & CLONE_NEWNET || op == CTL_READ)
+		return __nonuserns_sysctl_op(req, nr_req, op);
+
+	/*
+	 * In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
+	 * we encode each array of sysctl_reqs into one contiguous region of memory so
+	 * it can be passed via userns_call if necessary. It looks like this:
+	 *
+	 *  struct sysctl_userns_req    struct sysctl_req       name        arg
+	 * ---------------------------------------------------------------------------
+	 * |  op  |  nr_req  |  reqs  | <fields> | name | arg | "the name" | "the arg" ...
+	 * ---------------------------------------------------------------------------
+	 *                       |____^             |______|__^            ^
+	 *                                                 |_______________|
+	 */
+	userns_req = alloca(MAX_UNSFD_MSG_SIZE);
+	userns_req->op = op;
+	userns_req->nr_req = nr_req;
+	userns_req->ns = ns;
+	userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
+
+	cur = userns_req->reqs;
+	for (i = 0; i < nr_req; i++) {
+		int arg_len = sysctl_userns_arg_size(req[i].type);
+		int name_len = strlen(req[i].name) + 1;
+		int total_len = sizeof(*cur) + arg_len + name_len;
+
+		if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+			pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
+			return -1;
+		}
+
+		/* copy over the non-pointer fields */
+		cur->type = req[i].type;
+		cur->flags = req[i].flags;
+
+		cur->name = (char *) &cur[1];
+		strcpy(cur->name, req[i].name);
+
+		cur->arg = cur->name + name_len;
+		memcpy(cur->arg, req[i].arg, arg_len);
+
+		cur = (struct sysctl_req *) (((char *) cur) + total_len);
+	}
+
+	return userns_call(__userns_sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, -1);
+}
diff --git a/util.c b/util.c
index 75758b7..a1ab18b 100644
--- a/util.c
+++ b/util.c
@@ -860,3 +860,23 @@ int make_yard(char *path)
 
 	return 0;
 }
+
+const char *ns_to_string(unsigned int ns)
+{
+	switch (ns) {
+	case CLONE_NEWIPC:
+		return "ipc";
+	case CLONE_NEWNS:
+		return "mnt";
+	case CLONE_NEWNET:
+		return "net";
+	case CLONE_NEWPID:
+		return "pid";
+	case CLONE_NEWUSER:
+		return "user";
+	case CLONE_NEWUTS:
+		return "uts";
+	default:
+		return NULL;
+	}
+}
diff --git a/uts_ns.c b/uts_ns.c
index e2c1f68..ca6bbdc 100644
--- a/uts_ns.c
+++ b/uts_ns.c
@@ -61,7 +61,7 @@ int prepare_utsns(int pid)
 	req[1].arg = ue->domainname;
 	req[1].type = CTL_STR(strlen(ue->domainname));
 
-	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS);
 	utsns_entry__free_unpacked(ue, NULL);
 out:
 	close_image(img);
-- 
2.1.4



More information about the CRIU mailing list