[CRIU] [PATCH 2/2] sysctl: move sysctl calls to usernsd

Tycho Andersen tycho.andersen at canonical.com
Tue Sep 8 16:28:29 PDT 2015


When in a userns, tasks can't write to certain sysctl files:

(00.009653)      1: Error (sysctl.c:142): Can't open sysctl kernel/hostname: Permission denied

Mostly for my own education in what is required to port something to be
userns restorable, I ported the sysctl stuff. A potential concern for this
patch is that copying structures with pointers around is kind of gory. I
did it ad-hoc here, but it may be worth inventing some mechanisms to make
it easier, although I'm not sure what exactly that would look like
(potentially re-using some of the protobuf bits; I'll investigate this more
if it looks helpful when doing the cgroup user namespaces port?).

Another issue is that there is not a great way to return non-fd stuff in
memory right now from userns_call; one of the little hacks in this code
would be "simplified" if we invented a way to do this.

v2: coalesce the individual struct sysctl_req requests into one big
    sysctl_userns_req that is in a contiguous region of memory so that we
    can pass it via userns_call. Hopefully nobody finds my little ascii
    diagram too offensive :)

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 include/namespaces.h |   2 +-
 namespaces.c         |   4 +-
 net.c                |   6 ++
 sysctl.c             | 198 ++++++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 167 insertions(+), 43 deletions(-)

diff --git a/include/namespaces.h b/include/namespaces.h
index 18557af..8034cca 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -95,7 +95,7 @@ typedef int (*uns_call_t)(void *arg, int fd);
  */
 #define UNS_FDOUT	0x2
 
-#define MAX_UNSFD_MSG_SIZE 256
+#define MAX_UNSFD_MSG_SIZE 4096
 
 /*
  * When we're restoring inside user namespace, some things are
diff --git a/namespaces.c b/namespaces.c
index 1677a7a..76e05ce 100644
--- a/namespaces.c
+++ b/namespaces.c
@@ -908,7 +908,7 @@ static int usernsd(int sk)
 
 	while (1) {
 		struct unsc_msg um;
-		static char msg[MAX_MSG_SIZE];
+		static char msg[MAX_UNSFD_MSG_SIZE];
 		uns_call_t call;
 		int flags, fd, ret;
 
@@ -975,7 +975,7 @@ int userns_call(uns_call_t call, int flags,
 	bool async = flags & UNS_ASYNC;
 	struct unsc_msg um;
 
-	if (unlikely(arg_size > MAX_MSG_SIZE)) {
+	if (unlikely(arg_size > MAX_UNSFD_MSG_SIZE)) {
 		pr_err("UNS: message size exceeded\n");
 		return -1;
 	}
diff --git a/net.c b/net.c
index 082ccb6..e0483d8 100644
--- a/net.c
+++ b/net.c
@@ -120,6 +120,12 @@ static int ipv4_conf_op(char *tgt, int *conf, int op, NetnsEntry **netns)
 		ri++;
 	}
 
+	/* If we don't have anything to restore, let's not invoke usernsd and
+	 * all that.
+	 */
+	if (ri == 0)
+		return 0;
+
 	ret = sysctl_op(req, ri, op);
 	if (ret < 0) {
 		pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
diff --git a/sysctl.c b/sysctl.c
index b059140..9e1807f 100644
--- a/sysctl.c
+++ b/sysctl.c
@@ -5,9 +5,16 @@
 #include <stdlib.h>
 
 #include "asm/types.h"
+#include "namespaces.h"
 #include "sysctl.h"
 #include "util.h"
 
+struct sysctl_userns_req {
+	int			op;
+	size_t			nr_req;
+	struct sysctl_req	*reqs;
+};
+
 #define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op)		\
 do {									\
 	if (__op == CTL_READ)						\
@@ -126,67 +133,178 @@ err:
 	return ret;
 }
 
-static int __sysctl_op(int dir, struct sysctl_req *req, int op)
+static int sysctl_userns_arg_size(int type)
 {
-	int fd, ret = -1, nr = 1, flags;
-
-	if (op == CTL_READ)
-		flags = O_RDONLY;
-	else
-		flags = O_WRONLY;
-
-	fd = openat(dir, req->name, flags);
-	if (fd < 0) {
-		if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
-			return 0;
-		pr_perror("Can't open sysctl %s", req->name);
-		return -1;
-	}
-
-	switch (CTL_TYPE(req->type)) {
+	switch(CTL_TYPE(type)) {
 	case __CTL_U32A:
-		nr = CTL_LEN(req->type);
+		return sizeof(u32) * CTL_LEN(type);
 	case CTL_U32:
-		__SYSCTL_OP(ret, fd, req, u32, nr, op);
-		break;
+		return sizeof(u32);
 	case CTL_32:
-		__SYSCTL_OP(ret, fd, req, s32, nr, op);
-		break;
+		return sizeof(s32);
 	case __CTL_U64A:
-		nr = CTL_LEN(req->type);
+		return sizeof(u64) * CTL_LEN(type);
 	case CTL_U64:
-		__SYSCTL_OP(ret, fd, req, u64, nr, op);
-		break;
+		return sizeof(u64);
 	case __CTL_STR:
-		nr = CTL_LEN(req->type);
-		__SYSCTL_OP(ret, fd, req, char, nr, op);
-		break;
-	}
-
-	close_safe(&fd);
+		return sizeof(char) * CTL_LEN(type) + 1;
+	default:
+		pr_err("unknown arg type %d\n", type);
 
-	return ret;
+		/* Ensure overflow to cause an error */
+		return MAX_UNSFD_MSG_SIZE;
+	}
 }
 
-int sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
+/*
+ * In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
+ * we encode each array of sysctl_reqs into one contiguous region of memory so
+ * it can be passed via userns_call. It looks like this:
+ *
+ *  struct sysctl_userns_req    struct sysctl_req       name        arg
+ * ---------------------------------------------------------------------------
+ * |  op  |  nr_req  |  reqs  | <fields> | name | arg | "the name" | "the arg" ...
+ * ---------------------------------------------------------------------------
+ *                       |____^             |______|__^            ^
+ *                                                 |_______________|
+ */
+
+static int __sysctl_op(void *arg, int unused)
 {
-	int ret = 0;
-	int dir = -1;
+	int fd, ret = -1, nr, flags, dir, i;
+	struct sysctl_userns_req *userns_req = arg;
+	int op = userns_req->op;
+	struct sysctl_req *req;
 
-	dir = open("/proc/sys", O_RDONLY);
+	dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
 	if (dir < 0) {
 		pr_perror("Can't open sysctl dir");
 		return -1;
 	}
 
-	while (nr_req--) {
-		ret = __sysctl_op(dir, req, op);
-		if (ret < 0)
+	// fix up the pointer
+	req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
+	for (i = 0; i < userns_req->nr_req; i++)  {
+		int arg_len = sysctl_userns_arg_size(req->type);
+		int name_len = strlen((char *) &req[1]) + 1;
+		int total_len = sizeof(*req) + arg_len + name_len;
+
+		/* fix up the pointers */
+		req->name = (char *) &req[1];
+		req->arg = req->name + name_len;
+
+		if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+			pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
+			return -1;
+		}
+
+		if (op == CTL_READ)
+			flags = O_RDONLY;
+		else
+			flags = O_WRONLY;
+
+		fd = openat(dir, req->name, flags);
+		if (fd < 0) {
+			close_safe(&dir);
+			if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
+				continue;
+			pr_perror("Can't open sysctl %s", req->name);
+			return -1;
+		}
+
+		nr = 1;
+		switch (CTL_TYPE(req->type)) {
+		case __CTL_U32A:
+			nr = CTL_LEN(req->type);
+		case CTL_U32:
+			__SYSCTL_OP(ret, fd, req, u32, nr, op);
+			break;
+		case CTL_32:
+			__SYSCTL_OP(ret, fd, req, s32, nr, op);
 			break;
-		req++;
+		case __CTL_U64A:
+			nr = CTL_LEN(req->type);
+		case CTL_U64:
+			__SYSCTL_OP(ret, fd, req, u64, nr, op);
+			break;
+		case __CTL_STR:
+			nr = CTL_LEN(req->type);
+			__SYSCTL_OP(ret, fd, req, char, nr, op);
+			break;
+		}
+
+		close_safe(&fd);
+		req = (struct sysctl_req *) (((char *) req) + total_len);
 	}
 
 	close_safe(&dir);
 
 	return ret;
 }
+
+int sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
+{
+	int ret = 0, i;
+	struct sysctl_userns_req *userns_req;
+	struct sysctl_req *cur;
+
+	if (nr_req == 0)
+		return 0;
+
+	userns_req = alloca(MAX_UNSFD_MSG_SIZE);
+	userns_req->op = op;
+	userns_req->nr_req = nr_req;
+	userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
+
+	cur = userns_req->reqs;
+	for (i = 0; i < nr_req; i++) {
+		int arg_len = sysctl_userns_arg_size(req[i].type);
+		int name_len = strlen(req[i].name) + 1;
+		int total_len = sizeof(*cur) + arg_len + name_len;
+
+		if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+			pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
+			return -1;
+		}
+
+		/* copy over the non-pointer fields */
+		cur->type = req[i].type;
+		cur->flags = req[i].flags;
+
+		cur->name = (char *) &cur[1];
+		strcpy(cur->name, req[i].name);
+
+		cur->arg = cur->name + name_len;
+		memcpy(cur->arg, req[i].arg, arg_len);
+
+		cur = (struct sysctl_req *) (((char *) cur) + total_len);
+	}
+
+	ret = userns_call(__sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, 0);
+	if (ret < 0)
+		return -1;
+
+	if (op != CTL_READ)
+		return 0;
+
+	/*
+	 * Here, we use a little hack: since we only read in dump mode when
+	 * usernsd is not active, we know the above call happened in this
+	 * address space, so we can just copy the value read back out. If there
+	 * was an API to return stuff via userns_call(), that would be
+	 * preferable.
+	 */
+	cur = userns_req->reqs;
+	for (i = 0; i < nr_req; i++) {
+		int arg_len = sysctl_userns_arg_size(cur->type);
+		int name_len = strlen((char *) &cur[1]) + 1;
+		int total_len = sizeof(*cur) + arg_len + name_len;
+		void *arg = ((void *) &cur[1]) + name_len;
+
+		memcpy(req[i].arg, arg, arg_len);
+
+		cur = (struct sysctl_req *) (((char *) cur) + total_len);
+	}
+
+	return 0;
+}
-- 
2.1.0



More information about the CRIU mailing list