[CRIU] [PATCH 1/2] usernsd: The way to restore priviledged stuff in userns

Pavel Emelyanov xemul at parallels.com
Thu Feb 12 02:39:15 PST 2015


We have collected a good set of calls that cannot be done inside
user namespaces, but which we need to make [1]. Some of them have
already been addressed, like the prctl mm bits restore, but some have not.

I'm pretty sceptical about the ability to relax the security
checks on quite a lot of them (e.g. open-by-handle is indeed a
very dangerous operation if allowed to an unprivileged user), so
we need some way to call those things even in user namespaces.

The good news is that all the calls I've found operate on file
descriptors one way or another. So if we had a process that lived
outside of the user namespace, we could ask it to do the
privileged operation we need and exchange the affected file
descriptor via a unix socket.

So usernsd is the one doing exactly this. It starts before we
create the user namespace and accepts requests via a unix socket.
Clients (the processes we restore) send it the function they
want to call, the descriptor they want to operate on and the
arguments blob. Optionally, they can request some file descriptor
back after the call.

In the non-user-namespace case the daemon is not started and the calls
are done right in the requestor's process environment.

In the next patch there's an example of how to use this daemon
to do the privileged SO_SNDBUFFORCE/_RCVBUFFORCE sockopt on
a socket.

[1] http://criu.org/UserNamespace

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
---
 cr-restore.c         |  10 ++
 include/namespaces.h |  29 +++++
 include/rst_info.h   |   1 +
 include/servicefd.h  |   1 +
 namespaces.c         | 309 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 350 insertions(+)

diff --git a/cr-restore.c b/cr-restore.c
index 95e6fd1..2afdb45 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -1712,6 +1712,9 @@ static int restore_root_task(struct pstree_item *init)
 		return -1;
 	}
 
+	if (start_usernsd())
+		return -1;
+
 	futex_set(&task_entries->nr_in_progress,
 			stage_participants(CR_STATE_RESTORE_NS));
 
@@ -1775,6 +1778,10 @@ static int restore_root_task(struct pstree_item *init)
 	if (ret < 0)
 		goto out_kill;
 
+	ret = stop_usernsd();
+	if (ret < 0)
+		goto out_kill;
+
 	ret = move_veth_to_bridge();
 	if (ret < 0)
 		goto out_kill;
@@ -1849,6 +1856,7 @@ out_kill:
 	}
 
 out:
+	stop_usernsd();
 	__restore_switch_stage(CR_STATE_FAIL);
 	pr_err("Restoring FAILED.\n");
 	return -1;
@@ -1868,6 +1876,7 @@ static int prepare_task_entries(void)
 	task_entries->nr_helpers = 0;
 	futex_set(&task_entries->start, CR_STATE_RESTORE_NS);
 	mutex_init(&task_entries->zombie_lock);
+	mutex_init(&task_entries->userns_sync_lock);
 
 	return 0;
 }
@@ -2955,6 +2964,7 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	close_image_dir();
 	close_proc();
 	close_service_fd(ROOT_FD_OFF);
+	close_service_fd(USERNSD_SK);
 
 	__gcov_flush();
 
diff --git a/include/namespaces.h b/include/namespaces.h
index d68a610..52d2f34 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -73,9 +73,38 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
 
 extern int collect_user_namespaces(bool for_dump);
 extern int prepare_userns(struct pstree_item *item);
+extern int start_usernsd(void);
+extern int stop_usernsd(void);
 extern int userns_uid(int uid);
 extern int userns_gid(int gid);
 extern int dump_user_ns(pid_t pid, int ns_id);
 extern void free_userns_maps(void);
 
+typedef int (*uns_call_t)(void *arg, int fd);
+/*
+ * Async call -- the call is guaranteed to be done by the time
+ * CR_STATE_COMPLETE happens. userns_call() may return even
+ * before the call starts.
+ * W/o this flag the call is synchronous -- userns_call() returns
+ * strictly after the call finishes.
+ */
+#define UNS_ASYNC	0x1
+/*
+ * The call returns an FD which should be sent back. Conflicts
+ * with UNS_ASYNC.
+ */
+#define UNS_FDOUT	0x2
+
+/*
+ * When we're restoring inside a user namespace, some things are
+ * not allowed to be done there due to insufficient capabilities.
+ * If the operation in question can be offloaded to another process,
+ * this call allows doing that.
+ *
+ * In case we're not in a userns, just call the callback immediately
+ * in the context of the calling task.
+ *
+ * @call     -- the routine to run; it is handed @arg and @fd
+ * @flags    -- UNS_ASYNC or UNS_FDOUT (see above), 0 for a plain
+ *              synchronous call
+ * @arg      -- arguments blob passed to @call, @arg_size bytes long
+ * @fd       -- descriptor for @call to operate on, -1 for none
+ *
+ * Returns what @call returned for a synchronous call, the descriptor
+ * sent back for UNS_FDOUT, 0 once an UNS_ASYNC request has been sent,
+ * and -1 on error.
+ */
+int userns_call(uns_call_t call, int flags,
+		void *arg, size_t arg_size, int fd);
+
 #endif /* __CR_NS_H__ */
diff --git a/include/rst_info.h b/include/rst_info.h
index d509c0c..f8d3ca8 100644
--- a/include/rst_info.h
+++ b/include/rst_info.h
@@ -11,6 +11,7 @@ struct task_entries {
 	futex_t start;
 	mutex_t	zombie_lock;
 	atomic_t cr_err;
+	mutex_t userns_sync_lock;
 };
 
 struct fdt {
diff --git a/include/servicefd.h b/include/servicefd.h
index bdadc0f..3c6e08a 100644
--- a/include/servicefd.h
+++ b/include/servicefd.h
@@ -17,6 +17,7 @@ enum sfd_type {
 			 */
 	ROOT_FD_OFF,	/* Root of the namespace we dump/restore */
 	CGROUP_YARD,
+	USERNSD_SK,	/* Socket for usernsd */
 
 	SERVICE_FD_MAX
 };
diff --git a/namespaces.c b/namespaces.c
index c1cf069..8f5aaaf 100644
--- a/namespaces.c
+++ b/namespaces.c
@@ -4,6 +4,9 @@
 #include <stdlib.h>
 #include <sys/prctl.h>
 #include <grp.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdarg.h>
 
 #include "cr-show.h"
 #include "util.h"
@@ -835,6 +838,312 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map)
 	return 0;
 }
 
+struct unsc_msg {
+	struct msghdr h;	/* what is handed to sendmsg()/recvmsg() */
+	/*
+	 * 0th is the call address
+	 * 1st is the flags (the call's return value in a response)
+	 * 2nd is the optional (NULL in response) arguments
+	 */
+	struct iovec iov[3];
+	char c[CMSG_SPACE(sizeof(int))];	/* control buffer for one SCM_RIGHTS fd */
+};
+
+#define MAX_MSG_SIZE	256	/* largest arguments blob a single call may carry */
+
+static int usernsd_pid;	/* what fork() returned in start_usernsd(); reset to 0 by stop_usernsd() */
+
+/*
+ * Prepare @m for a single sendmsg()/recvmsg().
+ *
+ * @c and @x always go into the first two iovec slots, @arg (with
+ * @asize bytes) is added as the third one only when it's not NULL.
+ * Any @fd other than -1 is put into an SCM_RIGHTS control message;
+ * with -1 no control buffer is attached at all.
+ */
+static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c,
+		int *x, void *arg, size_t asize, int fd)
+{
+	struct cmsghdr *ch;
+
+	/* The socket is connected, no address is needed */
+	m->h.msg_name = NULL;
+	m->h.msg_namelen = 0;
+	m->h.msg_flags = 0;
+
+	/* Mandatory part: call address plus flags/result word */
+	m->iov[0].iov_base = c;
+	m->iov[0].iov_len = sizeof(*c);
+	m->iov[1].iov_base = x;
+	m->iov[1].iov_len = sizeof(*x);
+	m->h.msg_iov = m->iov;
+	m->h.msg_iovlen = 2;
+
+	/* Optional part: the arguments blob */
+	if (arg != NULL) {
+		m->iov[2].iov_base = arg;
+		m->iov[2].iov_len = asize;
+		m->h.msg_iovlen = 3;
+	}
+
+	if (fd == -1) {
+		m->h.msg_control = NULL;
+		m->h.msg_controllen = 0;
+		return;
+	}
+
+	m->h.msg_control = &m->c;
+	m->h.msg_controllen = sizeof(m->c);
+
+	ch = CMSG_FIRSTHDR(&m->h);
+	ch->cmsg_level = SOL_SOCKET;
+	ch->cmsg_type = SCM_RIGHTS;
+	ch->cmsg_len = CMSG_LEN(sizeof(int));
+	*((int *)CMSG_DATA(ch)) = fd;
+}
+
+/*
+ * Pull the descriptor out of the control message of @um.
+ * Returns -1 if there is no control message or its length is
+ * not the one of a single int.
+ */
+static int unsc_msg_fd(struct unsc_msg *um)
+{
+	struct cmsghdr *hdr = CMSG_FIRSTHDR(&um->h);
+
+	if (hdr == NULL || hdr->cmsg_len != CMSG_LEN(sizeof(int)))
+		return -1;
+
+	BUG_ON(hdr->cmsg_level != SOL_SOCKET);
+	BUG_ON(hdr->cmsg_type != SCM_RIGHTS);
+
+	return *((int *)CMSG_DATA(hdr));
+}
+
+/*
+ * The daemon body: receive requests from @sk one by one, run the
+ * requested routine and, for synchronous requests, send the result
+ * (and optionally a descriptor) back.
+ *
+ * Never returns 0 -- the loop is only left with -1 on a socket
+ * error, a malformed request or a failed asynchronous call. The
+ * normal way out is a called routine that exit()-s.
+ */
+static int usernsd(int sk)
+{
+	pr_info("UNS: Daemon started\n");
+
+	while (1) {
+		struct unsc_msg um;
+		static char msg[MAX_MSG_SIZE];
+		uns_call_t call;
+		int flags, fd, ret;
+		ssize_t len;
+
+		/*
+		 * The fd value itself doesn't matter here, it only
+		 * has to differ from -1 so that unsc_msg_init()
+		 * attaches the control buffer for recvmsg() to
+		 * put the caller's descriptor into.
+		 */
+		unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
+		len = recvmsg(sk, &um.h, 0);
+		if (len <= 0) {
+			pr_perror("UNS: recv req error");
+			return -1;
+		}
+
+		/*
+		 * Both the call address and the flags must have
+		 * arrived completely, otherwise we'd jump to a
+		 * garbage address below.
+		 */
+		if ((size_t)len < sizeof(call) + sizeof(flags)) {
+			pr_err("UNS: truncated request\n");
+			return -1;
+		}
+
+		fd = unsc_msg_fd(&um);
+		pr_debug("UNS: daemon calls %p (%d, %x)\n", call, fd, flags);
+
+		/*
+		 * Caller has sent us bare address of the routine it
+		 * wants to call. Since the caller is fork()-ed from the
+		 * same process as the daemon is, the latter has exactly
+		 * the same code at exactly the same address as the
+		 * former guy has. So go ahead and just call one!
+		 */
+
+		ret = call(msg, fd);
+
+		if (fd >= 0)
+			close(fd);
+
+		if (flags & UNS_ASYNC) {
+			/*
+			 * Async call failed and the caller doesn't know
+			 * about it. Exit now and let the stop_usernsd()
+			 * check the exit code and abort the restoration.
+			 *
+			 * We'd get there either by the end of restore or
+			 * from the next userns_call() due to failed
+			 * sendmsg() in there.
+			 */
+			if (ret < 0) {
+				pr_err("UNS: Async call failed. Exiting\n");
+				return -1;
+			}
+
+			continue;
+		}
+
+		/*
+		 * Only a valid descriptor may be attached to the
+		 * response. Any negative result means "no fd" -- the
+		 * caller will see -1 from unsc_msg_fd() -- instead
+		 * of making sendmsg() choke on a bad descriptor.
+		 */
+		if ((flags & UNS_FDOUT) && ret >= 0)
+			fd = ret;
+		else
+			fd = -1;
+
+		unsc_msg_init(&um, &call, &ret, NULL, 0, fd);
+		if (sendmsg(sk, &um.h, 0) <= 0) {
+			pr_perror("UNS: send resp error");
+			return -1;
+		}
+
+		/* The caller has its own copy by now */
+		if (fd >= 0)
+			close(fd);
+	}
+}
+
+/*
+ * Run @call(@arg, @fd) with the privileges of the usernsd daemon.
+ *
+ * When no user namespace is being restored the routine is simply
+ * called in place. Otherwise the request is sent over the
+ * USERNSD_SK service socket and, unless UNS_ASYNC is set, the
+ * response is waited for.
+ */
+int userns_call(uns_call_t call, int flags,
+		void *arg, size_t arg_size, int fd)
+{
+	struct unsc_msg req;
+	bool async = flags & UNS_ASYNC;
+	int sk, res, ret;
+
+	if (unlikely(arg_size > MAX_MSG_SIZE)) {
+		pr_err("UNS: message size exceeded\n");
+		return -1;
+	}
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return call(arg, fd);
+
+	sk = get_service_fd(USERNSD_SK);
+	pr_debug("UNS: calling %p (%d, %x)\n", call, fd, flags);
+
+	if (async) {
+		/*
+		 * An asynchronous caller doesn't read anything
+		 * back, so it has no way to get an FD from the
+		 * callback.
+		 */
+		BUG_ON(flags & UNS_FDOUT);
+	} else {
+		/*
+		 * Only synchronous callers take the lock. The
+		 * asynchronous ones just queue a request and
+		 * leave, so whenever the daemon responds there
+		 * is exactly one task sitting in the recvmsg()
+		 * below and the response reaches the proper one.
+		 */
+		mutex_lock(&task_entries->userns_sync_lock);
+	}
+
+	/* Request goes out */
+	unsc_msg_init(&req, &call, &flags, arg, arg_size, fd);
+	if (sendmsg(sk, &req.h, 0) <= 0) {
+		pr_perror("UNS: send req error");
+		ret = -1;
+		goto unlock;
+	}
+
+	if (async) {
+		ret = 0;
+		goto unlock;
+	}
+
+	/* Response comes in */
+	unsc_msg_init(&req, &call, &res, NULL, 0, 0);
+	if (recvmsg(sk, &req.h, 0) <= 0) {
+		pr_perror("UNS: recv resp error");
+		ret = -1;
+		goto unlock;
+	}
+
+	/* Either the descriptor or the plain return code is the result */
+	ret = (flags & UNS_FDOUT) ? unsc_msg_fd(&req) : res;
+unlock:
+	if (!async)
+		mutex_unlock(&task_entries->userns_sync_lock);
+
+	return ret;
+}
+
+/*
+ * Fork the usernsd daemon and install our end of the socket pair
+ * as the USERNSD_SK service descriptor.
+ *
+ * Does nothing when no user namespace is to be restored.
+ * Returns 0 on success and -1 on failure.
+ */
+int start_usernsd(void)
+{
+	int sk[2], ret;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return 0;
+
+	/*
+	 * Seqpacket to
+	 *
+	 * a) Help daemon distinguish individual requests from
+	 *    each other easily. Stream sockets require manual
+	 *    message boundaries.
+	 *
+	 * b) Make callers note the daemon death by seeing the
+	 *    disconnected socket. In case of dgram socket
+	 *    callers would just get stuck in receiving the
+	 *    response.
+	 */
+
+	if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) {
+		pr_perror("Can't make usernsd socket");
+		return -1;
+	}
+
+	usernsd_pid = fork();
+	if (usernsd_pid < 0) {
+		pr_perror("Can't fork usernsd");
+		/*
+		 * Don't leave -1 behind: stop_usernsd() would take
+		 * it for a running daemon and waitpid(-1) would reap
+		 * some unrelated child.
+		 */
+		usernsd_pid = 0;
+		close(sk[0]);
+		close(sk[1]);
+		return -1;
+	}
+
+	if (usernsd_pid == 0) {
+		/* The daemon only needs its own end */
+		close(sk[0]);
+		exit(usernsd(sk[1]));
+	}
+
+	close(sk[1]);
+	/*
+	 * NOTE(review): assumes install_service_fd() reports failure
+	 * with a negative value -- confirm against its definition.
+	 */
+	ret = install_service_fd(USERNSD_SK, sk[0]);
+	close(sk[0]);
+	if (ret < 0) {
+		pr_err("Can't install usernsd socket\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * The call stop_usernsd() sends to make the daemon terminate:
+ * exit with the int code @arg points to. @fd is not used.
+ */
+static int exit_usernsd(void *arg, int fd)
+{
+	int *codep = arg;
+
+	pr_info("UNS: `- daemon exits w/ %d\n", *codep);
+	exit(*codep);
+}
+
+/*
+ * Tell the daemon to exit and collect it.
+ *
+ * Returns 0 if there's no daemon to stop or it has exited with
+ * zero code, and -1 otherwise -- callers check for "ret < 0", so
+ * a bad exit code must not be handed out as a positive value.
+ */
+int stop_usernsd(void)
+{
+	int ret = 0;
+
+	if (usernsd_pid) {
+		int status = -1, code = 0;
+		sigset_t blockmask, oldmask;
+
+		/*
+		 * Don't let the sigchld_handler() mess with us
+		 * calling waitpid() on the exited daemon. The
+		 * same is done in cr_system().
+		 */
+
+		sigemptyset(&blockmask);
+		sigaddset(&blockmask, SIGCHLD);
+		sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+		/*
+		 * Send a message to make sure the daemon _has_
+		 * processed its whole queue of asynchronous requests.
+		 *
+		 * All the restoring processes might have already
+		 * closed their USERNSD_SK descriptors, but daemon
+		 * still has its in connected state -- this is us
+		 * who hold the last reference on the peer.
+		 *
+		 * If daemon has exited "in advance" due to async
+		 * call or socket error, the userns_call() below
+		 * fails and waitpid() reports the bad exit status.
+		 */
+
+		userns_call(exit_usernsd, UNS_ASYNC, &code, sizeof(code), -1);
+
+		if (waitpid(usernsd_pid, &status, 0) == usernsd_pid &&
+				WIFEXITED(status) && WEXITSTATUS(status) == 0)
+			ret = 0;
+		else
+			ret = -1;
+
+		usernsd_pid = 0;
+
+		/*
+		 * Put the original mask back. SIG_BLOCK would only
+		 * add to the blocked set and leave SIGCHLD blocked.
+		 */
+		sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+		if (ret != 0)
+			pr_err("UNS: daemon exited abnormally\n");
+		else
+			pr_info("UNS: daemon stopped\n");
+	}
+
+	return ret;
+}
+
 int prepare_userns(struct pstree_item *item)
 {
 	struct cr_img *img;
-- 
1.8.4.2



More information about the CRIU mailing list