[CRIU] [PATCH v4 32/41] pid: Create pid_ns helpers

Kirill Tkhai ktkhai at virtuozzo.com
Thu May 4 09:10:37 PDT 2017


Task may set last_pid only for its active pid namespace,
so if NSpid of a child contains more than one level, we
need external help to populate the whole pid hierarchy
(pid in parent pid_ns, pid in grand parent etc). Pid ns
helpers are used for that.

These are children of usernsd, which listen on a
socket and set the requested last pid in their active
pid_ns.

v4: Move destroy_pid_ns_helpers() before CR_STATE_RESTORE_SIGCHLD
change, as they must die before zombies.

v3: Block SIGCHLD during stopping of pid_ns helpers.

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 criu/cr-restore.c         |    7 +
 criu/include/namespaces.h |    3 +
 criu/namespaces.c         |  253 +++++++++++++++++++++++++++++++++++++++++++++
 criu/ns-common.c          |   51 +++++++++
 criu/pie/restorer.c       |    5 +
 5 files changed, 319 insertions(+)
 create mode 100644 criu/ns-common.c

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 4c4ca37d7..3c35e5f08 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1605,6 +1605,8 @@ static int restore_task_with_children(void *_arg)
 			pr_err("Can't add fd to fdstore\n");
 			return -1;
 		}
+		if (create_pid_ns_helper(pid_ns) < 0)
+			goto err;
 	}
 
 	if (restore_task_mnt_ns(current))
@@ -2038,6 +2040,10 @@ static int restore_root_task(struct pstree_item *init)
 			task_entries->nr_threads--;
 	}
 
+	ret = destroy_pid_ns_helpers();
+	if (ret < 0)
+		goto out_kill;
+
 	ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
 	if (ret < 0)
 		goto out_kill;
@@ -2141,6 +2147,7 @@ static int restore_root_task(struct pstree_item *init)
 	return 0;
 
 out_kill:
+	destroy_pid_ns_helpers();
 	/*
 	 * The processes can be killed only when all of them have been created,
 	 * otherwise an external proccesses can be killed.
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index 37b65b0db..b81957668 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -267,5 +267,8 @@ static inline int pid_ns_root_off(void)
 	return 0;
 }
 extern int reserve_pid_ns_helpers(void);
+extern int create_pid_ns_helper(struct ns_id *ns);
+extern int destroy_pid_ns_helpers(void);
+extern int request_set_next_pid(int pid_ns_id, pid_t pid, int sk);
 
 #endif /* __CR_NS_H__ */
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 97ea2b0e6..f65f06003 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -15,6 +15,7 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/ptrace.h>
+#include <sys/file.h>
 
 #include "page.h"
 #include "rst-malloc.h"
@@ -38,6 +39,11 @@
 #include "fdstore.h"
 #include "proc_parse.h"
 
+#define __sys(foo)	foo
+#define __sys_err(ret)	(-errno)
+
+#include "ns-common.c"
+
 static struct ns_desc *ns_desc_array[] = {
 	&net_ns_desc,
 	&uts_ns_desc,
@@ -49,6 +55,8 @@ static struct ns_desc *ns_desc_array[] = {
 };
 
 static unsigned int join_ns_flags;
+/* Creation of every helper is synchronized by userns_sync_lock */
+static int nr_pid_ns_helper_created = 0;
 
 int check_namespace_opts(void)
 {
@@ -2532,5 +2540,250 @@ int reserve_pid_ns_helpers(void)
 	return walk_namespaces(&pid_ns_desc, do_reserve_pid_ns_helpers, NULL);
 }
 
+static int pid_ns_helper_sock(struct ns_id *ns)
+{
+	struct sockaddr_un addr;
+	socklen_t len;
+	int sk;
+
+	sk = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (sk < 0) {
+		pr_perror("Can't create helper socket");
+		return -1;
+	}
+	pid_ns_helper_socket_name(&addr, &len, ns->id);
+
+	if (bind(sk, (struct sockaddr *)&addr, len) < 0) {
+		pr_perror("Can't bind pid_ns sock");
+		return -1;
+	}
+
+	return sk;
+}
+
+static int pid_ns_helper(struct ns_id *ns, int sk)
+{
+	struct sockaddr_un addr;
+	struct msghdr msg = {0};
+	struct iovec iov;
+	pid_t pid;
+
+	msg.msg_name = &addr;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	while (1) {
+		int answer = 0;
+		msg.msg_namelen = sizeof(addr);
+		iov.iov_base = &pid;
+		iov.iov_len = sizeof(pid);
+
+		if (recvmsg(sk, &msg, 0) < 0) {
+			pr_perror("recv() failed to read pid");
+			break;
+		}
+
+		if (pid != 0) {
+			if (__set_next_pid(pid) < 0) {
+				pr_err("Can't set next pid\n");
+				answer = -1;
+			}
+		}
+
+		iov.iov_base = &answer;
+		iov.iov_len = sizeof(answer);
+		if (sendmsg(sk, &msg, 0) < 0) {
+			pr_perror("Can't send answer");
+			break;
+		}
+
+		if (pid == 0)
+			return 0;
+	}
+
+	return -1;
+}
+
+static int do_create_pid_ns_helper(void *arg, int unused_fd, pid_t unused_pid)
+{
+	int pid_ns_fd, mnt_ns_fd, sk, fd, i, lock_fd, transport_fd;
+	struct ns_id *ns, *tmp;
+	struct pid *pid;
+	pid_t child;
+
+	pid_ns_fd = open_proc(PROC_SELF, "ns/pid");
+	if (pid_ns_fd < 0) {
+		pr_perror("Can't open pid ns");
+		return -1;
+	}
+	ns = *(struct ns_id **)arg;
+
+	fd = fdstore_get(ns->pid.nsfd_id);
+	if (fd < 0) {
+		pr_err("Can't get pid_ns fd\n");
+		return -1;
+	}
+	if (setns(fd, CLONE_NEWPID) < 0) {
+		pr_perror("Can't setns");
+		return -1;
+	}
+	close(fd);
+
+	sk = pid_ns_helper_sock(ns);
+	if (sk < 0)
+		return -1;
+
+	pid = __pstree_pid_by_virt(ns, ns->ns_pid);
+	if (!pid) {
+		pr_err("Can't find helper reserved pid\n");
+		return -1;
+	}
+
+	tmp = ns->parent;
+	if (tmp) {
+		futex_t *f = &tmp->pid.helper_created;
+		futex_wait_while_eq(f, 0);
+	}
+
+	if (switch_ns(root_item->pid->real, &mnt_ns_desc, &mnt_ns_fd) < 0) {
+		pr_err("Can't set mnt_ns\n");
+		return -1;
+	}
+
+	lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
+	if (lock_fd < 0)
+		return -1;
+
+	if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
+		pr_err("Can't restore ns\n");
+		return -1;
+	}
+
+	if (flock(lock_fd, LOCK_EX)) {
+		close(lock_fd);
+		pr_perror("Can't lock %s", LAST_PID_PATH);
+		return -1;
+	}
+
+	transport_fd = get_service_fd(TRANSPORT_FD_OFF);
+	/*
+	 * Starting not from pid->level - 1, as it's helper has not created yet
+	 * (we're creating it in the moment), and the true pid for this level
+	 * is set by the task, who does close(CLONE_NEWPID) (this task is sender of fd).
+	 */
+	for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
+		if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
+			pr_err("Can't set next pid using helper\n");
+			flock(lock_fd, LOCK_UN);
+			close(lock_fd);
+			return -1;
+		}
+	child = fork();
+	if (child < 0) {
+		flock(lock_fd, LOCK_UN);
+		close(lock_fd);
+		pr_perror("Can't fork");
+		return -1;
+	} else if (!child) {
+		close(lock_fd);
+		exit(pid_ns_helper(ns, sk));
+	}
+	close(sk);
+	futex_set_and_wake(&ns->pid.helper_created, 1);
+	flock(lock_fd, LOCK_UN);
+	close(lock_fd);
+	nr_pid_ns_helper_created++;
+
+	if (setns(pid_ns_fd, CLONE_NEWPID) < 0) {
+		pr_perror("Restore ns");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * A task may set last_pid only for its active pid namespace, so if NSpid
+ * of a child contains more than one level, we need external help to
+ * populate the whole pid hierarchy (pid in parent pid_ns, pid in grand
+ * parent etc). Pid ns helpers are used for that.
+ *
+ * We need a task or tasks to be a parent of pid_ns helpers. To live in the
+ * common hierarchy and to be a TASK_HELPER is not possible, because it
+ * introduces circular dependencies. The same goes for being children of the
+ * criu main task, because we already have dependencies between it and
+ * root_item (NO more dependencies!). So we choose usernsd for that: it
+ * always exists and has a command interface.
+ *
+ * Must run in the task whose pid is INIT_PID; returns 0 on success, -1 on error.
+ */
+int create_pid_ns_helper(struct ns_id *ns)
+{
+	BUG_ON(getpid() != INIT_PID);
+
+	if (__set_next_pid(ns->ns_pid) < 0) {
+		pr_err("Can't set next pid\n");
+		return -1;
+	}
+	if (userns_call(do_create_pid_ns_helper, 0, &ns, sizeof(ns), -1) < 0) {
+		pr_err("Can't create pid_ns helper\n");
+		return -1;
+	}
+	return 0;
+}
+
+/* userns_call() callback: send every pid_ns helper a pid 0 (stop) request and reap the stopped ones; 0 or -1. */
+static int do_destroy_pid_ns_helper(void *arg, int fd, pid_t pid)
+{
+	int transport_sk, wstatus, reaped, stopped = 0;
+	int mask_saved = true, err = 0;
+	sigset_t old_mask;
+	struct ns_id *cur;
+
+	if (nr_pid_ns_helper_created == 0)
+		return 0;
+
+	/* SIGCHLD is meant to stay blocked while the helpers exit and are waited for below */
+	if (block_sigmask(&old_mask, SIGCHLD)) {
+		mask_saved = false;
+		err = -1;
+	}
+
+	transport_sk = get_service_fd(TRANSPORT_FD_OFF);
+	for (cur = ns_ids; cur != NULL; cur = cur->next)
+		if (cur->nd == &pid_ns_desc && request_set_next_pid(cur->id, 0, transport_sk) == 0)
+			stopped++;
+
+	if (stopped != nr_pid_ns_helper_created) {
+		pr_err("Not all pid_ns helpers killed\n");
+		err = -1;
+	}
+
+	/* One waitpid() per helper that acknowledged the stop request */
+	for (reaped = 0; reaped < stopped; reaped++) {
+		if (waitpid(-1, &wstatus, 0) >= 0)
+			continue;
+		pr_perror("Error during waiting pid_ns helper");
+		err = -1;
+	}
+	nr_pid_ns_helper_created = 0;
+
+	if (mask_saved && restore_sigmask(&old_mask))
+		err = -1;
+
+	return err;
+}
+
+/* Stop all pid_ns helpers through userns_call(); no-op when root_ns_mask lacks CLONE_NEWPID. Returns 0 or -1. */
+int destroy_pid_ns_helpers(void)
+{
+	if (!(root_ns_mask & CLONE_NEWPID))
+		return 0;
+	if (userns_call(do_destroy_pid_ns_helper, 0, NULL, 0, -1) < 0) {
+		pr_err("Can't destroy pid_ns helpers\n");
+		return -1;
+	}
+	return 0;
+}
+
 struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
 struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
diff --git a/criu/ns-common.c b/criu/ns-common.c
new file mode 100644
index 000000000..a8e28aa00
--- /dev/null
+++ b/criu/ns-common.c
@@ -0,0 +1,51 @@
+#include <sys/socket.h>
+#include <sys/un.h>
+
+/* Fill addr and len with the fixed-length name "<NUL>/criu-pid-ns-<id>", NUL-padded to a 10-character id field. */
+void pid_ns_helper_socket_name(struct sockaddr_un *addr, socklen_t *len, unsigned int id)
+{
+	const char prefix[] = "0/criu-pid-ns-"; /* the leading '0' is overwritten with NUL below */
+	const unsigned int prefix_len = sizeof(prefix) - 1;
+	const unsigned int id_width = sizeof("2147483647") - 1;
+
+	addr->sun_family = AF_UNIX;
+	*len = sizeof(*addr) - sizeof(addr->sun_path) + prefix_len + id_width;
+	/* Pre-zero the id field so that the bytes after a short id are NUL */
+	memset(addr->sun_path + prefix_len, '\0', id_width);
+#ifdef CR_NOGLIBC
+	std_sprintf(addr->sun_path, "%s%d", prefix, id);
+#else
+	sprintf(addr->sun_path, "%s%d", prefix, id);
+#endif
+	addr->sun_path[0] = '\0';
+}
+
+/* Ask the helper of pid_ns pid_ns_id (over sk) to set its next pid to pid; pid 0 makes it exit. Returns 0 or -1. */
+int request_set_next_pid(int pid_ns_id, pid_t pid, int sk)
+{
+	struct sockaddr_un addr;
+	int answer, ret;
+	socklen_t len;
+
+	BUG_ON(pid == -1);
+
+	pid_ns_helper_socket_name(&addr, &len, pid_ns_id);
+	ret = __sys(sendto)(sk, &pid, sizeof(pid), 0, (struct sockaddr *)&addr, len);
+	if (ret < 0) {
+		pr_err("Can't send request: err=%d\n", __sys_err(ret));
+		return -1;
+	}
+
+	ret = __sys(recvfrom)(sk, &answer, sizeof(answer), 0, NULL, NULL);
+	if (ret < 0) {
+		pr_err("Can't recv answer: err=%d\n", __sys_err(ret));
+		return -1;
+	}
+	/* A short datagram leaves answer (partly) unset: fail instead of reading it */
+	if (ret != (int)sizeof(answer) || answer != 0) {
+		pr_err("Error answer\n");
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 030c7ff42..3b0b35710 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -47,6 +47,11 @@
 #include "restorer.h"
 #include "namespaces.h"
 
+#define __sys(foo)	sys_##foo
+#define __sys_err(ret)	ret
+
+#include "../ns-common.c"
+
 #ifndef PR_SET_PDEATHSIG
 #define PR_SET_PDEATHSIG 1
 #endif



More information about the CRIU mailing list