[CRIU] [PATCH 04/10] net: Pre-create nl diag sk

Pavel Emelyanov xemul at parallels.com
Mon Sep 29 11:04:25 PDT 2014


The setns() syscall (called by switch_ns()) can be extremely
slow. If we call it two or more times from the same task the
kernel will synchronously go on a very slow routine called
synchronize_rcu() trying to put a reference on old namespaces.

To avoid doing this more than once I propose to create all
per-ns sockets in one place with one setns call. In this
patch only the nl diag socket, used to collect other sockets,
is created this way.

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
---
 cr-check.c           | 16 +++++++++++++++-
 include/namespaces.h |  5 +++++
 include/sockets.h    |  3 ++-
 net.c                | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 sockets.c            | 32 ++++++++------------------------
 5 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/cr-check.c b/cr-check.c
index 7647cac..cb3747b 100644
--- a/cr-check.c
+++ b/cr-check.c
@@ -1,4 +1,5 @@
 #include <unistd.h>
+#include <linux/netlink.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <sys/eventfd.h>
@@ -7,6 +8,7 @@
 #include <sys/signalfd.h>
 #include <sys/ptrace.h>
 #include <sys/wait.h>
+#include <sys/socket.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <linux/if.h>
@@ -91,11 +93,23 @@ static int check_map_files(void)
 	return -1;
 }
 
+#ifndef NETLINK_SOCK_DIAG
+#define NETLINK_SOCK_DIAG NETLINK_INET_DIAG
+#endif
+
 static int check_sock_diag(void)
 {
 	int ret;
+	struct ns_id ns;
+
+	ns.pid = 0;
+	ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+	if (ns.net.nlsk < 0) {
+		pr_perror("Can't make diag socket for check");
+		return -1;
+	}
 
-	ret = collect_sockets(0);
+	ret = collect_sockets(&ns);
 	if (!ret)
 		return 0;
 
diff --git a/include/namespaces.h b/include/namespaces.h
index 83fba9a..09d631c 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -21,6 +21,11 @@ struct ns_id {
 			struct mount_info *mntinfo_list;
 			struct mount_info *mntinfo_tree;
 		} mnt;
+
+		struct {
+			int nlsk;	/* for sockets collection */
+			int seqsk;	/* to talk to parasite daemons */
+		} net;
 	};
 };
 extern struct ns_id *ns_ids;
diff --git a/include/sockets.h b/include/sockets.h
index 3a2fe81..105cb10 100644
--- a/include/sockets.h
+++ b/include/sockets.h
@@ -32,7 +32,8 @@ extern int restore_prepare_socket(int sk);
 extern bool socket_test_collect_bit(unsigned int family, unsigned int proto);
 
 extern int sk_collect_one(int ino, int family, struct socket_desc *d);
-extern int collect_sockets(int pid);
+struct ns_id;
+extern int collect_sockets(struct ns_id *);
 extern int collect_inet_sockets(void);
 extern struct collect_image_info unix_sk_cinfo;
 extern int collect_unix_sockets(void);
diff --git a/net.c b/net.c
index 7cf3d2c..62c56a0 100644
--- a/net.c
+++ b/net.c
@@ -628,10 +628,58 @@ int veth_pair_add(char *in, char *out)
 	return 0;
 }
 
+/*
+ * The setns() syscall (called by switch_ns()) can be extremely
+ * slow. If we call it two or more times from the same task the
+ * kernel will synchronously go on a very slow routine called
+ * synchronize_rcu() trying to put a reference on old namespaces.
+ *
+ * To avoid doing this more than once we pre-create all the
+ * needed other-ns sockets in advance.
+ */
+
+static int prep_ns_sockets(struct ns_id *ns)
+{
+	int nsret = -1, ret;
+
+	if (ns->pid != getpid()) {
+		pr_info("Switching to %d's net for collecting sockets\n", ns->pid);
+		if (switch_ns(ns->pid, &net_ns_desc, &nsret))
+			return -1;
+	}
+
+	ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+	if (ret < 0) {
+		pr_perror("Can't create sock diag socket");
+		goto err_nl;
+	}
+
+	ret = 0;
+out:
+	if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) {
+		nsret = -1;
+		if (ret == 0)
+			goto err_ret;
+	}
+
+	return ret;
+
+err_ret:
+	close(ns->net.nlsk);
+err_nl:
+	goto out;
+}
+
 static int collect_net_ns(struct ns_id *ns)
 {
+	int ret;
+
 	pr_info("Collecting netns %d/%d\n", ns->id, ns->pid);
-	return collect_sockets(ns->pid);
+	ret = prep_ns_sockets(ns);
+	if (ret)
+		return ret;
+
+	return collect_sockets(ns);
 }
 
 int collect_net_namespaces(void)
diff --git a/sockets.c b/sockets.c
index 103774f..b36caef 100644
--- a/sockets.c
+++ b/sockets.c
@@ -515,27 +515,12 @@ static int do_collect_req(int nl, struct sock_diag_req *req, int size,
 	return tmp;
 }
 
-int collect_sockets(int pid)
+int collect_sockets(struct ns_id *ns)
 {
 	int err = 0, tmp;
-	int rst = -1;
-	int nl;
+	int nl = ns->net.nlsk;
 	struct sock_diag_req req;
 
-	if (root_ns_mask & CLONE_NEWNET) {
-		pr_info("Switching to %d's net for collecting sockets\n", pid);
-
-		if (switch_ns(pid, &net_ns_desc, &rst))
-			return -1;
-	}
-
-	nl = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
-	if (nl < 0) {
-		pr_perror("Can't create sock diag socket");
-		err = -1;
-		goto out;
-	}
-
 	memset(&req, 0, sizeof(req));
 	req.hdr.nlmsg_len	= sizeof(req);
 	req.hdr.nlmsg_type	= SOCK_DIAG_BY_FAMILY;
@@ -615,7 +600,7 @@ int collect_sockets(int pid)
 	tmp = do_collect_req(nl, &req, sizeof(req), packet_receive_one, NULL);
 	if (tmp) {
 		pr_warn("The current kernel doesn't support packet_diag\n");
-		if (pid == 0 || tmp != -ENOENT) /* Fedora 19 */
+		if (ns->pid == 0 || tmp != -ENOENT) /* Fedora 19 */
 			err = tmp;
 	}
 
@@ -625,16 +610,15 @@ int collect_sockets(int pid)
 	tmp = do_collect_req(nl, &req, sizeof(req), netlink_receive_one, NULL);
 	if (tmp) {
 		pr_warn("The current kernel doesn't support netlink_diag\n");
-		if (pid == 0 || tmp != -ENOENT) /* Fedora 19 */
+		if (ns->pid == 0 || tmp != -ENOENT) /* Fedora 19 */
 			err = tmp;
 	}
 
+	/* don't need anymore */
 	close(nl);
-out:
-	if (rst >= 0) {
-		if (restore_ns(rst, &net_ns_desc) < 0)
-			err = -1;
-	} else if (pid != 0) {
+	ns->net.nlsk = -1;
+
+	if (ns->pid == getpid()) {
 		/*
 		 * If netns isn't dumped, criu will fail only
 		 * if an unsupported socket will be really dumped.
-- 
1.8.4.2




More information about the CRIU mailing list