[CRIU] [PATCH 04/10] net: Pre-create nl diag sk
Pavel Emelyanov
xemul at parallels.com
Mon Sep 29 11:04:25 PDT 2014
The setns() syscall (called by switch_ns()) can be extremely
slow. If we call it two or more times from the same task the
kernel will synchronously go on a very slow routine called
synchronize_rcu() trying to put a reference on old namespaces.
To avoid doing this more than once I propose to create all
per-ns sockets in one place with one setns call. In this
patch only the nl diag socket, used to collect other sockets,
is created this way.
Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
---
cr-check.c | 16 +++++++++++++++-
include/namespaces.h | 5 +++++
include/sockets.h | 3 ++-
net.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
sockets.c | 32 ++++++++------------------------
5 files changed, 79 insertions(+), 27 deletions(-)
diff --git a/cr-check.c b/cr-check.c
index 7647cac..cb3747b 100644
--- a/cr-check.c
+++ b/cr-check.c
@@ -1,4 +1,5 @@
#include <unistd.h>
+#include <linux/netlink.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/eventfd.h>
@@ -7,6 +8,7 @@
#include <sys/signalfd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
+#include <sys/socket.h>
#include <fcntl.h>
#include <signal.h>
#include <linux/if.h>
@@ -91,11 +93,23 @@ static int check_map_files(void)
return -1;
}
+#ifndef NETLINK_SOCK_DIAG
+#define NETLINK_SOCK_DIAG NETLINK_INET_DIAG
+#endif
+
static int check_sock_diag(void)
{
int ret;
+ struct ns_id ns;
+
+ ns.pid = 0;
+ ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+ if (ns.net.nlsk < 0) {
+ pr_perror("Can't make diag socket for check");
+ return -1;
+ }
- ret = collect_sockets(0);
+ ret = collect_sockets(&ns);
if (!ret)
return 0;
diff --git a/include/namespaces.h b/include/namespaces.h
index 83fba9a..09d631c 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -21,6 +21,11 @@ struct ns_id {
struct mount_info *mntinfo_list;
struct mount_info *mntinfo_tree;
} mnt;
+
+ struct {
+ int nlsk; /* for sockets collection */
+ int seqsk; /* to talk to parasite daemons */
+ } net;
};
};
extern struct ns_id *ns_ids;
diff --git a/include/sockets.h b/include/sockets.h
index 3a2fe81..105cb10 100644
--- a/include/sockets.h
+++ b/include/sockets.h
@@ -32,7 +32,8 @@ extern int restore_prepare_socket(int sk);
extern bool socket_test_collect_bit(unsigned int family, unsigned int proto);
extern int sk_collect_one(int ino, int family, struct socket_desc *d);
-extern int collect_sockets(int pid);
+struct ns_id;
+extern int collect_sockets(struct ns_id *);
extern int collect_inet_sockets(void);
extern struct collect_image_info unix_sk_cinfo;
extern int collect_unix_sockets(void);
diff --git a/net.c b/net.c
index 7cf3d2c..62c56a0 100644
--- a/net.c
+++ b/net.c
@@ -628,10 +628,58 @@ int veth_pair_add(char *in, char *out)
return 0;
}
+/*
+ * The setns() syscall (called by switch_ns()) can be extremely
+ * slow. If we call it two or more times from the same task the
+ * kernel will synchronously go on a very slow routine called
+ * synchronize_rcu() trying to put a reference on old namespaces.
+ *
+ * To avoid doing this more than once we pre-create all the
+ * needed other-ns sockets in advance.
+ */
+
+static int prep_ns_sockets(struct ns_id *ns)
+{
+ int nsret = -1, ret;
+
+ if (ns->pid != getpid()) {
+ pr_info("Switching to %d's net for collecting sockets\n", ns->pid);
+ if (switch_ns(ns->pid, &net_ns_desc, &nsret))
+ return -1;
+ }
+
+ ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+ if (ret < 0) {
+ pr_perror("Can't create sock diag socket");
+ goto err_nl;
+ }
+
+ ret = 0;
+out:
+ if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) {
+ nsret = -1;
+ if (ret == 0)
+ goto err_ret;
+ }
+
+ return ret;
+
+err_ret:
+ close(ns->net.nlsk);
+err_nl:
+ goto out;
+}
+
static int collect_net_ns(struct ns_id *ns)
{
+ int ret;
+
pr_info("Collecting netns %d/%d\n", ns->id, ns->pid);
- return collect_sockets(ns->pid);
+ ret = prep_ns_sockets(ns);
+ if (ret)
+ return ret;
+
+ return collect_sockets(ns);
}
int collect_net_namespaces(void)
diff --git a/sockets.c b/sockets.c
index 103774f..b36caef 100644
--- a/sockets.c
+++ b/sockets.c
@@ -515,27 +515,12 @@ static int do_collect_req(int nl, struct sock_diag_req *req, int size,
return tmp;
}
-int collect_sockets(int pid)
+int collect_sockets(struct ns_id *ns)
{
int err = 0, tmp;
- int rst = -1;
- int nl;
+ int nl = ns->net.nlsk;
struct sock_diag_req req;
- if (root_ns_mask & CLONE_NEWNET) {
- pr_info("Switching to %d's net for collecting sockets\n", pid);
-
- if (switch_ns(pid, &net_ns_desc, &rst))
- return -1;
- }
-
- nl = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
- if (nl < 0) {
- pr_perror("Can't create sock diag socket");
- err = -1;
- goto out;
- }
-
memset(&req, 0, sizeof(req));
req.hdr.nlmsg_len = sizeof(req);
req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY;
@@ -615,7 +600,7 @@ int collect_sockets(int pid)
tmp = do_collect_req(nl, &req, sizeof(req), packet_receive_one, NULL);
if (tmp) {
pr_warn("The current kernel doesn't support packet_diag\n");
- if (pid == 0 || tmp != -ENOENT) /* Fedora 19 */
+ if (ns->pid == 0 || tmp != -ENOENT) /* Fedora 19 */
err = tmp;
}
@@ -625,16 +610,15 @@ int collect_sockets(int pid)
tmp = do_collect_req(nl, &req, sizeof(req), netlink_receive_one, NULL);
if (tmp) {
pr_warn("The current kernel doesn't support netlink_diag\n");
- if (pid == 0 || tmp != -ENOENT) /* Fedora 19 */
+ if (ns->pid == 0 || tmp != -ENOENT) /* Fedora 19 */
err = tmp;
}
+ /* don't need anymore */
close(nl);
-out:
- if (rst >= 0) {
- if (restore_ns(rst, &net_ns_desc) < 0)
- err = -1;
- } else if (pid != 0) {
+ ns->net.nlsk = -1;
+
+ if (ns->pid == getpid()) {
/*
* If netns isn't dumped, criu will fail only
* if an unsupported socket will be really dumped.
--
1.8.4.2
More information about the CRIU
mailing list