[CRIU] [PATCH v4 1/3] net/sysctl: c/r all except *.conf.* and *.neigh.*

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Wed Jul 20 07:27:56 PDT 2016


Add an array of the sysctls that we need to dump by name (those that are
r/w in a netns), along with their types, and add NamedSysctlEntry entries
so that sysctl name+value pairs can be dumped to the image.

Skip:
1. sysctls that are not both readable and writable, as we can do nothing
for them through procfs
2. the conf and neigh directories: they are per-device and will be
restored separately, after the devices are restored; they may also need
special care, e.g. conf sysctls must be restored in a particular order
3. nf_log.xx if it is "NONE": we cannot set it with sysctl_op because
sysctl_write_char appends "\n" to the value and nf_log_proc_dostring
does not accept a trailing '\n' for now; to fix that, the kernel patch
"[v2]netfilter: nf_log: fix error on write NONE to logger choice sysctl"
has been sent

Not skipped: "igmp_link_local_mcast_reports" - it is !DANGEROUS! to use
criu on kernels v4.3-v4.5; @xemul: "we just suggest one to add the fixing
patch to kernel". The fixing patch is commit 87a8a2ae65b7 ("igmp:
Namespaceify igmp_llm_reports sysctl knob"); see more in RHBZ#1352177

*We have now 32 such net.* sysctls writable in VZ7 CT
https://jira.sw.ru/browse/PSBM-48397

v3: skip igmp_link_local_mcast_reports
v4: do not readdir /proc/sys/net but use a static list of sysctls; do
not skip igmp_link_local_mcast_reports

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 criu/net.c          | 293 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 images/netdev.proto |  14 +--
 images/sysctl.proto |   5 +
 3 files changed, 305 insertions(+), 7 deletions(-)

diff --git a/criu/net.c b/criu/net.c
index a6d5f00..6822c04 100644
--- a/criu/net.c
+++ b/criu/net.c
@@ -1144,10 +1144,264 @@ static inline int dump_iptables(struct cr_imgset *fds)
 	return 0;
 }
 
+#define NR_CTL_STR 16	/* must equal the number of SYSCTL_TYPE__CTL_STR rows in sysctl_net[] */
+/* Net sysctls handled by name: path relative to /proc/sys plus the value type */
+struct sysctl_info {
+	char		*name;
+	SysctlType	type;
+} sysctl_net[] = {
+	{"net/core/somaxconn", SYSCTL_TYPE__CTL_32},
+	/* default 30 - Hard timeout in seconds for acquire requests */
+	{"net/core/xfrm_acq_expires", SYSCTL_TYPE__CTL_32},
+	/* default 10 - Default threshold timeout to ratelimit xfrm events */
+	{"net/core/xfrm_aevent_etime", SYSCTL_TYPE__CTL_32},
+	/* default 2 - Default threshold for packets to ratelimit xfrm events */
+	{"net/core/xfrm_aevent_rseqth", SYSCTL_TYPE__CTL_32},
+	{"net/core/xfrm_larval_drop", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/fwmark_reflect", SYSCTL_TYPE__CTL_32},
+	/* default 0 - Ignore ICMP ECHO requests */
+	{"net/ipv4/icmp_echo_ignore_all", SYSCTL_TYPE__CTL_32},
+	/* default 1 - Ignore ICMP ECHO and TIMESTAMP requests sent via broadcast/multicast */
+	{"net/ipv4/icmp_echo_ignore_broadcasts", SYSCTL_TYPE__CTL_32},
+	/* default 0 - Send back ICMP errors */
+	{"net/ipv4/icmp_errors_use_inbound_ifaddr", SYSCTL_TYPE__CTL_32},
+	/* default 0 - Disable RFC1122 violation warnings */
+	{"net/ipv4/icmp_ignore_bogus_error_responses", SYSCTL_TYPE__CTL_32},
+	/* default 1000 - Limit the maximal rates for sending ICMP packets whose type matches icmp_ratemask */
+	{"net/ipv4/icmp_ratelimit", SYSCTL_TYPE__CTL_32},
+	/* default 6168 - Bitmask(19bit) to match ICMP types */
+	{"net/ipv4/icmp_ratemask", SYSCTL_TYPE__CTL_32},
+	/* default 1 - Enable IGMP reports for local multicast groups
+	 * in v4.3-v4.5 kernels changing it can lead to kernel crashes in random places
+	 * fix: commit 87a8a2ae65b7 ("igmp: Namespaceify igmp_llm_reports sysctl knob") */
+	{"net/ipv4/igmp_link_local_mcast_reports", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/igmp_max_memberships", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/igmp_max_msf", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/igmp_qrv", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ip_default_ttl", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ip_dynaddr", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ip_early_demux", SYSCTL_TYPE__CTL_32},
+	/* default 0 - Enable ip packets forwarding between interfaces, is special RFC1122 RFC1812 */
+	{"net/ipv4/ip_forward", SYSCTL_TYPE__CTL_32},
+	/* default 0 - Enable path mtus information for special userspace software */
+	{"net/ipv4/ip_forward_use_pmtu", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ip_local_port_range", SYSCTL_TYPE__CTL_STR},
+	{"net/ipv4/ip_local_reserved_ports", SYSCTL_TYPE__CTL_STR},
+	{"net/ipv4/ip_no_pmtu_disc", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ip_nonlocal_bind", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ipfrag_high_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ipfrag_low_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ipfrag_max_dist", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ipfrag_time", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/ping_group_range", SYSCTL_TYPE__CTL_STR},
+	{"net/ipv4/tcp_base_mss", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_ecn", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_ecn_fallback", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_fin_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_fwmark_accept", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_keepalive_intvl", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_keepalive_probes", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_keepalive_time", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_mtu_probing", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_notsent_lowat", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_orphan_retries", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_probe_interval", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_probe_threshold", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_reordering", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_retries1", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_retries2", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_syn_retries", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_synack_retries", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/tcp_syncookies", SYSCTL_TYPE__CTL_32},
+	{"net/ipv4/xfrm4_gc_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/anycast_src_echo_reply", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/auto_flowlabels", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/bindv6only", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/flowlabel_consistency", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/flowlabel_state_ranges", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/fwmark_reflect", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/icmp/ratelimit", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/idgen_delay", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/idgen_retries", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/ip6frag_high_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/ip6frag_low_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/ip6frag_time", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/ip_nonlocal_bind", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/gc_elasticity", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/gc_interval", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/gc_min_interval", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/gc_min_interval_ms", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/gc_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/gc_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/max_size", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/min_adv_mss", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/route/mtu_expires", SYSCTL_TYPE__CTL_32},
+	{"net/ipv6/xfrm6_gc_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_acct", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_checksum", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_events", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_expect_max", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_frag6_high_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_frag6_low_thresh", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_frag6_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_generic_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_helper", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_icmp_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_icmpv6_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_log_invalid", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_max", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_be_liberal", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_loose", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_max_retrans", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_close", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_close_wait", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_established", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_fin_wait", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_last_ack", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_max_retrans", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_syn_recv", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_syn_sent", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_time_wait", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_tcp_timeout_unacknowledged", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_timestamp", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_udp_timeout", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_conntrack_udp_timeout_stream", SYSCTL_TYPE__CTL_32},
+	{"net/unix/max_dgram_qlen", SYSCTL_TYPE__CTL_32},
+	{"net/netfilter/nf_log/0", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/1", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/2", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/3", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/4", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/5", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/6", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/7", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/8", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/9", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/10", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/11", SYSCTL_TYPE__CTL_STR},
+	{"net/netfilter/nf_log/12", SYSCTL_TYPE__CTL_STR},
+};
+
+/*
+ * We have only two sysctls longer than 256:
+ * /proc/sys/dev/cdrom/info - CDROM_STR_SIZE=1000
+ * /proc/sys/net/ipv4/tcp_allowed_congestion_control - TCP_CA_BUF_MAX=2048
+ * first one is readonly and second is hostonly
+ */
+#define PROC_ARG_MAX_LEN 257
+
+/*
+ * Read each sysctl in netns->nses[] into the storage its entry points to.
+ * When a request lacks CTL_FLAGS_HAS afterwards, an int entry keeps has_iarg
+ * unset and a string entry gets sarg = NULL.  Returns sysctl_op()'s result.
+ */
+static int dump_netns_sysctls(NetnsEntry *netns)
+{
+	struct sysctl_req req[ARRAY_SIZE(sysctl_net)];
+	int i, ret, nr = ARRAY_SIZE(sysctl_net);
+
+	for (i = 0; i < nr; i++) {
+		SysctlEntry *se = netns->nses[i]->se;
+
+		req[i].name = netns->nses[i]->name;
+		req[i].flags = CTL_FLAGS_OPTIONAL;
+		if (se->type == SYSCTL_TYPE__CTL_32) {
+			req[i].type = CTL_32;
+			req[i].arg = &se->iarg;
+		} else if (se->type == SYSCTL_TYPE__CTL_STR) {
+			req[i].type = CTL_STR(PROC_ARG_MAX_LEN);
+			req[i].arg = se->sarg;
+		}
+	}
+
+	ret = sysctl_op(req, nr, CTL_READ, CLONE_NEWNET);
+	if (ret != 0) {
+		pr_err("Failed to read net sysctls\n");
+		return ret;
+	}
+
+	for (i = 0; i < nr; i++) {
+		SysctlEntry *se = netns->nses[i]->se;
+		size_t len;
+
+		if (!(req[i].flags & CTL_FLAGS_HAS)) {
+			if (se->type == SYSCTL_TYPE__CTL_STR)
+				se->sarg = NULL;
+		} else if (se->type == SYSCTL_TYPE__CTL_32) {
+			se->has_iarg = true;
+		} else if (se->type == SYSCTL_TYPE__CTL_STR) {
+			/* Strip trailing newline; an empty value has none */
+			len = strlen(se->sarg);
+			if (len > 0 && se->sarg[len - 1] == '\n')
+				se->sarg[len - 1] = '\0';
+			/* Skip nf_log/xx only if it holds the default "NONE" */
+			if (strstr(req[i].name, "net/netfilter/nf_log") &&
+			    !strcmp(se->sarg, "NONE"))
+				se->sarg = NULL;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Max sysctl path is 70 chars:
+ * "/proc/sys/net/ipv4/conf/virbr0-nic/igmpv2_unsolicited_report_interval"
+ */
+#define PROC_PATH_MAX_LEN 100
+
+/*
+ * Write the sysctls saved in netns->nses[] back via sysctl_op(), skipping
+ * entries without a value or whose /proc/sys file is not writable.  Names and
+ * count come from the image: return -1 if they would overrun path[] or req[].
+ */
+static int restore_netns_sysctls(NetnsEntry *netns)
+{
+	struct sysctl_req req[ARRAY_SIZE(sysctl_net)];
+	int i, ri = 0, ret, max = ARRAY_SIZE(req);
+
+	for (i = 0; i < netns->n_nses; i++) {
+		NamedSysctlEntry *nse = netns->nses[i];
+		char path[PROC_PATH_MAX_LEN];
+
+		ret = snprintf(path, sizeof(path), "/proc/sys/%s", nse->name);
+		if (ret < 0 || ret >= PROC_PATH_MAX_LEN || ri >= max) {
+			pr_err("Net sysctl %s does not fit local buffers\n", nse->name);
+			return -1;
+		}
+		/* Skip restore not writable sysctls */
+		if (access(path, W_OK) != 0)
+			continue;
+		switch (nse->se->type) {
+		case SYSCTL_TYPE__CTL_32:
+			if (!nse->se->has_iarg)
+				continue;	/* skip non-existing sysctl */
+			req[ri].type = CTL_32;
+			req[ri].arg = &nse->se->iarg;
+			break;
+		case SYSCTL_TYPE__CTL_STR:
+			if (!nse->se->sarg)
+				continue;	/* skip non-existing sysctl */
+			req[ri].type = CTL_STR(strlen(nse->se->sarg));
+			req[ri].arg = nse->se->sarg;
+			break;
+		default:
+			continue;
+		}
+		req[ri].name = nse->name;
+		req[ri].flags = 0;
+		ri++;
+	}
+
+	ret = sysctl_op(req, ri, CTL_WRITE, CLONE_NEWNET);
+	if (ret < 0)
+		pr_err("Failed to write net sysctls\n");
+	return ret;
+}
+
 static int dump_netns_conf(struct cr_imgset *fds)
 {
 	int ret = -1;
-	int i;
+	int i, j;
 	NetnsEntry netns = NETNS_ENTRY__INIT;
 	SysctlEntry *def_confs4 = NULL, *all_confs4 = NULL;
 	int size4 = ARRAY_SIZE(devconfs4);
@@ -1156,6 +1410,11 @@ static int dump_netns_conf(struct cr_imgset *fds)
 	char def_stable_secret[MAX_STR_CONF_LEN + 1] = {};
 	char all_stable_secret[MAX_STR_CONF_LEN + 1] = {};
 
+	NamedSysctlEntry	*pnses[ARRAY_SIZE(sysctl_net)];
+	NamedSysctlEntry	nses[ARRAY_SIZE(sysctl_net)];
+	SysctlEntry		ses[ARRAY_SIZE(sysctl_net)];
+	char			sargs[NR_CTL_STR][PROC_ARG_MAX_LEN];
+
 	netns.n_def_conf4 = size4;
 	netns.n_all_conf4 = size4;
 	netns.def_conf4 = xmalloc(sizeof(SysctlEntry *) * size4);
@@ -1211,6 +1470,32 @@ static int dump_netns_conf(struct cr_imgset *fds)
 		}
 	}
 
+	netns.nses = pnses;
+	netns.n_nses = ARRAY_SIZE(sysctl_net);
+	for (i = 0, j = 0; i < netns.n_nses; i++) {
+		/* Init PB */
+		pnses[i] = &nses[i];
+		named_sysctl_entry__init(pnses[i]);
+		nses[i].name = sysctl_net[i].name;
+		nses[i].se = &ses[i];
+		sysctl_entry__init(nses[i].se);
+		ses[i].type = sysctl_net[i].type;
+		if (ses[i].type == SYSCTL_TYPE__CTL_32)
+			ses[i].sarg = NULL;
+		else if (ses[i].type == SYSCTL_TYPE__CTL_STR) {
+			/*
+			 * We should always have NR_CTL_STR equal to
+			 * the number of string sysctls
+			 */
+			BUG_ON(j >= NR_CTL_STR);
+			ses[i].sarg = sargs[j++];
+		}
+	}
+
+	ret = dump_netns_sysctls(&netns);
+	if (ret < 0)
+		goto err_free;
+
 	ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL);
 	if (ret < 0)
 		goto err_free;
@@ -1347,6 +1632,12 @@ static int restore_netns_conf(int pid, NetnsEntry **netns)
 		return -1;
 	}
 
+	if ((*netns)->nses) {
+		ret = restore_netns_sysctls(*netns);
+		if (ret)
+			goto out;
+	}
+
 	if ((*netns)->def_conf4) {
 		ret = ipv4_conf_op("all", (*netns)->all_conf4, (*netns)->n_all_conf4, CTL_WRITE, NULL);
 		if (ret)
diff --git a/images/netdev.proto b/images/netdev.proto
index 19b501c..08f7eb9 100644
--- a/images/netdev.proto
+++ b/images/netdev.proto
@@ -41,12 +41,14 @@ message net_device_entry {
 }
 
 message netns_entry {
-	repeated int32 def_conf		= 1;
-	repeated int32 all_conf		= 2;
+	repeated int32 def_conf			= 1;
+	repeated int32 all_conf			= 2;
 
-	repeated sysctl_entry def_conf4	= 3;
-	repeated sysctl_entry all_conf4	= 4;
+	repeated sysctl_entry def_conf4		= 3;
+	repeated sysctl_entry all_conf4		= 4;
 
-	repeated sysctl_entry def_conf6	= 5;
-	repeated sysctl_entry all_conf6	= 6;
+	repeated sysctl_entry def_conf6		= 5;
+	repeated sysctl_entry all_conf6		= 6;
+
+	repeated named_sysctl_entry nses	= 7;
 }
diff --git a/images/sysctl.proto b/images/sysctl.proto
index 4ecdf27..5927386 100644
--- a/images/sysctl.proto
+++ b/images/sysctl.proto
@@ -11,3 +11,8 @@ message sysctl_entry {
 	optional int32 iarg		= 2;
 	optional string sarg		= 3;
 }
+
+message named_sysctl_entry {
+	required string name		= 1;	// sysctl path relative to /proc/sys
+	required sysctl_entry se	= 2;	// value stored for that sysctl
+}
-- 
2.5.5



More information about the CRIU mailing list