[Devel] [PATCH vz9 15/20] ve/netfilter: Implement pernet net->ct.max / virtualize "nf_conntrack_max" sysctl

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Wed Oct 13 18:26:26 MSK 2021


From: Konstantin Khorenko <khorenko at virtuozzo.com>

Rebasing and splitting the netfilter subsystem
(port of 66-diff-ve-net-netfilter-combined).
Part 1.

https://jira.sw.ru/browse/PSBM-18322

Signed-off-by: Kirill Tkhai <ktkhai at parallels.com>

(cherry picked from vz7 commit c34a99c00f9d ("ve/netfilter: Implement
pernet net->ct.max / virtualize "nf_conntrack_max" sysctl"))

VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127783

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>

+++
ve/nf_conntrack: expose "nf_conntrack_max" in containers

Series:
This series brings to vz7 all the nf_conntrack sysctls
that are available in vz6.

https://jira.sw.ru/browse/PSBM-40044

This sysctl table contains only one entry: "/proc/sys/net/nf_conntrack_max".
It is now visible inside a container.
Note, however, that "/proc/sys/net/netfilter/nf_conntrack_max" and
friends (even though they are containerized) remain behind init_user_ns.
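
A quick way to verify this from inside a container is simply reading the
virtualized file; a minimal user-space sketch (illustrative only, not part
of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int max = 0;
		FILE *f = fopen("/proc/sys/net/nf_conntrack_max", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fscanf(f, "%u", &max) != 1) {
			fclose(f);
			fprintf(stderr, "failed to parse value\n");
			return 1;
		}
		fclose(f);
		printf("nf_conntrack_max: %u\n", max);
		return 0;
	}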

Signed-off-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>

(cherry picked from vz7 commit 9d3a8c692557 ("ve/nf_conntrack: expose
"nf_conntrack_max" in containers"))

VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127783

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>

Ported vz8 commit 4c888c1fa5e4 ("ve/netfilter: Implement pernet
net->ct.max / virtualize "nf_conntrack_max" sysctl")

The policy of using init_net's value as the upper limit for the value
settable in other namespaces is not fully reliable:
- if init_net's value is lowered, some namespaces can end up with a
  value above the (new) limit,
- the "zero = unlimited" semantics is not honoured.

Because of that, the set-time limit is removed; instead, the value is
checked at runtime against both init_net's limit and the per-namespace
limit.
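
In essence, the allocation path now treats a limit of 0 as "unlimited" and
considers the count over limit only when it exceeds a non-zero limit, for
either the namespace or init_net.  A simplified user-space sketch of these
semantics (the helper name and sample numbers are illustrative only):

	#include <stdio.h>
	#include <stdbool.h>

	/* A limit of 0 means "unlimited"; otherwise the count must not
	 * exceed the limit.  The check is applied to both the
	 * per-namespace limit and init_net's limit.
	 */
	static bool over_limit(unsigned int count, unsigned int max)
	{
		return max && count > max;
	}

	int main(void)
	{
		unsigned int count = 70000;	/* current conntrack count */
		unsigned int ns_max = 65536;	/* per-namespace limit */
		unsigned int init_max = 0;	/* init_net limit: unlimited */

		if (over_limit(count, ns_max) || over_limit(count, init_max))
			printf("over limit: attempt early drop / fail allocation\n");
		else
			printf("allocation allowed\n");
		return 0;
	}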

Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 include/net/netfilter/nf_conntrack.h    |  4 +-
 net/netfilter/nf_conntrack_core.c       | 35 +++++++++---
 net/netfilter/nf_conntrack_netlink.c    |  9 ++--
 net/netfilter/nf_conntrack_standalone.c | 72 ++++++++++++++++---------
 4 files changed, 81 insertions(+), 39 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 42dd967fdfbb..81983ac7e28a 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -46,6 +46,7 @@ union nf_conntrack_expect_proto {
 struct nf_conntrack_net {
 	/* only used when new connection is allocated: */
 	atomic_t count;
+	unsigned int max;
 	unsigned int expect_count;
 	unsigned int expect_max;
 	u8 sysctl_auto_assign_helper;
@@ -57,6 +58,7 @@ struct nf_conntrack_net {
 	unsigned int users_bridge;
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_header;
+	struct ctl_table_header	*parent_sysctl_header;
 #endif
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	struct delayed_work ecache_dwork;
@@ -314,7 +316,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -340,6 +341,7 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
 u32 nf_ct_get_id(const struct nf_conn *ct);
 u32 nf_conntrack_count(const struct net *net);
+u32 nf_conntrack_max(const struct net *net);
 
 static inline void
 nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 68209532f0be..8dc77131f2bc 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -183,11 +183,11 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
+static unsigned int initial_nf_conntrack_max __ro_after_init;
+
 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 			      const struct net *net)
 {
@@ -1361,14 +1361,15 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 static void gc_worker(struct work_struct *work)
 {
 	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz, init_nf_conntrack_max95 = 0;
 	unsigned long next_run = GC_SCAN_INTERVAL;
 	struct conntrack_gc_work *gc_work;
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
 	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+		init_nf_conntrack_max95 =
+				nf_ct_pernet(&init_net)->max / 100u * 95u;
 
 	do {
 		struct nf_conntrack_tuple_hash *h;
@@ -1387,6 +1388,8 @@ static void gc_worker(struct work_struct *work)
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
 			struct nf_conntrack_net *cnet;
 			struct net *net;
+			unsigned int nf_conntrack_max95 = 0;
+			unsigned int ct_count;
 
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 
@@ -1400,12 +1403,21 @@ static void gc_worker(struct work_struct *work)
 				continue;
 			}
 
-			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
+			if (gc_worker_skip_ct(tmp))
 				continue;
 
 			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
-			if (atomic_read(&cnet->count) < nf_conntrack_max95)
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = cnet->max / 100u * 95u;
+
+			/* skip if cnet->count is small enough against both
+			 * global and per-ns limit */
+			ct_count = atomic_read(&cnet->count);
+			if ((nf_conntrack_max95 == 0 ||
+					ct_count < nf_conntrack_max95) &&
+			    (init_nf_conntrack_max95 == 0 ||
+					ct_count < init_nf_conntrack_max95))
 				continue;
 
 			/* need to take reference to avoid possible races */
@@ -1469,13 +1481,15 @@ __nf_conntrack_alloc(struct net *net,
 		     gfp_t gfp, u32 hash)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+	struct nf_conntrack_net *init_cnet = nf_ct_pernet(&init_net);
 	unsigned int ct_count;
 	struct nf_conn *ct;
 
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if ((cnet->max && unlikely(ct_count > cnet->max)) ||
+	    (init_cnet->max && unlikely(ct_count > init_cnet->max))) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
@@ -2624,7 +2638,7 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_hash)
 		return -ENOMEM;
 
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	initial_nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
@@ -2726,6 +2740,11 @@ int nf_conntrack_init_net(struct net *net)
 	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
 	atomic_set(&cnet->count, 0);
 
+	if (net == &init_net)
+		cnet->max = initial_nf_conntrack_max;
+	else
+		cnet->max = nf_ct_pernet(&init_net)->max;
+
 	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
 	if (!net->ct.pcpu_lists)
 		goto err_stat;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index e81af33b233b..b741d9ef5aa9 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2542,7 +2542,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 			    struct net *net)
 {
 	unsigned int flags = portid ? NLM_F_MULTI : 0, event;
-	unsigned int nr_conntracks;
+	unsigned int conntrack_count, conntrack_max;
 	struct nlmsghdr *nlh;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
@@ -2551,11 +2551,12 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (!nlh)
 		goto nlmsg_failure;
 
-	nr_conntracks = nf_conntrack_count(net);
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
+	conntrack_count = nf_conntrack_count(net);
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(conntrack_count)))
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+	conntrack_max = nf_conntrack_max(net);
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(conntrack_max)))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9340a3c993f0..7085d1a94298 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -518,6 +518,14 @@ u32 nf_conntrack_count(const struct net *net)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_count);
 
+u32 nf_conntrack_max(const struct net *net)
+{
+	const struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+	return cnet->max;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
+
 /* Sysctl support */
 
 #ifdef CONFIG_SYSCTL
@@ -545,8 +553,6 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
-static struct ctl_table_header *nf_ct_netfilter_header;
-
 enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_MAX,
 	NF_SYSCTL_CT_COUNT,
@@ -621,7 +627,6 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -961,10 +966,16 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 	{}
 };
 
-static struct ctl_table nf_ct_netfilter_table[] = {
-	{
+enum nf_ct_parent_sysctl_index {
+	NF_PARENT_SYSCTL_CT_MAX,
+	__NF_PARENT_SYSCTL_CT_LAST_SYSCTL,
+};
+
+#define NF_PARENT_SYSCTL_CT_LAST_SYSCTL (__NF_PARENT_SYSCTL_CT_LAST_SYSCTL + 1)
+
+static struct ctl_table nf_ct_parent_sysctl_table[] = {
+	[NF_PARENT_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -1068,7 +1079,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
 	struct nf_udp_net *un = nf_udp_pernet(net);
-	struct ctl_table *table;
+	struct ctl_table *table, *parent_table;
 
 	BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL);
 
@@ -1077,6 +1088,8 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 	if (!table)
 		return -ENOMEM;
 
+	table[NF_SYSCTL_CT_MAX].data = &cnet->max;
+
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
@@ -1105,17 +1118,35 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
 
 	cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table);
 	if (!cnet->sysctl_header)
-		goto out_unregister_netfilter;
+		goto out_free;
+
+	BUILD_BUG_ON(ARRAY_SIZE(nf_ct_parent_sysctl_table) !=
+			NF_PARENT_SYSCTL_CT_LAST_SYSCTL);
+
+	parent_table = kmemdup(nf_ct_parent_sysctl_table,
+			sizeof(nf_ct_parent_sysctl_table), GFP_KERNEL);
+	if (!parent_table)
+		goto out_unregister;
+
+	parent_table[NF_PARENT_SYSCTL_CT_MAX].data = &cnet->max;
+
+	cnet->parent_sysctl_header = register_net_sysctl(net, "net",
+			parent_table);
+	if (!cnet->parent_sysctl_header)
+		goto out_free_parent;
 
 	return 0;
 
-out_unregister_netfilter:
+out_free_parent:
+	kfree(parent_table);
+out_unregister:
+	unregister_net_sysctl_table(cnet->sysctl_header);
+out_free:
 	kfree(table);
 	return -ENOMEM;
 }
@@ -1123,7 +1154,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	struct ctl_table *table;
+	struct ctl_table *table, *parent_table;
+
+	parent_table = cnet->parent_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(cnet->parent_sysctl_header);
+	kfree(parent_table);
 
 	table = cnet->sysctl_header->ctl_table_arg;
 	unregister_net_sysctl_table(cnet->sysctl_header);
@@ -1210,14 +1245,6 @@ static int __init nf_conntrack_standalone_init(void)
 	BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
 
 #ifdef CONFIG_SYSCTL
-	nf_ct_netfilter_header =
-		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
-	if (!nf_ct_netfilter_header) {
-		pr_err("nf_conntrack: can't register to sysctl.\n");
-		ret = -ENOMEM;
-		goto out_sysctl;
-	}
-
 	nf_conntrack_htable_size_user = nf_conntrack_htable_size;
 #endif
 
@@ -1229,10 +1256,6 @@ static int __init nf_conntrack_standalone_init(void)
 	return 0;
 
 out_pernet:
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(nf_ct_netfilter_header);
-out_sysctl:
-#endif
 	nf_conntrack_cleanup_end();
 out_start:
 	return ret;
@@ -1242,9 +1265,6 @@ static void __exit nf_conntrack_standalone_fini(void)
 {
 	nf_conntrack_cleanup_start();
 	unregister_pernet_subsys(&nf_conntrack_net_ops);
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(nf_ct_netfilter_header);
-#endif
 	nf_conntrack_cleanup_end();
 }
 
-- 
2.30.2


