[Devel] [PATCH 13/14] net: Track netfilter modules per net-namespace-v3

Mon Jun 8 07:22:41 PDT 2015

Porting patch diff-ve-net-track-netfilter-modules-per-net-namespace from
2.6.32:

cpt related hunks are separated in
diff-cpt-track-netfilter-modules-per-net-namespace-v3

Previously, in rhel6, we did not allow nested net namespaces to be created,
so all netfilter module tracking was done via the VE structure

	struct ve_struct {
		...
		__u64			ipt_mask;
		__u64			_iptables_modules;
		...
	}

Here @ipt_mask stands for the feature bits which are set up from the VE
configuration and represent the netfilter modules which are allowed
to be used inside the VE (there might be a situation where modules are
loaded on the node but forbidden inside the VE).

When some particular module is allowed inside the VE and gets loaded, we
set the appropriate bit in @_iptables_modules and clear it again
upon module unload: the net_ipt_module_set() helper does that, usually
in the module's net-init() call. At the same time this bit signals that
module-specific resources allocated for VE use should be freed once
the module is unloaded, i.e. when net-exit() is called.

For example

	| static int __net_init iptable_mangle_net_init(struct net *net)
	| {
	| 	if (!net_ipt_permitted(net, VE_IP_MANGLE))
	| 		return 0;
	|
	| 	/* Register table */
	| 	net->ipv4.iptable_mangle =
	| 		ipt_register_table(net, &packet_mangler, &initial_table.repl);
	| 	if (IS_ERR(net->ipv4.iptable_mangle))
	| 		return PTR_ERR(net->ipv4.iptable_mangle);
	|
	| 	net_ipt_module_set(net, VE_IP_MANGLE);
	| 	return 0;
	| }
	|
	| static void __net_exit iptable_mangle_net_exit(struct net *net)
	| {
	| 	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
	| 		return;
	| }
	|
	| static void __net_exit iptable_mangle_net_exit(struct net *net)
	| {
	| 	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
	| 		return;
	|
	| 	ipt_unregister_table(net->ipv4.iptable_mangle);
	|
	| 	net_ipt_module_clear(net, VE_IP_MANGLE);
	| }
	|
	| static struct pernet_operations iptable_mangle_net_ops = {
	| 	.init = iptable_mangle_net_init,
	| 	.exit = iptable_mangle_net_exit,
	| };


here we allocate @net->ipv4.iptable_mangle, which should be cleaned up
on exit; moreover, the net-namespace engine is designed that way: if
something fails in the init() call then exit() is called and must
clear all data that has been allocated.

This worked well until we allowed nested net-namespaces. Every
nested net-namespace shares the same @owner_ve (i.e. the VE which is the
keeper of it). Thus once a nested namespace calls net_ipt_module_clear(),
the shared @owner_ve gets its @_iptables_modules bit cleared and finally
the primary net namespace no longer sees this bit set in @_iptables_modules.

Let's move @_iptables_modules to struct net instead. The toplevel VE's
net is referred to via the @ve_netns member, which exists for the whole
lifetime of the container. Also we have to move it into @init_net so that
the node's net is tracked the same way automatically.

https://jira.sw.ru/browse/PSBM-31451

Signed-off-by: Cyrill Gorcunov <gorcunov at parallels.com>
Acked-by: Vladimir Davydov <vdavydov at parallels.com>
CC: Andrey Vagin <avagin at parallels.com>
CC: Kirill Tkhai <ktkhai at parallels.com>
Signed-off-by: Kirill Tkhai <ktkhai at odin.com>
---
 include/linux/netfilter.h   |    6 +++---
 include/linux/ve.h          |    1 -
 include/net/net_namespace.h |    3 +++
 kernel/ve/ve.c              |    1 -
 net/core/net_namespace.c    |    3 +++
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 03e6588..fba14ed 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -353,16 +353,16 @@ static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 
 #define net_ipt_module_set(netns, ipt)					\
 	({								\
-		(netns)->owner_ve->_iptables_modules |= ipt##_MOD;	\
+		(netns)->_iptables_modules |= ipt##_MOD;	\
 	})
 
 #define net_ipt_module_clear(netns, ipt)				\
 	({								\
-		(netns)->owner_ve->_iptables_modules &= ~ipt##_MOD;	\
+		(netns)->_iptables_modules &= ~ipt##_MOD;	\
 	})
 
 #define net_is_ipt_module_set(netns, ipt)				\
-	((netns)->owner_ve->_iptables_modules & (ipt##_MOD))
+	((netns)->_iptables_modules & (ipt##_MOD))
 
 #else /* CONFIG_VE_IPTABLES */
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 758ff85..0e32d58 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -102,7 +102,6 @@ struct ve_struct {
 #ifdef CONFIG_VE_IPTABLES
 /* core/netfilter.c virtualization */
 	__u64			ipt_mask;
-	__u64			_iptables_modules;
 #endif /* CONFIG_VE_IPTABLES */
 #endif
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bcc35c3..e66a928 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -78,6 +78,9 @@ struct net {
 
 #ifdef CONFIG_VE
 	struct ve_struct	*owner_ve;
+#ifdef CONFIG_VE_IPTABLES
+	__u64			_iptables_modules;
+#endif
 #endif
 
 	/* core fib_rules */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 55b7d86..02d69c0 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -71,7 +71,6 @@ struct ve_struct ve0 = {
 	.is_running		= 1,
 #ifdef CONFIG_VE_IPTABLES
 	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
-	._iptables_modules	= VE_IP_NONE,	/* but nothing yet loaded */
 #endif
 	.features		= -1,
 	.fsync_enable		= FSYNC_FILTERED,
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7a7a989..b9b9807 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -33,6 +33,9 @@ EXPORT_SYMBOL_GPL(net_namespace_list);
 
 struct net init_net = {
 	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+#if defined(CONFIG_VE) && defined(CONFIG_VE_IPTABLES)
+	._iptables_modules = VE_IP_NONE,
+#endif
 };
 EXPORT_SYMBOL(init_net);