[Devel] [PATCH RHEL7 COMMIT] ve/net: Track netfilter modules per net-namespace-v3

Konstantin Khorenko khorenko at virtuozzo.com
Wed Jun 10 08:15:01 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.12
------>
commit c158fe3fdf4a6961ba5a010fb03c4c365ca3e9bd
Author: Kirill Tkhai <ktkhai at odin.com>
Date:   Wed Jun 10 19:15:01 2015 +0400

    ve/net: Track netfilter modules per net-namespace-v3
    
    Porting patch diff-ve-net-track-netfilter-modules-per-net-namespace from
    2.6.32:
    
    cpt related hunks are separated in
    diff-cpt-track-netfilter-modules-per-net-namespace-v3
    
    Previously, in rhel6, we did not allow creating nested net namespaces,
    so all netfilter module tracking was done via the VE structure
    
    	struct ve_struct {
    		...
    		__u64			ipt_mask;
    		__u64			_iptables_modules;
    		...
    	}
    
    Here @ipt_mask stands for the feature bits which are set up from the VE
    configuration and represent the netfilter modules which are allowed
    to be used inside the VE (there might be a situation when modules are
    loaded on the node but forbidden inside the VE).
    
    When some particular module is allowed inside the VE and gets loaded, we
    set the appropriate bit in @_iptables_modules and clear it again
    upon module unload: the net_ipt_module_set() helper does the setting,
    usually in the module's net-init() call. At the same time, this bit signals
    that the module-specific resources allocated for VE use should be freed
    once the module is unloaded, i.e. when net-exit() is called.
    
    For example
    
    	| static int __net_init iptable_mangle_net_init(struct net *net)
    	| {
    	| 	if (!net_ipt_permitted(net, VE_IP_MANGLE))
    	| 		return 0;
    	|
    	| 	/* Register table */
    	| 	net->ipv4.iptable_mangle =
    	| 		ipt_register_table(net, &packet_mangler, &initial_table.repl);
    	| 	if (IS_ERR(net->ipv4.iptable_mangle))
    	| 		return PTR_ERR(net->ipv4.iptable_mangle);
    	|
    	| 	net_ipt_module_set(net, VE_IP_MANGLE);
    	| 	return 0;
    	| }
    	|
    	| static void __net_exit iptable_mangle_net_exit(struct net *net)
    	| {
    	| 	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
    	| 		return;
    	|
    	| 	ipt_unregister_table(net->ipv4.iptable_mangle);
    	|
    	| 	net_ipt_module_clear(net, VE_IP_MANGLE);
    	| }
    	|
    	| static struct pernet_operations iptable_mangle_net_ops = {
    	| 	.init = iptable_mangle_net_init,
    	| 	.exit = iptable_mangle_net_exit,
    	| };
    
    Here we allocate @net->ipv4.iptable_mangle, which should be cleaned up
    on exit. Moreover, the net-namespace engine is designed that way: if
    something fails in the init() call, then exit() is called and must
    free all data that has been allocated.
    
    This worked well until we allowed nested net-namespaces. Every
    nested net-namespace shares the same @owner_ve (i.e. the VE which is the
    keeper of it). Thus, once a nested namespace calls net_ipt_module_clear(),
    the shared @owner_ve gets its @_iptables_modules bit cleared, and finally
    the primary net namespace no longer sees this bit set in @_iptables_modules.
    
    Let's move @_iptables_modules to struct net instead. The toplevel VE's
    net is referred to via the @ve_netns member, which exists for the whole
    container lifetime. Also, we have to add it to @init_net, so that the
    node's net is tracked the same way automatically.
    
    https://jira.sw.ru/browse/PSBM-31451
    
    Signed-off-by: Cyrill Gorcunov <gorcunov at parallels.com>
    
    Acked-by: Vladimir Davydov <vdavydov at parallels.com>
    CC: Andrey Vagin <avagin at parallels.com>
    CC: Kirill Tkhai <ktkhai at parallels.com>
    Signed-off-by: Kirill Tkhai <ktkhai at odin.com>
---
 include/linux/netfilter.h   | 6 +++---
 include/linux/ve.h          | 1 -
 include/net/net_namespace.h | 3 +++
 kernel/ve/ve.c              | 1 -
 net/core/net_namespace.c    | 3 +++
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 03e6588..fba14ed 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -353,16 +353,16 @@ static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 
 #define net_ipt_module_set(netns, ipt)					\
 	({								\
-		(netns)->owner_ve->_iptables_modules |= ipt##_MOD;	\
+		(netns)->_iptables_modules |= ipt##_MOD;	\
 	})
 
 #define net_ipt_module_clear(netns, ipt)				\
 	({								\
-		(netns)->owner_ve->_iptables_modules &= ~ipt##_MOD;	\
+		(netns)->_iptables_modules &= ~ipt##_MOD;	\
 	})
 
 #define net_is_ipt_module_set(netns, ipt)				\
-	((netns)->owner_ve->_iptables_modules & (ipt##_MOD))
+	((netns)->_iptables_modules & (ipt##_MOD))
 
 #else /* CONFIG_VE_IPTABLES */
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index e48a1a3..e3fc636 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -102,7 +102,6 @@ struct ve_struct {
 #ifdef CONFIG_VE_IPTABLES
 /* core/netfilter.c virtualization */
 	__u64			ipt_mask;
-	__u64			_iptables_modules;
 #endif /* CONFIG_VE_IPTABLES */
 #endif
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bcc35c3..e66a928 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -78,6 +78,9 @@ struct net {
 
 #ifdef CONFIG_VE
 	struct ve_struct	*owner_ve;
+#ifdef CONFIG_VE_IPTABLES
+	__u64			_iptables_modules;
+#endif
 #endif
 
 	/* core fib_rules */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e699784..0ffd8d8 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -71,7 +71,6 @@ struct ve_struct ve0 = {
 	.is_running		= 1,
 #ifdef CONFIG_VE_IPTABLES
 	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
-	._iptables_modules	= VE_IP_NONE,	/* but nothing yet loaded */
 #endif
 	.features		= -1,
 	.fsync_enable		= FSYNC_FILTERED,
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0ad9aa1..3f7da6a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -35,6 +35,9 @@ struct net init_net = {
 	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
 #ifdef CONFIG_VE
 	.owner_ve = &ve0,
+#ifdef CONFIG_VE_IPTABLES
+	._iptables_modules = VE_IP_NONE,
+#endif
 #endif
 };
 EXPORT_SYMBOL(init_net);



More information about the Devel mailing list