[Devel] [PATCH RHEL7 COMMIT] net/netfilter: make nft NAT working in different netns simultaneously

Thu Apr 30 18:16:53 MSK 2020

The commit is pushed to "branch-rh7-3.10.0-1127.vz7.150.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.vz7.150.5
------>
commit 5ff26bf49074aeaca8a2577290dfec51546954eb
Author: Konstantin Khorenko <khorenko at virtuozzo.com>
Date:   Mon Apr 27 17:43:58 2020 +0300

    net/netfilter: make nft NAT working in different netns simultaneously
    
    Brief info:
    ===========
    
    * at the moment NAT chains are linked into a single list - even if they are for
      different netns
    * only first NAT chain can be processed on every conntrack setup.
      Even if the chain handling function returns error, the conntrack is
      considered as "configured" (from NAT's poinr of view) and next NAT chains are
      not processed.
    
    => nft NAT can work only in the netns where it was configured first:
    because the NAT chain for that netns appears to be first in the list.
    
    Let's don't call chain handling at all in case it's related to a different
    netns.
    
    Detailed info:
    ==============
    
    Imagine we configured nf dnat for VE first and for host - last,
    so the hooks are stored in this particular order.
    
    Now we try to use dnat for host, send a packet which we expect
    go through our host dnat rule and change, say, dst port number.
    
    1 ip_rcv
    2  nf_hook_slow
    3   nf_iterate
    4    nft_nat_ipv4_in
    5     nf_nat_ipv4_in
    6      nf_nat_ipv4_fn
    7       nf_nat_packet
    8        if (ct->status & statusbit)
    9         l3proto->manip_pkt() == nf_nat_ipv4_manip_pkt()
    10         iph->daddr = target->dst.u3.ip;
    
    This is a normal path when dnat rule is applied to a packet.
    We never get to line 10 because we never get though condition
    on line 8: ct->status never has IPS_DST_NAT bit.
    
    This bit IPS_DST_NAT should have been set up earlier by the stack:
    ("good" call stack, how it's should be)
    
    1 ip_rcv
    2  nf_hook_slow
    3   nf_iterate
    4    nft_nat_ipv4_in
    5     nf_nat_ipv4_in
    6      nf_nat_ipv4_fn
    7        case IP_CT_NEW:
    8         if (!nf_nat_initialized()) {
    9          do_chain() == nft_nat_do_chain()
    10          nft_do_chain
    11           nft_nat_eval // sets range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED
    12            nf_nat_setup_info
    13             get_unique_tuple()
    14             {
    15              if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
    16                            // goes further
    17              } else if (!nf_nat_used_tuple(tuple, ct)) {
    18                            goto out; // no unique tuple allocated
    19              }
    20
    21              l4proto->unique_tuple() // really allocates unique tuple
    22             }
    23
    24            // nf_nat_setup_info() func continues
    25            if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
    26                   // so we get here only in case get_unique_tuple()
    27                   // allocates really unique tuple
    28
    29                   if (maniptype == NF_NAT_MANIP_SRC)
    30                           ct->status |= IPS_SRC_NAT;
    31                   else
    32                           ct->status |= IPS_DST_NAT;
    33            }
    
    But in our "bad" case IPS_DST_NAT is not set because if we handle a packet for
    init netns, but our first chain at line 9 is for CT netns,
    nft_do_chain() exists immediately with NF_ACCEPT (and does nothing, in
    particular does not set IPS_DST_NAT flag to conntrack's status),
    nf_nat_ipv4_fn() considers NF_ACCEPT as "ok" and continues execution:
    
    nf_nat_ipv4_fn()
                    if (!nf_nat_initialized(ct, maniptype)) {
                            unsigned int ret;
    
                            ret = do_chain(ops, skb, state, ct);
                            if (ret != NF_ACCEPT)
                                    return ret;
    
                            if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
                                    break;
    
                            ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
                            if (ret != NF_ACCEPT)
                                    return ret;
    
    ==================
    nf_nat_alloc_null_binding
     __nf_nat_alloc_null_binding
      nf_nat_setup_info
    
    As i miss nft_nat_eval() on this callstack, we don't have
    NF_NAT_RANGE_PROTO_SPECIFIED flag on range->flags (line 11 above),
    thus we don't create unique tuple in get_unique_tuple() (line 18),
    thus don't set ct->status |= IPS_DST_NAT in nf_nat_setup_info() (line 32),
    
    but successfully set ct->status |= IPS_SRC_NAT_DONE at the end of
    nf_nat_setup_info().
    
    When we are called for the same conntrack (ct) with another ops (and chain with
    proper netns), in nf_nat_ipv4_fn() line 8 condition will be false,
    as nf_nat_initialized() checks exactly (ct->status & IPS_SRC_NAT_DONE),
    and thus the miss conntrack configuration for proper dnat rule.
    
    So the root cause is that nf_nat_ipv4_fn() calls do_chain() for chain with
    unmatching netns causing do_chain() to exit with NF_ACCEPT but with no
    configuration done, and later nf_nat_ipv4_fn() does not distinguish such
    NF_ACCEPT from NF_ACCEPT when ct configuration has been really completed and
    marks ct as "configured".
    
    So to fix it we need to either
      1) make do_chain() to return an error if called with wrong netns and handle
         the error in nf_nat_ipv4_fn()
    or
      2) just don't call do_chain() with wrong netns.
    
    Way 1) is complex because nft_nat_do_chain() is called in many places,
    and need to make sure every place handles new error properly.
    So let's go way 2).
    
    https://jira.sw.ru/browse/PSBM-102728
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    v2: drop redundant variable "basechain".
---
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 11 +++++++++++
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 3b8b048ffc6cb..fa880d189bed3 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_tables.h>
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
 
@@ -251,6 +252,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	/* maniptype == SRC for postrouting. */
 	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
 
+	const struct nft_chain *chain = ops->priv;
+	const struct net *chain_net =
+		read_pnet(&nft_base_chain(chain)->pnet);
+	const struct net *net;
+
 	/* We never see fragments: conntrack defrags on pre-routing
 	 * and local-out, and nf_nat_out protects post-routing.
 	 */
@@ -265,6 +271,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	if (!ct)
 		return NF_ACCEPT;
 
+	/* Ignore chains that are not for the current network namespace */
+	net = nf_ct_net(ct);
+	if (!net_eq(net, chain_net))
+		return NF_ACCEPT;
+
 	/* Don't try to NAT if this packet is not conntracked */
 	if (nf_ct_is_untracked(ct))
 		return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 540dc0fdaf102..f369123f89246 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -24,6 +24,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_tables.h>
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv6;
 
@@ -264,6 +265,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	int hdrlen;
 	u8 nexthdr;
 
+	const struct nft_chain *chain = ops->priv;
+	const struct net *chain_net =
+		read_pnet(&nft_base_chain(chain)->pnet);
+	const struct net *net;
+
 	ct = nf_ct_get(skb, &ctinfo);
 	/* Can't track?  It's not due to stress, or conntrack would
 	 * have dropped it.  Hence it's the user's responsibilty to
@@ -273,6 +279,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	if (!ct)
 		return NF_ACCEPT;
 
+	/* Ignore chains that are not for the current network namespace */
+	net = nf_ct_net(ct);
+	if (!net_eq(net, chain_net))
+		return NF_ACCEPT;
+
 	/* Don't try to NAT if this packet is not conntracked */
 	if (nf_ct_is_untracked(ct))
 		return NF_ACCEPT;