[Devel] [PATCH RHEL7 COMMIT] net/netfilter: make nft NAT working in different netns simultaneously

Konstantin Khorenko khorenko at virtuozzo.com
Tue May 12 15:04:02 MSK 2020


The commit is pushed to "branch-rh7-3.10.0-1127.vz7.150.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.vz7.150.9
------>
commit 061604d5c765bb28e740d1eb3002768eb7d2bcdc
Author: Konstantin Khorenko <khorenko at virtuozzo.com>
Date:   Tue May 12 15:04:01 2020 +0300

    net/netfilter: make nft NAT working in different netns simultaneously
    
    Brief info:
    ===========
    
    * at the moment NAT chains are linked into a single list - even if they are for
      different netns
    * only first NAT chain can be processed on every conntrack setup.
      Even if the chain handling function returns error, the conntrack is
      considered as "configured" (from NAT's poinr of view) and next NAT chains are
      not processed.
    
    => nft NAT can work only in the netns where it was configured first:
    because the NAT chain for that netns appears to be first in the list.
    
    Let's don't call chain handling at all in case it's related to a different
    netns.
    
    Detailed info:
    ==============
    
    Imagine we configured nf dnat for VE first and for host - last,
    so the hooks are stored in this particular order.
    
    Now we try to use dnat for host, send a packet which we expect
    go through our host dnat rule and change, say, dst port number.
    
    1 ip_rcv
    2  nf_hook_slow
    3   nf_iterate
    4    nft_nat_ipv4_in
    5     nf_nat_ipv4_in
    6      nf_nat_ipv4_fn
    7       nf_nat_packet
    8        if (ct->status & statusbit)
    9         l3proto->manip_pkt() == nf_nat_ipv4_manip_pkt()
    10         iph->daddr = target->dst.u3.ip;
    
    This is a normal path when dnat rule is applied to a packet.
    We never get to line 10 because we never get though condition
    on line 8: ct->status never has IPS_DST_NAT bit.
    
    This bit IPS_DST_NAT should have been set up earlier by the stack:
    ("good" call stack, how it's should be)
    
    1 ip_rcv
    2  nf_hook_slow
    3   nf_iterate
    4    nft_nat_ipv4_in
    5     nf_nat_ipv4_in
    6      nf_nat_ipv4_fn
    7        case IP_CT_NEW:
    8         if (!nf_nat_initialized()) {
    9          do_chain() == nft_nat_do_chain()
    10          nft_do_chain
    11           nft_nat_eval // sets range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED
    12            nf_nat_setup_info
    13             get_unique_tuple()
    14             {
    15              if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
    16                            // goes further
    17              } else if (!nf_nat_used_tuple(tuple, ct)) {
    18                            goto out; // no unique tuple allocated
    19              }
    20
    21              l4proto->unique_tuple() // really allocates unique tuple
    22             }
    23
    24            // nf_nat_setup_info() func continues
    25            if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
    26                   // so we get here only in case get_unique_tuple()
    27                   // allocates really unique tuple
    28
    29                   if (maniptype == NF_NAT_MANIP_SRC)
    30                           ct->status |= IPS_SRC_NAT;
    31                   else
    32                           ct->status |= IPS_DST_NAT;
    33            }
    
    But in our "bad" case IPS_DST_NAT is not set because if we handle a packet for
    init netns, but our first chain at line 9 is for CT netns,
    nft_do_chain() exists immediately with NF_ACCEPT (and does nothing, in
    particular does not set IPS_DST_NAT flag to conntrack's status),
    nf_nat_ipv4_fn() considers NF_ACCEPT as "ok" and continues execution:
    
    nf_nat_ipv4_fn()
                    if (!nf_nat_initialized(ct, maniptype)) {
                            unsigned int ret;
    
                            ret = do_chain(ops, skb, state, ct);
                            if (ret != NF_ACCEPT)
                                    return ret;
    
                            if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
                                    break;
    
                            ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
                            if (ret != NF_ACCEPT)
                                    return ret;
    
    ==================
    nf_nat_alloc_null_binding
     __nf_nat_alloc_null_binding
      nf_nat_setup_info
    
    As i miss nft_nat_eval() on this callstack, we don't have
    NF_NAT_RANGE_PROTO_SPECIFIED flag on range->flags (line 11 above),
    thus we don't create unique tuple in get_unique_tuple() (line 18),
    thus don't set ct->status |= IPS_DST_NAT in nf_nat_setup_info() (line 32),
    
    but successfully set ct->status |= IPS_SRC_NAT_DONE at the end of
    nf_nat_setup_info().
    
    When we are called for the same conntrack (ct) with another ops (and chain with
    proper netns), in nf_nat_ipv4_fn() line 8 condition will be false,
    as nf_nat_initialized() checks exactly (ct->status & IPS_SRC_NAT_DONE),
    and thus the miss conntrack configuration for proper dnat rule.
    
    So the root cause is that nf_nat_ipv4_fn() calls do_chain() for chain with
    unmatching netns causing do_chain() to exit with NF_ACCEPT but with no
    configuration done, and later nf_nat_ipv4_fn() does not distinguish such
    NF_ACCEPT from NF_ACCEPT when ct configuration has been really completed and
    marks ct as "configured".
    
    So to fix it we need to either
      1) make do_chain() to return an error if called with wrong netns and handle
         the error in nf_nat_ipv4_fn()
    or
      2) just don't call do_chain() with wrong netns.
    
    Way 1) is complex because nft_nat_do_chain() is called in many places,
    and need to make sure every place handles new error properly.
    So let's go way 2).
    
    Note: we cannot just check "do_chain" argument in nf_nat_ipv{4,6}_fn()
    because in that case we have to export nft_nat_do_chain() functions and
    this introduces a cycle in symbols' dependence.
    
    So we detect nft chain by ops->is_nft_ops flag.
    
    https://jira.sw.ru/browse/PSBM-102728
    https://jira.sw.ru/browse/PSBM-103718
    https://jira.sw.ru/browse/PSBM-103746
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    v2: drop redundant variable "basechain".
    v3: introduce new return code for nft_do_chain()
    v4: drop new return code for nft_do_chain()
        (too many places to check it later).
        Check proper netns in nf_nat_ipv{4,6}_fn() in case
        nft_nat_do_chain() is provided as a do_chain() argument.
    v5: introduce callbacks for netns check. The callback for nft does its
        job, for iptables - the callback is dummy.
    v6: rework callback: make it bool for nft, drop dummy callbacks for
        iptables.
    v7: it's not wise to define a symbol in module A, and use it in modules
        B and C only, so let's we have 2 different static callbacks for ipv4
        and ipv6 (this also does not introduce new dependence between
        modules B and C) and code of real nets comparison be a "define" in a
        common header file.
    v8: drop the callback, check netns for nft chains only,
        use the newly introduced flag to detect nft chains.
---
 include/net/netfilter/nf_nat.h           | 20 ++++++++++++++++++++
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c |  5 +++++
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c |  5 +++++
 3 files changed, 30 insertions(+)

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index a71dd333ac686..c9ca6ed53bcea 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -78,4 +78,24 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum,
 #endif
 }
 
+/*
+ * Check if nft chain's netns fits conntrack netns.
+ * Uses ops->is_nft_ops flag to detect nft ops.
+ * If ops is not nft-related, the check is considered passed.
+ */
+#define is_valid_netns(ops, ct) ({					\
+	const struct nft_chain *__chain;				\
+	const struct net *__chain_net;					\
+	const struct net *__net;					\
+	bool __ret;							\
+									\
+	if (ops->is_nft_ops) {						\
+		__chain = ops->priv;					\
+		__chain_net = read_pnet(&nft_base_chain(__chain)->pnet);\
+		__net = nf_ct_net(ct);					\
+		__ret = net_eq(__net, __chain_net);			\
+	} else								\
+		__ret = true;						\
+	__ret;								\
+})
 #endif
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 3b8b048ffc6cb..3a261e2c1156f 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_tables.h>
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
 
@@ -291,6 +292,10 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
+			/* Ignore nft chains with wrong netns. */
+			if (!is_valid_netns(ops, ct))
+				return NF_ACCEPT;
+
 			ret = do_chain(ops, skb, state, ct);
 			if (ret != NF_ACCEPT)
 				return ret;
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 540dc0fdaf102..fdedb609a368a 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -24,6 +24,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_tables.h>
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv6;
 
@@ -304,6 +305,10 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
+			/* Ignore nft chains with wrong netns. */
+			if (!is_valid_netns(ops, ct))
+				return NF_ACCEPT;
+
 			ret = do_chain(ops, skb, state, ct);
 			if (ret != NF_ACCEPT)
 				return ret;


More information about the Devel mailing list