[Devel] [PATCH rh7 v2] net/netfilter: make nft NAT working in different netns simultaneously

Tue Apr 28 12:55:27 MSK 2020

Brief info:
===========

* at the moment NAT chains are linked into a single list - even if they are for
  different netns
* only first NAT chain can be processed on every conntrack setup.
  Even if the chain handling function returns error, the conntrack is
  considered as "configured" (from NAT's poinr of view) and next NAT chains are
  not processed.

=> nft NAT can work only in the netns where it was configured first:
because the NAT chain for that netns appears to be first in the list.

Let's don't call chain handling at all in case it's related to a different
netns.

Detailed info:
==============

Imagine we configured nf dnat for VE first and for host - last,
so the hooks are stored in this particular order.

Now we try to use dnat for host, send a packet which we expect
go through our host dnat rule and change, say, dst port number.

1 ip_rcv
2  nf_hook_slow
3   nf_iterate
4    nft_nat_ipv4_in
5     nf_nat_ipv4_in
6      nf_nat_ipv4_fn
7       nf_nat_packet
8        if (ct->status & statusbit)
9         l3proto->manip_pkt() == nf_nat_ipv4_manip_pkt()
10         iph->daddr = target->dst.u3.ip;

This is a normal path when dnat rule is applied to a packet.
We never get to line 10 because we never get though condition
on line 8: ct->status never has IPS_DST_NAT bit.

This bit IPS_DST_NAT should have been set up earlier by the stack:
("good" call stack, how it's should be)

1 ip_rcv
2  nf_hook_slow
3   nf_iterate
4    nft_nat_ipv4_in
5     nf_nat_ipv4_in
6      nf_nat_ipv4_fn
7        case IP_CT_NEW:
8         if (!nf_nat_initialized()) {
9          do_chain() == nft_nat_do_chain()
10          nft_do_chain
11           nft_nat_eval // sets range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED
12            nf_nat_setup_info
13             get_unique_tuple()
14             {
15              if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
16                            // goes further
17              } else if (!nf_nat_used_tuple(tuple, ct)) {
18                            goto out; // no unique tuple allocated
19              }
20
21              l4proto->unique_tuple() // really allocates unique tuple
22             }
23
24            // nf_nat_setup_info() func continues
25            if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
26                   // so we get here only in case get_unique_tuple()
27                   // allocates really unique tuple
28
29                   if (maniptype == NF_NAT_MANIP_SRC)
30                           ct->status |= IPS_SRC_NAT;
31                   else
32                           ct->status |= IPS_DST_NAT;
33            }

But in our "bad" case IPS_DST_NAT is not set because if we handle a packet for
init netns, but our first chain at line 9 is for CT netns,
nft_do_chain() exists immediately with NF_ACCEPT (and does nothing, in
particular does not set IPS_DST_NAT flag to conntrack's status),
nf_nat_ipv4_fn() considers NF_ACCEPT as "ok" and continues execution:

nf_nat_ipv4_fn()
                if (!nf_nat_initialized(ct, maniptype)) {
                        unsigned int ret;

                        ret = do_chain(ops, skb, state, ct);
                        if (ret != NF_ACCEPT)
                                return ret;

                        if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
                                break;

                        ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
                        if (ret != NF_ACCEPT)
                                return ret;

==================
nf_nat_alloc_null_binding
 __nf_nat_alloc_null_binding
  nf_nat_setup_info

As i miss nft_nat_eval() on this callstack, we don't have
NF_NAT_RANGE_PROTO_SPECIFIED flag on range->flags (line 11 above),
thus we don't create unique tuple in get_unique_tuple() (line 18),
thus don't set ct->status |= IPS_DST_NAT in nf_nat_setup_info() (line 32),

but successfully set ct->status |= IPS_SRC_NAT_DONE at the end of
nf_nat_setup_info().

When we are called for the same conntrack (ct) with another ops (and chain with
proper netns), in nf_nat_ipv4_fn() line 8 condition will be false,
as nf_nat_initialized() checks exactly (ct->status & IPS_SRC_NAT_DONE),
and thus the miss conntrack configuration for proper dnat rule.

So the root cause is that nf_nat_ipv4_fn() calls do_chain() for chain with
unmatching netns causing do_chain() to exit with NF_ACCEPT but with no
configuration done, and later nf_nat_ipv4_fn() does not distinguish such
NF_ACCEPT from NF_ACCEPT when ct configuration has been really completed and
marks ct as "configured".

So to fix it we need to either
  1) make do_chain() to return an error if called with wrong netns and handle
     the error in nf_nat_ipv4_fn()
or
  2) just don't call do_chain() with wrong netns.

Way 1) is complex because nft_nat_do_chain() is called in many places,
and need to make sure every place handles new error properly.
So let's go way 2).

https://jira.sw.ru/browse/PSBM-102728

Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>

v2: drop redundant variable "basechain".
---
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 11 +++++++++++
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 3b8b048ffc6cb..fa880d189bed3 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_tables.h>
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
 
@@ -251,6 +252,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	/* maniptype == SRC for postrouting. */
 	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
 
+	const struct nft_chain *chain = ops->priv;
+	const struct net *chain_net =
+		read_pnet(&nft_base_chain(chain)->pnet);
+	const struct net *net;
+
 	/* We never see fragments: conntrack defrags on pre-routing
 	 * and local-out, and nf_nat_out protects post-routing.
 	 */
@@ -265,6 +271,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	if (!ct)
 		return NF_ACCEPT;
 
+	/* Ignore chains that are not for the current network namespace */
+	net = nf_ct_net(ct);
+	if (!net_eq(net, chain_net))
+		return NF_ACCEPT;
+
 	/* Don't try to NAT if this packet is not conntracked */
 	if (nf_ct_is_untracked(ct))
 		return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 540dc0fdaf102..f369123f89246 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -24,6 +24,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_tables.h>
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv6;
 
@@ -264,6 +265,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	int hdrlen;
 	u8 nexthdr;
 
+	const struct nft_chain *chain = ops->priv;
+	const struct net *chain_net =
+		read_pnet(&nft_base_chain(chain)->pnet);
+	const struct net *net;
+
 	ct = nf_ct_get(skb, &ctinfo);
 	/* Can't track?  It's not due to stress, or conntrack would
 	 * have dropped it.  Hence it's the user's responsibilty to
@@ -273,6 +279,11 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	if (!ct)
 		return NF_ACCEPT;
 
+	/* Ignore chains that are not for the current network namespace */
+	net = nf_ct_net(ct);
+	if (!net_eq(net, chain_net))
+		return NF_ACCEPT;
+
 	/* Don't try to NAT if this packet is not conntracked */
 	if (nf_ct_is_untracked(ct))
 		return NF_ACCEPT;
-- 
2.15.1