[Devel] [PATCH RHEL7 COMMIT] ms/ipv6: Implement automatic flow label generation on transmit

Vasily Averin vvs at virtuozzo.com
Tue Oct 26 14:26:00 MSK 2021


The commit is pushed to "branch-rh7-3.10.0-1160.42.2.vz7.184.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.42.2.vz7.184.2
------>
commit 684ee9efef5ea1c46a6cbd4694bf6f3311e4aab3
Author: Tom Herbert <therbert at google.com>
Date:   Tue Oct 26 14:25:58 2021 +0300

    ms/ipv6: Implement automatic flow label generation on transmit
    
    Automatically generate flow labels for IPv6 packets on transmit.
    The flow label is computed based on skb_get_hash. The flow label will
    only automatically be set when it is zero otherwise (i.e. flow label
    manager hasn't set one). This supports the transmit side functionality
    of RFC 6438.
    
    Added an IPv6 sysctl auto_flowlabels to enable/disable this behavior
    system wide, and added IPV6_AUTOFLOWLABEL socket option to enable this
    functionality per socket.
    
    By default, auto flowlabels are disabled to avoid possible conflicts
    with flow label manager, however if this feature proves useful we
    may want to enable it by default.
    
    It should also be noted that FreeBSD has already implemented automatic
    flow labels (including the sysctl and socket option). In FreeBSD,
    automatic flow labels default to enabled.
    
    Performance impact:
    
    Running super_netperf with 200 flows for TCP_RR and UDP_RR for
    IPv6. Note that in UDP case, __skb_get_hash will be called for
    every packet with explains slight regression. In the TCP case
    the hash is saved in the socket so there is no regression.
    
    Automatic flow labels disabled:
    
      TCP_RR:
        86.53% CPU utilization
        127/195/322 90/95/99% latencies
        1.40498e+06 tps
    
      UDP_RR:
        90.70% CPU utilization
        118/168/243 90/95/99% latencies
        1.50309e+06 tps
    
    Automatic flow labels enabled:
    
      TCP_RR:
        85.90% CPU utilization
        128/199/337 90/95/99% latencies
        1.40051e+06
    
      UDP_RR
        92.61% CPU utilization
        115/164/236 90/95/99% latencies
        1.4687e+06
    
    Signed-off-by: Tom Herbert <therbert at google.com>
    Signed-off-by: David S. Miller <davem at davemloft.net>
    
    (cherry picked from commit cb1ce2ef387b01686469487edd45994872d52d73)
    [VvS: a number of conflicts have been resolved,
     restored hunks dropped in RHEL7 backports]
    https://pmc.acronis.com/browse/VSTOR-47671
    Signed-off-by: Vasily Averin <vvs at virtuozzo.com>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/linux/ipv6.h                   |  3 ++-
 include/net/ipv6.h                     | 25 +++++++++++++++++++++++++
 include/net/netns/ipv6.h               |  1 +
 include/uapi/linux/in6.h               |  1 +
 net/ipv6/af_inet6.c                    |  1 +
 net/ipv6/ip6_gre.c                     |  4 +++-
 net/ipv6/ip6_output.c                  |  8 ++++++--
 net/ipv6/ip6_tunnel.c                  |  3 ++-
 net/ipv6/ipv6_sockglue.c               |  8 ++++++++
 net/ipv6/sysctl_net_ipv6.c             |  8 ++++++++
 11 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5bce561..f549a89 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1197,6 +1197,15 @@ mld_qrv - INTEGER
 	Default: 2 (as specified by RFC3810 9.1)
 	Minimum: 1 (as specified by RFC6636 4.5)
 
+auto_flowlabels - BOOLEAN
+	Automatically generate flow labels based based on a flow hash
+	of the packet. This allows intermediate devices, such as routers,
+	to idenfify packet flows for mechanisms like Equal Cost Multipath
+	Routing (see RFC 6438).
+	TRUE: enabled
+	FALSE: disabled
+	Default: false
+
 anycast_src_echo_reply - BOOLEAN
 	Controls the use of anycast addresses as source addresses for ICMPv6
 	echo reply
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 98b79d1..452fc5f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -213,7 +213,8 @@ struct ipv6_pinfo {
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				dontfrag:1;
+				dontfrag:1,
+				autoflowlabel:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 3a83ff7..daa5937 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -717,10 +717,35 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
+static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
+					__be32 flowlabel, bool autolabel)
+{
+	if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) {
+		__be32 hash;
+
+		hash = skb_get_hash(skb);
+
+		/* Since this is being sent on the wire obfuscate hash a bit
+		 * to minimize possbility that any useful information to an
+		 * attacker is leaked. Only lower 20 bits are relevant.
+		 */
+		hash ^= hash >> 12;
+
+		flowlabel = hash & IPV6_FLOWLABEL_MASK;
+	}
+
+	return flowlabel;
+}
 #else
 static inline void ip6_set_txhash(struct sock *sk) { }
+static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
+					__be32 flowlabel, bool autolabel)
+{
+	return flowlabel;
+}
 #endif
 
+
 /*
  *	Header manipulation
  */
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 005e2c2..65b0477 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -27,6 +27,7 @@ struct netns_sysctl_ipv6 {
 	int ip6_rt_gc_elasticity;
 	int ip6_rt_mtu_expires;
 	int ip6_rt_min_advmss;
+	int auto_flowlabels;
 	int icmpv6_time;
 };
 
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index bfd883c..64bb863 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -233,6 +233,7 @@ struct in6_flowlabel_req {
 #if 0	/* not yet */
 #define IPV6_USE_MIN_MTU	63
 #endif
+#define IPV6_AUTOFLOWLABEL	64
 
 /*
  * Netfilter (1)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index dbac490..8142153 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -835,6 +835,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
 	net->idgen_retries = 3;
 	net->idgen_delay = 1 * HZ;
+	net->ipv6.sysctl.auto_flowlabels = 0;
 
 	err = ipv6_init_mibs(net);
 	if (err)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b927320..cd29f46 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1043,7 +1043,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
 	__be16 *p;
 
 	ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h));
-	ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel);
+	ip6_flow_hdr(ipv6h, 0,
+		     ip6_make_flowlabel(dev_net(dev), skb,
+					t->fl.u.ip6.flowlabel, false));
 	ipv6h->hop_limit = t->parms.hop_limit;
 	ipv6h->nexthdr = NEXTHDR_GRE;
 	ipv6h->saddr = t->parms.laddr;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6c2082d..f6e951e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -224,7 +224,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	if (hlimit < 0)
 		hlimit = ip6_dst_hoplimit(dst);
 
-	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
+	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
+						     np->autoflowlabel));
 
 	hdr->payload_len = htons(seg_len);
 	hdr->nexthdr = proto;
@@ -1613,6 +1614,7 @@ struct sk_buff * __ip6_make_skb(struct sock *sk,
 	struct sk_buff *skb, *tmp_skb;
 	struct sk_buff **tail_skb;
 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ipv6hdr *hdr;
 	struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1652,7 +1654,9 @@ struct sk_buff * __ip6_make_skb(struct sock *sk,
 	skb_reset_network_header(skb);
 	hdr = ipv6_hdr(skb);
 
-	ip6_flow_hdr(hdr, v6_cork->tclass, fl6->flowlabel);
+	ip6_flow_hdr(hdr, v6_cork->tclass,
+		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
+					np->autoflowlabel));
 	hdr->hop_limit = v6_cork->hop_limit;
 	hdr->nexthdr = proto;
 	hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 900cd7f..d1d2d88 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1193,7 +1193,8 @@ route_lookup:
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
-	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel);
+	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+		     ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
 	ipv6h->hop_limit = hop_limit;
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4f9a56c..e9ab0a9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -864,6 +864,10 @@ pref_skip_coa:
 		np->dontfrag = valbool;
 		retv = 0;
 		break;
+	case IPV6_AUTOFLOWLABEL:
+		np->autoflowlabel = valbool;
+		retv = 0;
+		break;
 	}
 
 	release_sock(sk);
@@ -1271,6 +1275,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = np->dontfrag;
 		break;
 
+	case IPV6_AUTOFLOWLABEL:
+		val = np->autoflowlabel;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e42080e..e11127d 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -61,6 +61,13 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "auto_flowlabels",
+		.data		= &init_net.ipv6.sysctl.auto_flowlabels,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
@@ -101,6 +108,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
 	ipv6_table[3].data = &net->idgen_retries;
 	ipv6_table[4].data = &net->idgen_delay;
 	ipv6_table[5].data = &net->ipv6_sysctl_fwmark_reflect;
+	ipv6_table[6].data = &net->ipv6.sysctl.auto_flowlabels;
 
 	ipv6_route_table = ipv6_route_sysctl_init(net);
 	if (!ipv6_route_table)


More information about the Devel mailing list