[Devel] [PATCH vz9 v2] ve/netfilter: Add autoloading of sockopt modules

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Thu Nov 11 19:45:30 MSK 2021


Partially based on vz8 commit 3cd26ece5d16 ("ve/netfilter:
Add autoloading of sockopt modules").
Original task: https://jira.sw.ru/browse/PSBM-28910

On netfilter [gs]etsockopt() call, if implementation of the requested
operation is missing in the currently running kernel, try to load the
module containing that implementation.

A hardcoded table is used to look up the name of the module to load.
This table contains information about all in-tree kernel modules that
provide netfilter [gs]etsockopt() operations, as of kernel version 5.14.

Unlike previous versions of this functionality, CAP_NET_ADMIN is no
longer required to attempt module loading. Not all [gs]etsockopt()
operations require additional privileges, and issuing one that does not
require them must not fail because of a missing module. The required
permission checks are performed inside the operations themselves.

Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
v2:
- fix typos in commit message

 kernel/kmod.c              |   3 +
 net/netfilter/nf_sockopt.c | 119 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 678735dbb969..16563ff101f3 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -214,6 +214,7 @@ static const char * const ve0_allowed_mod[] = {
 	"ip6table_security",
 	"ip6table_nat",
 	"ip6table_mangle",
+	"arp_tables",
 
 	"nf-nat",
 	"nf_conncount",
@@ -270,6 +271,7 @@ static const char * const ve0_allowed_mod[] = {
 
 	/* ip_set */
 	"nfnetlink-subsys-6",		/* NFNL_SUBSYS_IPSET */
+	"ip_set",
 	"ip_set_bitmap:ip",
 	"ip_set_bitmap:ip,mac",
 	"ip_set_bitmap:port",
@@ -290,6 +292,7 @@ static const char * const ve0_allowed_mod[] = {
 	"nfsv4",
 
 	/* IPVS */
+	"ip_vs",
 	"ip_vs_ftp",
 	"ip_vs_nq",
 	"ip_vs_wlc",
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 34afcd03b6f6..4613428a9679 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -7,6 +7,15 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
+#ifdef CONFIG_VE
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/ip_vs.h>
+#endif /* CONFIG_VE */
+
 #include "nf_internals.h"
 
 /* Sockopts only registered and called from user context, so
@@ -89,13 +98,119 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
 	return ops;
 }
 
+#ifdef CONFIG_VE
+static int nf_sockopt_request_module(u8 pf, int val, int get)
+{
+	/* Load the module implementing sockopt 'val' for family 'pf'; 'get'
+	 * selects the getsockopt range. Returns -EOPNOTSUPP if no table
+	 * entry matches, otherwise the request_module() result.
+	 *
+	 * Sockopt ranges are normally owned by the module and registered via
+	 * nf_register_sockopt(), but a not-yet-loaded module must be found by
+	 * sockopt number. TODO: evaluate if module aliases, device tables or
+	 * similar could avoid duplicating that information in the table below.
+	 */
+	struct table_entry {
+		const char *name;
+		u8 pf;
+		int get_min;
+		int get_max;
+		int set_min;
+		int set_max;
+	};
+
+#define TABLE_ENTRY(_name, _pf, _prefix) {		\
+	.name = _name,					\
+	.pf = _pf,					\
+	.get_min = _prefix ## _BASE_CTL,		\
+	.get_max = _prefix ## _SO_GET_MAX,		\
+	.set_min = _prefix ## _BASE_CTL,		\
+	.set_max = _prefix ## _SO_SET_MAX,		\
+}
+#define TABLE_ENTRY_SINGLE_GET(_name, _pf, _val) {	\
+	.name = _name,					\
+	.pf = _pf,					\
+	.get_min = _val,				\
+	.get_max = _val,				\
+	.set_min = 0,					\
+	.set_max = -1,					\
+}
+
+	static const struct table_entry table[] = {
+#ifdef CONFIG_IP_NF_IPTABLES_MODULE
+		TABLE_ENTRY("ip_tables", PF_INET, IPT),
+#endif
+#ifdef CONFIG_IP6_NF_IPTABLES_MODULE
+		TABLE_ENTRY("ip6_tables", PF_INET6, IP6T),
+#endif
+#ifdef CONFIG_IP_NF_ARPTABLES_MODULE
+		TABLE_ENTRY("arp_tables", PF_INET, ARPT),
+#endif
+#ifdef CONFIG_BRIDGE_NF_EBTABLES_MODULE
+		TABLE_ENTRY("ebtables", PF_INET, EBT),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MODULE
+		TABLE_ENTRY_SINGLE_GET("nf_conntrack", PF_INET,
+				       SO_ORIGINAL_DST),
+		TABLE_ENTRY_SINGLE_GET("nf_conntrack", PF_INET6,
+				       IP6T_SO_ORIGINAL_DST),
+#endif
+#ifdef CONFIG_IP_SET_MODULE
+		TABLE_ENTRY_SINGLE_GET("ip_set", PF_INET, SO_IP_SET),
+#endif
+#ifdef CONFIG_IP_VS_MODULE
+		TABLE_ENTRY("ip_vs", PF_INET, IP_VS),
+#endif
+	};
+#undef TABLE_ENTRY
+#undef TABLE_ENTRY_SINGLE_GET
+
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(table); i++) {
+		if (pf != table[i].pf)
+			continue;
+		if (get && val >= table[i].get_min && val <= table[i].get_max)
+			break;
+		if (!get && val >= table[i].set_min && val <= table[i].set_max)
+			break;
+	}
+
+	if (i == ARRAY_SIZE(table))
+		return -EOPNOTSUPP;
+
+	return request_module("%s", table[i].name);
+}
+
+static struct nf_sockopt_ops *nf_sockopt_find_ve(struct sock *sk, u_int8_t pf,
+						 int val, int get)
+{
+	struct nf_sockopt_ops *found = nf_sockopt_find(sk, pf, val, get);
+
+	/* Containers are not able to load the appropriate modules from
+	 * userspace, so when the lookup fails outside the super VE, request
+	 * the module on their behalf and retry the lookup. For containers
+	 * this looks like the module is already loaded or the driver is
+	 * built into the kernel.
+	 */
+	if (IS_ERR(found) && !ve_is_super(get_exec_env())) {
+		if (nf_sockopt_request_module(pf, val, get) == 0)
+			found = nf_sockopt_find(sk, pf, val, get);
+	}
+
+	return found;
+}
+#else /* !CONFIG_VE */
+#define nf_sockopt_find_ve(sk, pf, val, get)	nf_sockopt_find(sk, pf, val, get)
+#endif /* !CONFIG_VE */
+
 int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt,
 		  unsigned int len)
 {
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, 0);
+	ops = nf_sockopt_find_ve(sk, pf, val, 0);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 	ret = ops->set(sk, val, opt, len);
@@ -110,7 +225,7 @@ int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, 1);
+	ops = nf_sockopt_find_ve(sk, pf, val, 1);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 	ret = ops->get(sk, val, opt, len);
-- 
2.30.2



More information about the Devel mailing list