[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: ipset: fix performance regression in swap operation

Konstantin Khorenko khorenko at virtuozzo.com
Mon Sep 30 15:21:54 MSK 2024


The commit is pushed to "branch-rh7-3.10.0-1160.119.1.vz7.224.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh7-3.10.0-1160.119.1.vz7.224.3
------>
commit b60644e6ad4f37478b4dfa0d4d25bdff08d887b7
Author: Jozsef Kadlecsik <kadlec at netfilter.org>
Date:   Wed Sep 25 17:35:41 2024 +0800

    ms/netfilter: ipset: fix performance regression in swap operation
    
    The patch "netfilter: ipset: fix race condition between swap/destroy
    and kernel side add/del/test", commit 28628fa9 fixes a race condition.
    But the synchronize_rcu() added to the swap function unnecessarily slows
    it down: it can safely be moved to destroy and use call_rcu() instead.
    
    Eric Dumazet pointed out that simply calling the destroy functions as
    rcu callback does not work: sets with timeout use garbage collectors
    which need cancelling at destroy which can wait. Therefore the destroy
    functions are split into two: cancelling garbage collectors safely at
    executing the command received by netlink and moving the remaining
    part only into the rcu callback.
    
    Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
    Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
    Reported-by: Ale Crismani <ale.crismani at automattic.com>
    Reported-by: David Wang <00107082 at 163.com>
    Tested-by: David Wang <00107082 at 163.com>
    Signed-off-by: Jozsef Kadlecsik <kadlec at netfilter.org>
    Signed-off-by: Pablo Neira Ayuso <pablo at netfilter.org>
    
    https://virtuozzo.atlassian.net/browse/PSBM-155867
    (cherry picked from commit 97f7cf1cd80eeed3b7c808b7c12463295c751001)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    =================
    Patchset description:
    netfilter: ipset: Fix possible cause of memory corruption
    
    Patch [1] fixes possible race between swap/destroy and add/del/test.
    
    Here is is possible order of events when this race can lead to double
    free with kfree_rcu on already freed hash bucket:
    
     # Thread 1
      +-> ip_set_add
        +-> set = ip_set_rcu_get(xt_net(par), index)
        < pause >
    
     # Thread 2
    ipset swap
    ipset destroy
      +-> mtype_destroy
        +-> mtype_ahash_destroy
          +-> n = __ipset_dereference(hbucket(t, i))
          +-> kfree(n)
    
     # Thread 1
        < unpause >
        +-> ip_set_lock(set)
        +-> hash_net4_kadt
          +-> mtype_variant->adt[adt]
            +-> net4_kadt_add (mtype_add)
              +-> n = rcu_dereference_bh(hbucket(t, key))
              +-> if (n->pos >= n->size)
                +-> old = n
              +-> if (old != ERR_PTR(-ENOENT))
                +-> kfree_rcu(old, rcu)
    
    That can in it's turn lead to possible rcu free list corruption if this
    double fried memory is reused just after rcu_free and before actual rcu
    callback.
    
    note1: The patch [1] has a reproducer but, sadly, I was unable to
    reproduce the situation, even adding mdelay(100) to ip_set_add/del/test.
    
    note2: All other patches are fixups to the original fixing patch.
    
    note3: We don't have proof that this a fix to original issue, we only
    know that ipset hash buckets were several times seen double freed just
    before crash, which might indirectly indicate that the original problem
    is related to ipset hash buckets.
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    https://virtuozzo.atlassian.net/browse/PSBM-155867
    
    Alexander Maltsev (1):
      netfilter: ipset: Add list flush to cancel_gc
    
    Eric Dumazet (1):
      netns: add pre_exit method to struct pernet_operations
    
    Jozsef Kadlecsik (5):
      netfilter: ipset: fix race condition between swap/destroy and kernel
        side add/del/test [1]
      netfilter: ipset: fix performance regression in swap operation
      netfilter: ipset: Missing gc cancellations fixed
      netfilter: ipset: Fix race between namespace cleanup and gc in the
        list:set type
      netfilter: ipset: Fix suspicious rcu_dereference_protected()
---
 include/linux/netfilter/ipset/ip_set.h  |  4 ++++
 net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++++---
 net/netfilter/ipset/ip_set_core.c       | 37 +++++++++++++++++++++++++--------
 net/netfilter/ipset/ip_set_hash_gen.h   | 15 ++++++++++---
 net/netfilter/ipset/ip_set_list_set.c   | 13 +++++++++---
 5 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 471363b8862f..328802219857 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -191,6 +191,8 @@ struct ip_set_type_variant {
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
 	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+	/* Cancel ongoing garbage collectors before destroying the set*/
+	void (*cancel_gc)(struct ip_set *set);
 	/* Region-locking is used */
 	bool region_lock;
 };
@@ -239,6 +241,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
 
 /* A generic IP set */
 struct ip_set {
+	/* For call_rcu in destroy */
+	struct rcu_head rcu;
 	/* The name of the set */
 	char name[IPSET_MAXNAMELEN];
 	/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index af480ffefaf3..2518258c6e67 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -32,6 +32,7 @@
 #define mtype_del		IPSET_TOKEN(MTYPE, _del)
 #define mtype_list		IPSET_TOKEN(MTYPE, _list)
 #define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc		IPSET_TOKEN(MTYPE, _cancel_gc)
 #define mtype			MTYPE
 
 #define get_ext(set, map, id)	((map)->extensions + ((set)->dsize * (id)))
@@ -61,9 +62,6 @@ mtype_destroy(struct ip_set *set)
 {
 	struct mtype *map = set->data;
 
-	if (SET_WITH_TIMEOUT(set))
-		del_timer_sync(&map->gc);
-
 	if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
 		mtype_ext_cleanup(set);
 	ip_set_free(map->members);
@@ -292,6 +290,15 @@ mtype_gc(struct timer_list *t)
 	add_timer(&map->gc);
 }
 
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+	struct mtype *map = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		del_timer_sync(&map->gc);
+}
+
 static const struct ip_set_type_variant mtype = {
 	.kadt	= mtype_kadt,
 	.uadt	= mtype_uadt,
@@ -305,6 +312,7 @@ static const struct ip_set_type_variant mtype = {
 	.head	= mtype_head,
 	.list	= mtype_list,
 	.same_set = mtype_same_set,
+	.cancel_gc = mtype_cancel_gc,
 };
 
 #endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index ca30e6495fc7..7cf9da0881d5 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1038,6 +1038,14 @@ ip_set_destroy_set(struct ip_set *set)
 	kfree(set);
 }
 
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+	struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+	ip_set_destroy_set(set);
+}
+
 static int
 ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	       const struct nlmsghdr *nlh,
@@ -1051,8 +1059,6 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
-	/* Must wait for flush to be really finished in list:set */
-	rcu_barrier();
 
 	/* Commands are serialized and references are
 	 * protected by the ip_set_ref_lock.
@@ -1064,8 +1070,10 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	 * counter, so if it's already zero, we can proceed
 	 * without holding the lock.
 	 */
-	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
+		/* Must wait for flush to be really finished in list:set */
+		rcu_barrier();
+		read_lock_bh(&ip_set_ref_lock);
 		for (i = 0; i < inst->ip_set_max; i++) {
 			s = ip_set(inst, i);
 			if (s && (s->ref || s->ref_netlink)) {
@@ -1079,12 +1087,17 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 			s = ip_set(inst, i);
 			if (s) {
 				ip_set(inst, i) = NULL;
+				/* Must cancel garbage collectors */
+				s->variant->cancel_gc(s);
 				ip_set_destroy_set(s);
 			}
 		}
 		/* Modified by ip_set_destroy() only, which is serialized */
 		inst->is_destroyed = false;
 	} else {
+		u16 features = 0;
+
+		read_lock_bh(&ip_set_ref_lock);
 		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
 				    &i);
 		if (!s) {
@@ -1094,10 +1107,16 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 			ret = -IPSET_ERR_BUSY;
 			goto out;
 		}
+		features = s->type->features;
 		ip_set(inst, i) = NULL;
 		read_unlock_bh(&ip_set_ref_lock);
-
-		ip_set_destroy_set(s);
+		if (features & IPSET_TYPE_NAME) {
+			/* Must wait for flush to be really finished  */
+			rcu_barrier();
+		}
+		/* Must cancel garbage collectors */
+		s->variant->cancel_gc(s);
+		call_rcu(&s->rcu, ip_set_destroy_set_rcu);
 	}
 	return 0;
 out:
@@ -1256,9 +1275,6 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 	ip_set(inst, to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
-	/* Make sure all readers of the old set pointers are completed. */
-	synchronize_rcu();
-
 	return 0;
 }
 
@@ -2285,8 +2301,11 @@ ip_set_fini(void)
 {
 	nf_unregister_sockopt(&so_set);
 	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
 	unregister_pernet_subsys(&ip_set_net_ops);
+
+	/* Wait for call_rcu() in destroy */
+	rcu_barrier();
+
 	pr_debug("these are the famous last words\n");
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 596bb2662b0e..9c6383592c20 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -253,6 +253,7 @@ htable_bits(u32 hashsize)
 #undef mtype_gc_do
 #undef mtype_gc
 #undef mtype_gc_init
+#undef mtype_cancel_gc
 #undef mtype_variant
 #undef mtype_data_match
 
@@ -297,6 +298,7 @@ htable_bits(u32 hashsize)
 #define mtype_gc_do		IPSET_TOKEN(MTYPE, _gc_do)
 #define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
 #define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc		IPSET_TOKEN(MTYPE, _cancel_gc)
 #define mtype_variant		IPSET_TOKEN(MTYPE, _variant)
 #define mtype_data_match	IPSET_TOKEN(MTYPE, _data_match)
 
@@ -482,9 +484,6 @@ mtype_destroy(struct ip_set *set)
 	struct htype *h = set->data;
 	struct list_head *l, *lt;
 
-	if (SET_WITH_TIMEOUT(set))
-		cancel_delayed_work_sync(&h->gc.dwork);
-
 	mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
 	list_for_each_safe(l, lt, &h->ad) {
 		list_del(l);
@@ -631,6 +630,15 @@ mtype_gc_init(struct htable_gc *gc)
 	queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
 }
 
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+	struct htype *h = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		cancel_delayed_work_sync(&h->gc.dwork);
+}
+
 static int
 mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	  struct ip_set_ext *mext, u32 flags);
@@ -1447,6 +1455,7 @@ static const struct ip_set_type_variant mtype_variant = {
 	.uref	= mtype_uref,
 	.resize	= mtype_resize,
 	.same_set = mtype_same_set,
+	.cancel_gc = mtype_cancel_gc,
 	.region_lock = true,
 };
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 8da228da53ae..b9dc3063e0d4 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -430,9 +430,6 @@ list_set_destroy(struct ip_set *set)
 	struct list_set *map = set->data;
 	struct set_elem *e, *n;
 
-	if (SET_WITH_TIMEOUT(set))
-		del_timer_sync(&map->gc);
-
 	list_for_each_entry_safe(e, n, &map->members, list) {
 		list_del(&e->list);
 		ip_set_put_byindex(map->net, e->id);
@@ -549,6 +546,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
 	       a->extensions == b->extensions;
 }
 
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+	struct list_set *map = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		del_timer_sync(&map->gc);
+}
+
 static const struct ip_set_type_variant set_variant = {
 	.kadt	= list_set_kadt,
 	.uadt	= list_set_uadt,
@@ -562,6 +568,7 @@ static const struct ip_set_type_variant set_variant = {
 	.head	= list_set_head,
 	.list	= list_set_list,
 	.same_set = list_set_same_set,
+	.cancel_gc = list_set_cancel_gc,
 };
 
 static void


More information about the Devel mailing list