[Devel] [PATCH RHEL7 COMMIT] ms/netns: add pre_exit method to struct pernet_operations
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Sep 30 15:21:54 MSK 2024
The commit is pushed to "branch-rh7-3.10.0-1160.119.1.vz7.224.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh7-3.10.0-1160.119.1.vz7.224.3
------>
commit 156dfdd555a9967e67732936a40685890c39e4e6
Author: Eric Dumazet <edumazet at google.com>
Date: Wed Sep 25 17:35:44 2024 +0800
ms/netns: add pre_exit method to struct pernet_operations
Current struct pernet_operations exit() handlers are highly
discouraged to call synchronize_rcu().
There are cases where we need them, and exit_batch() does
not help the common case where a single netns is dismantled.
This patch leverages the existing synchronize_rcu() call
in cleanup_net()
Calling optional ->pre_exit() method before ->exit() or
->exit_batch() allows to benefit from a single synchronize_rcu()
call.
Note that the synchronize_rcu() calls added in this patch
are only in error paths or slow paths.
Tested:
$ time for i in {1..1000}; do unshare -n /bin/false;done
real 0m2.612s
user 0m0.171s
sys 0m2.216s
Signed-off-by: Eric Dumazet <edumazet at google.com>
Signed-off-by: David S. Miller <davem at davemloft.net>
https://virtuozzo.atlassian.net/browse/PSBM-155867
(cherry picked from commit d7d99872c144a2c2f5d9c9d83627fa833836cba5)
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
=================
Patchset description:
netfilter: ipset: Fix possible cause of memory corruption
Patch [1] fixes possible race between swap/destroy and add/del/test.
Here is is possible order of events when this race can lead to double
free with kfree_rcu on already freed hash bucket:
# Thread 1
+-> ip_set_add
+-> set = ip_set_rcu_get(xt_net(par), index)
< pause >
# Thread 2
ipset swap
ipset destroy
+-> mtype_destroy
+-> mtype_ahash_destroy
+-> n = __ipset_dereference(hbucket(t, i))
+-> kfree(n)
# Thread 1
< unpause >
+-> ip_set_lock(set)
+-> hash_net4_kadt
+-> mtype_variant->adt[adt]
+-> net4_kadt_add (mtype_add)
+-> n = rcu_dereference_bh(hbucket(t, key))
+-> if (n->pos >= n->size)
+-> old = n
+-> if (old != ERR_PTR(-ENOENT))
+-> kfree_rcu(old, rcu)
That can in it's turn lead to possible rcu free list corruption if this
double fried memory is reused just after rcu_free and before actual rcu
callback.
note1: The patch [1] has a reproducer but, sadly, I was unable to
reproduce the situation, even adding mdelay(100) to ip_set_add/del/test.
note2: All other patches are fixups to the original fixing patch.
note3: We don't have proof that this a fix to original issue, we only
know that ipset hash buckets were several times seen double freed just
before crash, which might indirectly indicate that the original problem
is related to ipset hash buckets.
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
https://virtuozzo.atlassian.net/browse/PSBM-155867
Alexander Maltsev (1):
netfilter: ipset: Add list flush to cancel_gc
Eric Dumazet (1):
netns: add pre_exit method to struct pernet_operations
Jozsef Kadlecsik (5):
netfilter: ipset: fix race condition between swap/destroy and kernel
side add/del/test [1]
netfilter: ipset: fix performance regression in swap operation
netfilter: ipset: Missing gc cancellations fixed
netfilter: ipset: Fix race between namespace cleanup and gc in the
list:set type
netfilter: ipset: Fix suspicious rcu_dereference_protected()
---
include/net/net_namespace.h | 6 ++++++
net/core/net_namespace.c | 28 ++++++++++++++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index d00dc43efd52..57830afc25cf 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -364,7 +364,13 @@ struct net *get_net_ns_by_id(struct net *net, int id);
struct pernet_operations {
struct list_head list;
+ /*
+ * Note that a combination of pre_exit() and exit() can
+ * be used, since a synchronize_rcu() is guaranteed between
+ * the calls.
+ */
int (*init)(struct net *net);
+ void (*pre_exit)(struct net *net);
void (*exit)(struct net *net);
void (*exit_batch)(struct list_head *net_exit_list);
int *id;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 644e38948d10..16d3a111d882 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -139,6 +139,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
}
}
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
+ }
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -338,6 +349,12 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
* for the pernet modules whose init functions did not fail.
*/
list_add(&net->exit_list, &net_exit_list);
+ saved_ops = ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
+ synchronize_rcu();
+
saved_ops = ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
@@ -509,10 +526,15 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
+ /* Run all of the network namespace pre_exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
/*
* Another CPU might be rcu-iterating the list, wait for it.
* This needs to be before calling the exit() notifiers, so
* the rcu_barrier() below isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
*/
synchronize_rcu();
@@ -883,6 +905,8 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
return error;
@@ -896,6 +920,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
list_del(&ops->list);
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
@@ -912,6 +938,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
{
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
More information about the Devel
mailing list