[Devel] [PATCH RHEL7 COMMIT] ms/netns: add pre_exit method to struct pernet_operations

Konstantin Khorenko khorenko at virtuozzo.com
Mon Sep 30 15:21:54 MSK 2024


The commit is pushed to "branch-rh7-3.10.0-1160.119.1.vz7.224.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh7-3.10.0-1160.119.1.vz7.224.3
------>
commit 156dfdd555a9967e67732936a40685890c39e4e6
Author: Eric Dumazet <edumazet at google.com>
Date:   Wed Sep 25 17:35:44 2024 +0800

    ms/netns: add pre_exit method to struct pernet_operations
    
    Current struct pernet_operations exit() handlers are highly
    discouraged to call synchronize_rcu().
    
    There are cases where we need them, and exit_batch() does
    not help the common case where a single netns is dismantled.
    
    This patch leverages the existing synchronize_rcu() call
    in cleanup_net()
    
    Calling optional ->pre_exit() method before ->exit() or
    ->exit_batch() allows to benefit from a single synchronize_rcu()
    call.
    
    Note that the synchronize_rcu() calls added in this patch
    are only in error paths or slow paths.
    
    Tested:
    
    $ time for i in {1..1000}; do unshare -n /bin/false;done
    
    real    0m2.612s
    user    0m0.171s
    sys     0m2.216s
    
    Signed-off-by: Eric Dumazet <edumazet at google.com>
    Signed-off-by: David S. Miller <davem at davemloft.net>
    
    https://virtuozzo.atlassian.net/browse/PSBM-155867
    (cherry picked from commit d7d99872c144a2c2f5d9c9d83627fa833836cba5)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    =================
    Patchset description:
    netfilter: ipset: Fix possible cause of memory corruption
    
    Patch [1] fixes possible race between swap/destroy and add/del/test.
    
    Here is is possible order of events when this race can lead to double
    free with kfree_rcu on already freed hash bucket:
    
     # Thread 1
      +-> ip_set_add
        +-> set = ip_set_rcu_get(xt_net(par), index)
        < pause >
    
     # Thread 2
    ipset swap
    ipset destroy
      +-> mtype_destroy
        +-> mtype_ahash_destroy
          +-> n = __ipset_dereference(hbucket(t, i))
          +-> kfree(n)
    
     # Thread 1
        < unpause >
        +-> ip_set_lock(set)
        +-> hash_net4_kadt
          +-> mtype_variant->adt[adt]
            +-> net4_kadt_add (mtype_add)
              +-> n = rcu_dereference_bh(hbucket(t, key))
              +-> if (n->pos >= n->size)
                +-> old = n
              +-> if (old != ERR_PTR(-ENOENT))
                +-> kfree_rcu(old, rcu)
    
    That can in it's turn lead to possible rcu free list corruption if this
    double fried memory is reused just after rcu_free and before actual rcu
    callback.
    
    note1: The patch [1] has a reproducer but, sadly, I was unable to
    reproduce the situation, even adding mdelay(100) to ip_set_add/del/test.
    
    note2: All other patches are fixups to the original fixing patch.
    
    note3: We don't have proof that this a fix to original issue, we only
    know that ipset hash buckets were several times seen double freed just
    before crash, which might indirectly indicate that the original problem
    is related to ipset hash buckets.
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    https://virtuozzo.atlassian.net/browse/PSBM-155867
    
    Alexander Maltsev (1):
      netfilter: ipset: Add list flush to cancel_gc
    
    Eric Dumazet (1):
      netns: add pre_exit method to struct pernet_operations
    
    Jozsef Kadlecsik (5):
      netfilter: ipset: fix race condition between swap/destroy and kernel
        side add/del/test [1]
      netfilter: ipset: fix performance regression in swap operation
      netfilter: ipset: Missing gc cancellations fixed
      netfilter: ipset: Fix race between namespace cleanup and gc in the
        list:set type
      netfilter: ipset: Fix suspicious rcu_dereference_protected()
---
 include/net/net_namespace.h |  6 ++++++
 net/core/net_namespace.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index d00dc43efd52..57830afc25cf 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -364,7 +364,13 @@ struct net *get_net_ns_by_id(struct net *net, int id);
 
 struct pernet_operations {
 	struct list_head list;
+	/*
+	 * Note that a combination of pre_exit() and exit() can
+	 * be used, since a synchronize_rcu() is guaranteed between
+	 * the calls.
+	 */
 	int (*init)(struct net *net);
+	void (*pre_exit)(struct net *net);
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
 	int *id;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 644e38948d10..16d3a111d882 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -139,6 +139,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
 	}
 }
 
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+			      struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	if (ops->pre_exit) {
+		list_for_each_entry(net, net_exit_list, exit_list)
+			ops->pre_exit(net);
+	}
+}
+
 static void ops_exit_list(const struct pernet_operations *ops,
 			  struct list_head *net_exit_list)
 {
@@ -338,6 +349,12 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	 * for the pernet modules whose init functions did not fail.
 	 */
 	list_add(&net->exit_list, &net_exit_list);
+	saved_ops = ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_pre_exit_list(ops, &net_exit_list);
+
+	synchronize_rcu();
+
 	saved_ops = ops;
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
@@ -509,10 +526,15 @@ static void cleanup_net(struct work_struct *work)
 		list_add_tail(&net->exit_list, &net_exit_list);
 	}
 
+	/* Run all of the network namespace pre_exit methods */
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_pre_exit_list(ops, &net_exit_list);
+
 	/*
 	 * Another CPU might be rcu-iterating the list, wait for it.
 	 * This needs to be before calling the exit() notifiers, so
 	 * the rcu_barrier() below isn't sufficient alone.
+	 * Also the pre_exit() and exit() methods need this barrier.
 	 */
 	synchronize_rcu();
 
@@ -883,6 +905,8 @@ static int __register_pernet_operations(struct list_head *list,
 out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 	return error;
@@ -896,6 +920,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 	list_del(&ops->list);
 	for_each_net(net)
 		list_add_tail(&net->exit_list, &net_exit_list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 }
@@ -912,6 +938,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
 	LIST_HEAD(net_exit_list);
 	list_add(&init_net.exit_list, &net_exit_list);
+	ops_pre_exit_list(ops, &net_exit_list);
+	synchronize_rcu();
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
 }


More information about the Devel mailing list