[Devel] [PATCH RHEL COMMIT] ve/netns: limit number of network namespaces per container

Konstantin Khorenko khorenko at virtuozzo.com
Wed Sep 22 14:50:59 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit c3dcdffa5de452349fd6286adc6091970803c80f
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Wed Sep 22 14:50:58 2021 +0300

    ve/netns: limit number of network namespaces per container
    
    Network namespaces are destroyed asynchronously from net_cleanup_work
    (cleanup_net()). This work destroys all network namespaces which are
    queued in net_kill_list. It does this under net_mutex.
    
    This work is executed from a "kworker" kernel thread. We know that there
    is an avalanche effect here. If N processes which create network
    namespaces don't affect the time of creating new namespaces in other
    containers, it doesn't mean that N + 1 processes will not increase the
    creation time dozens of times. This can be explained as follows.
    
    The longer the kworker runs on a CPU, the longer it will not be
    scheduled the next time, and the more namespaces will be
    created during this period of time.
    
    The following script can be used to reproduce the problem:
    while :; do
            n=`jobs | wc -l`
            n=$((10000 - $n))
            for i in `seq $n`; do
                    unshare -m true &
            done
            wait -n 1
    done
    
    https://jira.sw.ru/browse/PSBM-49690
    
    v2: Describe the real reason of the issue.
    v3: - don't limit the number of netns for ve0
        - initialize netns before returning an error, otherwise
          net_namespace structures are not freed.
    v4: don't allocate a net_namespace structure if a ct hits the limit.
    
    khorenko@: v5: s/netns_nr/netns_avail_nr/g
    The variable contains the number of available netns, not the number of
    currently existing ones.
    
    Cc: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
    
    Signed-off-by: Andrey Vagin <avagin at openvz.org>
    
    Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    Rebase 4.14:
    Add separate read/write callbacks for the cgroup files, as is done for
    iptables_mask and features. Add helpers to change netns_avail_nr and
    handle the previously unhandled error path for net_alloc().
    
    We still need it, as the mainstream sysctl max_net_namespaces is per-user,
    not per ve/userns, and it sets the same max value for all users; thus it
    is not flexible enough for us.
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    (cherry-picked from vz8 commit 43e6ffb41ba7dccc3d225b0d8a954a9453167e7b)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 include/linux/ve.h       |  5 +++++
 kernel/ve/ve.c           | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 net/core/net_namespace.c | 25 +++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index ba3d1e517152..c8dd71cfa3d9 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -48,10 +48,15 @@ struct ve_struct {
 	__u64			features;
 
 	struct kmapset_key	sysfs_perms_key;
+
+	atomic_t		netns_avail_nr;
+	int			netns_max_nr;
 };
 
 extern int nr_ve;
 
+#define NETNS_MAX_NR_DEFAULT	256	/* number of net-namespaces per-VE */
+
 #define capable_setveid() \
 	(ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN))
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ff51f5678a83..f87e3fd53ec7 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -38,6 +38,8 @@ struct ve_struct ve0 = {
 
 	.init_cred		= &init_cred,
 	.features		= -1,
+	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
+	.netns_max_nr		= INT_MAX,
 };
 EXPORT_SYMBOL(ve0);
 
@@ -327,6 +329,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 		goto err_ve;
 
 	ve->features = VE_FEATURES_DEF;
+
+	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
+	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
 do_init:
 	init_rwsem(&ve->op_sem);
 	INIT_LIST_HEAD(&ve->ve_list);
@@ -599,6 +604,35 @@ static int ve_reatures_write(struct cgroup_subsys_state *css, struct cftype *cft
 	return 0;
 }
 
+static u64 ve_netns_max_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return css_to_ve(css)->netns_max_nr;
+}
+
+static int ve_netns_max_nr_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+	struct ve_struct *ve = css_to_ve(css);
+	int delta;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (ve->is_running || ve->ve_ns) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+	delta = val - ve->netns_max_nr;
+	ve->netns_max_nr = val;
+	atomic_add(delta, &ve->netns_avail_nr);
+	up_write(&ve->op_sem);
+	return 0;
+}
+static u64 ve_netns_avail_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return atomic_read(&css_to_ve(css)->netns_avail_nr);
+}
+
 static struct cftype ve_cftypes[] = {
 
 	{
@@ -625,7 +659,16 @@ static struct cftype ve_cftypes[] = {
 		.read_u64		= ve_reatures_read,
 		.write_u64		= ve_reatures_write,
 	},
-
+	{
+		.name			= "netns_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_netns_max_nr_read,
+		.write_u64		= ve_netns_max_nr_write,
+	},
+	{
+		.name			= "netns_avail_nr",
+		.read_u64		= ve_netns_avail_nr_read,
+	},
 	{ }
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e40ad5d30b15..ee0dbebe3bb5 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -455,9 +455,24 @@ void net_drop_ns(void *p)
 		net_free(ns);
 }
 
+#ifdef CONFIG_VE
+static int dec_netns_avail(struct ve_struct *ve)
+{
+	if (atomic_dec_if_positive(&ve->netns_avail_nr) < 0)
+		return -ENOSPC;
+	return 0;
+}
+
+static void inc_netns_avail(struct ve_struct *ve)
+{
+	atomic_inc(&ve->netns_avail_nr);
+}
+#endif
+
 struct net *copy_net_ns(unsigned long flags,
 			struct user_namespace *user_ns, struct net *old_net)
 {
+	struct ve_struct *ve = get_exec_env();
 	struct ucounts *ucounts;
 	struct net *net;
 	int rv;
@@ -469,6 +484,12 @@ struct net *copy_net_ns(unsigned long flags,
 	if (!ucounts)
 		return ERR_PTR(-ENOSPC);
 
+#ifdef CONFIG_VE
+	rv = dec_netns_avail(ve);
+	if (rv < 0)
+		return ERR_PTR(rv);
+#endif
+
 	net = net_alloc();
 	if (!net) {
 		rv = -ENOMEM;
@@ -493,6 +514,9 @@ struct net *copy_net_ns(unsigned long flags,
 		net_drop_ns(net);
 dec_ucounts:
 		dec_net_namespaces(ucounts);
+#ifdef CONFIG_VE
+		inc_netns_avail(ve);
+#endif
 		return ERR_PTR(rv);
 	}
 	return net;
@@ -623,6 +647,7 @@ static void cleanup_net(struct work_struct *work)
 		key_remove_domain(net->key_domain);
 		put_user_ns(net->user_ns);
 #ifdef CONFIG_VE
+		inc_netns_avail(net->owner_ve);
 		put_ve(net->owner_ve);
 #endif
 		net_drop_ns(net);


More information about the Devel mailing list