[Devel] [PATCH RHEL COMMIT] cgroup/net_prio: virtualize ifpriomap per-ve

Tue Oct 12 16:45:10 MSK 2021

The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 1c440266bd826dba9dc46d2cee486aed16a5233a
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Tue Oct 12 16:45:10 2021 +0300

    cgroup/net_prio: virtualize ifpriomap per-ve
    
    Ifpriomap is a map from net_prio cgroup id to device priority. Each
    process is in some net_prio cgroup, and all sockets of this process
    carry the id of that cgroup. When a packet from such a socket goes
    through the network stack, we choose the packet's priority on each
    device it passes through based on this device+id->prio map.
    
    Previously we were able to set the map for each net_prio cgroup on the
    system, but only for devices of the host init network namespace. This
    patch adds mappings for ve init netns devices. A VE can only get/change
    the device map for its ve init netns; the host can only get/change the
    device map for the host's init netns.
    
    For the same cgroup we can have both mappings set up by the host for
    host net devices and mappings set up by the ve for ve net devices.
    
    When a new cgroup is created, it copies only the mappings for host
    network devices if created from the host, or also copies the mappings
    for ve network devices if created from a ve.
    
    If the ve is not running (ve_ns is NULL), we operate on the host
    ifpriomap even while inside the ve.
    
    https://jira.sw.ru/browse/PSBM-123766
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    ====================
    cgroup: ifpriomap virtualization
    
    I've also added a get_curr_ve() helper, as it looks like in many places
    we rely on get_exec_env() giving us a ve which will not be freed under
    us, but all processes can easily be moved out of this ve in parallel,
    and then the ve can be freed AFAICS.
    
    https://jira.sw.ru/browse/PSBM-123766
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 net/core/netprio_cgroup.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..0ab8c37c42b8 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -17,6 +17,7 @@
 #include <linux/rcupdate.h>
 #include <linux/atomic.h>
 #include <linux/sched/task.h>
+#include <linux/ve.h>
 
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
@@ -145,6 +146,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
 static int cgrp_css_online(struct cgroup_subsys_state *css)
 {
 	struct cgroup_subsys_state *parent_css = css->parent;
+	struct ve_struct *ve;
 	struct net_device *dev;
 	int ret = 0;
 
@@ -166,6 +168,38 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
 		if (ret)
 			break;
 	}
+
+	/* get_exec_env is safe under cgroup_mutex */
+	ve = get_exec_env();
+	/*
+	 * Inherit prios from the parent cgroup in scope of ve init netns.
+	 */
+	if (!ve_is_super(ve)) {
+		struct nsproxy *ve_ns;
+		struct net *net = NULL;
+
+		/*
+		 * Take rcu read lock to check that ve's net is not freed under
+		 * us. After we release rcu read lock we still have rtnl lock to
+		 * ensure net remains non-freed; this pairs with rtnl lock in
+		 * cleanup_net().
+		 */
+		rcu_read_lock();
+		ve_ns = rcu_dereference(ve->ve_ns);
+		if (ve_ns)
+			net = ve_ns->net_ns;
+		rcu_read_unlock();
+
+		if (net && net != &init_net) {
+			for_each_netdev(net, dev) {
+				u32 prio = netprio_prio(parent_css, dev);
+
+				ret = netprio_set_prio(css, dev, prio);
+				if (ret)
+					break;
+			}
+		}
+	}
 	rtnl_unlock();
 	return ret;
 }
@@ -182,19 +216,38 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
 
 static int read_priomap(struct seq_file *sf, void *v)
 {
+	struct ve_struct *ve;
+	struct net *net, *_net = NULL;
 	struct net_device *dev;
 
+	ve = get_curr_ve();
+	if (!ve_is_super(ve)) {
+		struct nsproxy *ve_ns;
+
+		rcu_read_lock();
+		ve_ns = rcu_dereference(ve->ve_ns);
+		if (ve_ns)
+			_net = get_net(ve_ns->net_ns);
+		rcu_read_unlock();
+	}
+	put_ve(ve);
+
+	net = _net ? : &init_net;
 	rcu_read_lock();
-	for_each_netdev_rcu(&init_net, dev)
+	for_each_netdev_rcu(net, dev)
 		seq_printf(sf, "%s %u\n", dev->name,
 			   netprio_prio(seq_css(sf), dev));
 	rcu_read_unlock();
+	if (_net)
+		put_net(_net);
 	return 0;
 }
 
 static ssize_t write_priomap(struct kernfs_open_file *of,
 			     char *buf, size_t nbytes, loff_t off)
 {
+	struct ve_struct *ve;
+	struct net *net, *_net = NULL;
 	char devname[IFNAMSIZ + 1];
 	struct net_device *dev;
 	u32 prio;
@@ -203,7 +256,22 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 	if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
 		return -EINVAL;
 
-	dev = dev_get_by_name(&init_net, devname);
+	ve = get_curr_ve();
+	if (!ve_is_super(ve)) {
+		struct nsproxy *ve_ns;
+
+		rcu_read_lock();
+		ve_ns = rcu_dereference(ve->ve_ns);
+		if (ve_ns)
+			_net = get_net(ve_ns->net_ns);
+		rcu_read_unlock();
+	}
+	put_ve(ve);
+
+	net = _net ? : &init_net;
+	dev = dev_get_by_name(net, devname);
+	if (_net)
+		put_net(_net);
 	if (!dev)
 		return -ENODEV;
 
@@ -253,6 +321,7 @@ static struct cftype ss_files[] = {
 	},
 	{
 		.name = "ifpriomap",
+		.flags = CFTYPE_VE_WRITABLE,
 		.seq_show = read_priomap,
 		.write = write_priomap,
 	},