[Devel] [PATCH RHEL7 COMMIT] cgroup/net_prio: virtualize ifpriomap per-ve

Vasily Averin vvs at virtuozzo.com
Tue Aug 24 14:40:14 MSK 2021


The commit is pushed to "branch-rh7-3.10.0-1160.36.2.vz7.182.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.36.2.vz7.182.1
------>
commit 23b33c9424325cf6fc57c5ae0694b7c38636cfa4
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Tue Aug 24 14:40:14 2021 +0300

    cgroup/net_prio: virtualize ifpriomap per-ve
    
    Ifpriomap is a map from net_prio cgroup id to device priority. Each
    process belongs to some net_prio cgroup, and all sockets of that
    process carry the prio cgroup id of this cgroup. When a packet from
    such a socket goes through the network stack, we choose the packet's
    priority on each device it passes through based on this
    device+id->prio map.
    
    Previously we were able to set the map for each net_prio cgroup on
    the system, but only for devices of the host's init network
    namespace. This patch adds mappings for devices of the ve's init
    netns. A VE can only get/change the device map for its own init
    netns; the host can only get/change the device map for the host's
    init netns.
    
    The same cgroup can have both mappings set up by the host for host
    net devices and mappings set up by a ve for ve net devices.
    
    When a new cgroup is created, it copies only the mappings for host
    network devices if created from the host, and additionally copies
    the mappings for ve network devices if created from a ve.
    
    If the ve is not running (ve_ns is NULL), we operate on the host
    ifpriomap even while inside the ve.
    
    https://jira.sw.ru/browse/PSBM-123766
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 net/core/netprio_cgroup.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 0d2180d..32009ca 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -24,6 +24,7 @@
 #include <net/pkt_cls.h>
 #include <net/sock.h>
 #include <net/netprio_cgroup.h>
+#include <linux/ve.h>
 
 #include <linux/fdtable.h>
 
@@ -144,6 +145,7 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
 static int cgrp_css_online(struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgrp->parent;
+	struct ve_struct *ve;
 	struct net_device *dev;
 	int ret = 0;
 
@@ -162,6 +164,38 @@ static int cgrp_css_online(struct cgroup *cgrp)
 		if (ret)
 			break;
 	}
+
+	/* get_exec_env is safe under cgroup_mutex */
+	ve = get_exec_env();
+	/*
+	 * Inherit prios from the parent cgroup in scope of ve init netns.
+	 */
+	if (!ve_is_super(ve)) {
+		struct nsproxy *ve_ns;
+		struct net *net = NULL;
+
+		/*
+		 * Take the rcu read lock so that ve's net is not freed under
+		 * us while we look it up. After we release the rcu read lock,
+		 * the rtnl lock we still hold ensures net remains non-freed;
+		 * this pairs with the rtnl lock in cleanup_net().
+		 */
+		rcu_read_lock();
+		ve_ns = rcu_dereference(ve->ve_ns);
+		if (ve_ns)
+			net = ve_ns->net_ns;
+		rcu_read_unlock();
+
+		if (net && net != &init_net) {
+			for_each_netdev(net, dev) {
+				u32 prio = netprio_prio(parent, dev);
+
+				ret = netprio_set_prio(cgrp, dev, prio);
+				if (ret)
+					break;
+			}
+		}
+	}
 	rtnl_unlock();
 	return ret;
 }
@@ -179,18 +213,37 @@ static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
 static int read_priomap(struct cgroup *cont, struct cftype *cft,
 			struct cgroup_map_cb *cb)
 {
+	struct ve_struct *ve;
+	struct net *net, *_net = NULL;
 	struct net_device *dev;
 
+	ve = get_curr_ve();
+	if (!ve_is_super(ve)) {
+		struct nsproxy *ve_ns;
+
+		rcu_read_lock();
+		ve_ns = rcu_dereference(ve->ve_ns);
+		if (ve_ns)
+			_net = get_net(ve_ns->net_ns);
+		rcu_read_unlock();
+	}
+	put_ve(ve);
+
+	net = _net ? : &init_net;
 	rcu_read_lock();
-	for_each_netdev_rcu(&init_net, dev)
+	for_each_netdev_rcu(net, dev)
 		cb->fill(cb, dev->name, netprio_prio(cont, dev));
 	rcu_read_unlock();
+	if (_net)
+		put_net(_net);
 	return 0;
 }
 
 static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
 			 const char *buffer)
 {
+	struct ve_struct *ve;
+	struct net *net, *_net = NULL;
 	char devname[IFNAMSIZ + 1];
 	struct net_device *dev;
 	u32 prio;
@@ -199,7 +252,22 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
 	if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
 		return -EINVAL;
 
-	dev = dev_get_by_name(&init_net, devname);
+	ve = get_curr_ve();
+	if (!ve_is_super(ve)) {
+		struct nsproxy *ve_ns;
+
+		rcu_read_lock();
+		ve_ns = rcu_dereference(ve->ve_ns);
+		if (ve_ns)
+			_net = get_net(ve_ns->net_ns);
+		rcu_read_unlock();
+	}
+	put_ve(ve);
+
+	net = _net ? : &init_net;
+	dev = dev_get_by_name(net, devname);
+	if (_net)
+		put_net(_net);
 	if (!dev)
 		return -ENODEV;
 
@@ -241,6 +309,7 @@ static struct cftype ss_files[] = {
 	},
 	{
 		.name = "ifpriomap",
+		.flags = CFTYPE_VE_WRITABLE,
 		.read_map = read_priomap,
 		.write_string = write_priomap,
 	},


More information about the Devel mailing list