[Devel] [PATCH RHEL COMMIT] cgroup/net_prio: virtualize ifpriomap per-ve
Konstantin Khorenko
khorenko at virtuozzo.com
Tue Oct 12 16:45:10 MSK 2021
The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 1c440266bd826dba9dc46d2cee486aed16a5233a
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date: Tue Oct 12 16:45:10 2021 +0300
cgroup/net_prio: virtualize ifpriomap per-ve
Ifpriomap is a map of net_prio cgroup id to device prio. Each process is
in some netprio cgroup and all sockets of this process have the prio
cgroup id of this cgroup. When a packet from such a socket goes through
the network stack, we choose the priority for the packet on each device
it goes through based on this device+id->prio map.
Previously we were able to set the map for each net_prio cgroup on the
system, but only for devices of the host init network namespace. This
patch adds mapping for ve init netns devices. A VE can only get/change
the device map for its ve init netns; the host can only get/change the
device map for the host's init netns.
For the same cgroup we can have both mappings set up by the host for host
net devices and mappings set up by a ve for ve net devices.
When a new cgroup is created, it either copies only the mappings for host
network devices if created from the host, or also copies the mappings for
ve network devices if created from a ve.
If the ve is not running (ve_ns is NULL), we operate on the host ifpriomap
even when called from inside the ve.
https://jira.sw.ru/browse/PSBM-123766
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
====================
cgroup: ifpriomap virtualization
I've also added a get_curr_ve() helper, as it looks like in many places we
rely on get_exec_env() giving us a ve which will not be freed under us, but
all processes can easily be moved out of this ve in parallel and the ve can
then be freed, AFAICS.
https://jira.sw.ru/browse/PSBM-123766
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
net/core/netprio_cgroup.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 71 insertions(+), 2 deletions(-)
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..0ab8c37c42b8 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -17,6 +17,7 @@
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <linux/sched/task.h>
+#include <linux/ve.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
@@ -145,6 +146,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
struct cgroup_subsys_state *parent_css = css->parent;
+ struct ve_struct *ve;
struct net_device *dev;
int ret = 0;
@@ -166,6 +168,38 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
if (ret)
break;
}
+
+ /* get_exec_env is safe under cgroup_mutex */
+ ve = get_exec_env();
+ /*
+ * Inherit prios from the parent cgroup in scope of ve init netns.
+ */
+ if (!ve_is_super(ve)) {
+ struct nsproxy *ve_ns;
+ struct net *net = NULL;
+
+ /*
+ * Take rcu read lock to check that ve's net is not freed under
+ * us. After we release rcu read lock we still have rtnl lock to
+ * ensure net remains non-freed; this pairs with rtnl lock in
+ * cleanup_net().
+ */
+ rcu_read_lock();
+ ve_ns = rcu_dereference(ve->ve_ns);
+ if (ve_ns)
+ net = ve_ns->net_ns;
+ rcu_read_unlock();
+
+ if (net && net != &init_net) {
+ for_each_netdev(net, dev) {
+ u32 prio = netprio_prio(parent_css, dev);
+
+ ret = netprio_set_prio(css, dev, prio);
+ if (ret)
+ break;
+ }
+ }
+ }
rtnl_unlock();
return ret;
}
@@ -182,19 +216,38 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
static int read_priomap(struct seq_file *sf, void *v)
{
+ struct ve_struct *ve;
+ struct net *net, *_net = NULL;
struct net_device *dev;
+ ve = get_curr_ve();
+ if (!ve_is_super(ve)) {
+ struct nsproxy *ve_ns;
+
+ rcu_read_lock();
+ ve_ns = rcu_dereference(ve->ve_ns);
+ if (ve_ns)
+ _net = get_net(ve_ns->net_ns);
+ rcu_read_unlock();
+ }
+ put_ve(ve);
+
+ net = _net ? : &init_net;
rcu_read_lock();
- for_each_netdev_rcu(&init_net, dev)
+ for_each_netdev_rcu(net, dev)
seq_printf(sf, "%s %u\n", dev->name,
netprio_prio(seq_css(sf), dev));
rcu_read_unlock();
+ if (_net)
+ put_net(_net);
return 0;
}
static ssize_t write_priomap(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
+ struct ve_struct *ve;
+ struct net *net, *_net = NULL;
char devname[IFNAMSIZ + 1];
struct net_device *dev;
u32 prio;
@@ -203,7 +256,22 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
return -EINVAL;
- dev = dev_get_by_name(&init_net, devname);
+ ve = get_curr_ve();
+ if (!ve_is_super(ve)) {
+ struct nsproxy *ve_ns;
+
+ rcu_read_lock();
+ ve_ns = rcu_dereference(ve->ve_ns);
+ if (ve_ns)
+ _net = get_net(ve_ns->net_ns);
+ rcu_read_unlock();
+ }
+ put_ve(ve);
+
+ net = _net ? : &init_net;
+ dev = dev_get_by_name(net, devname);
+ if (_net)
+ put_net(_net);
if (!dev)
return -ENODEV;
@@ -253,6 +321,7 @@ static struct cftype ss_files[] = {
},
{
.name = "ifpriomap",
+ .flags = CFTYPE_VE_WRITABLE,
.seq_show = read_priomap,
.write = write_priomap,
},
More information about the Devel
mailing list