[Devel] [PATCH rh7] Port diff-fairsched-cpuset-add-fake-cpuset-for-containers
Vladimir Davydov
vdavydov at parallels.com
Sun May 31 10:03:15 PDT 2015
Author: Pavel Tikhomirov
Email: ptikhomirov at parallels.com
Subject: cpuset: add fake cpuset for containers
Date: Tue, 27 Jan 2015 15:40:12 +0300
If a container wants to read or write the cpumask or nodemask of an
in-container cpuset cgroup through cgroupfs, fake it: add special ve_*
fields to struct cpuset and operate on those instead. Since the change
is only fake, we do not want to validate it, so any value is accepted.
Access to flags, relax_domain_level and mem_migration_pending from a
container is not allowed.
Needed for the docker integration-cli test TestRunWithCpuset.
https://jira.sw.ru/browse/PSBM-30878
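The pattern, condensed from the hunks below (the nodemask side is
analogous; the remaining cpuset files simply return -EACCES on write
and 0 on read for a CT):

	/* write side (__update_cpumask): remember what the container asked
	 * for in the shadow mask and report success, without validating it
	 * and without touching the real cpuset state */
	if (!ve_is_super(get_exec_env())) {
		cpumask_copy(cs->ve_cpus_allowed, cpus_allowed);
		return 0;
	}

	/* read side (cpuset_sprintf_cpulist): show the shadow mask back to
	 * the container */
	if (!ve_is_super(get_exec_env()))
		return cpulist_scnprintf(page, PAGE_SIZE, cs->ve_cpus_allowed);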
v2: add faking for mems, cpus_allowed and mems_allowed; simplify the
checks in update_cpumask/update_nodemask; avoid excessive code in
alloc_trial_cpuset; rename the masks
v3: do not take callback_mutex when printing ve_cpus_allowed; do not
permit r/w access to cpuset_cpus_allowed and cpuset_mems_allowed;
add ve_flags and ve_relax_domain_level
v4: leave only ve_cpus/mems_allowed; the other files are not faked,
access to them from a CT is blocked
v5: clean up the code
Signed-off-by: Pavel Tikhomirov <ptikhomirov at parallels.com>
Reviewed-by: Vladimir Davydov <vdavydov at parallels.com>
=============================================================================
Related to https://jira.sw.ru/browse/PSBM-33642
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
kernel/cpuset.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 55 insertions(+)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3c4355e07dc4..ef08c194288b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,6 +87,9 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+ cpumask_var_t ve_cpus_allowed;
+ nodemask_t ve_mems_allowed;
+
struct fmeter fmeter; /* memory_pressure filter */
/*
@@ -866,6 +869,15 @@ static int __update_cpumask(struct cpuset *cs,
if (cs == &top_cpuset)
return -EACCES;
+ /*
+ * If we are in a CT, use the fake cpu mask:
+ * it can be set and read, but has no effect.
+ */
+ if (!ve_is_super(get_exec_env())) {
+ cpumask_copy(cs->ve_cpus_allowed, cpus_allowed);
+ return 0;
+ }
+
if (!cpumask_subset(cpus_allowed, cpu_active_mask))
return -EINVAL;
@@ -1127,6 +1139,16 @@ static int __update_nodemask(struct cpuset *cs,
goto done;
}
+ /*
+ * If we are in a CT, use the fake node mask:
+ * it can be set and read, but has no effect.
+ */
+ if (!ve_is_super(get_exec_env())) {
+ cs->ve_mems_allowed = *mems_allowed;
+ retval = 0;
+ goto done;
+ }
+
if (!nodes_subset(*mems_allowed, node_states[N_MEMORY])) {
retval = -EINVAL;
goto done;
@@ -1563,6 +1585,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
cpuset_filetype_t type = cft->private;
int retval = 0;
+ if (!ve_is_super(get_exec_env()))
+ return -EACCES;
+
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1612,6 +1637,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
+ if (!ve_is_super(get_exec_env()))
+ return -EACCES;
+
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1693,6 +1721,9 @@ static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
size_t count;
+ if (!ve_is_super(get_exec_env()))
+ return cpulist_scnprintf(page, PAGE_SIZE, cs->ve_cpus_allowed);
+
mutex_lock(&callback_mutex);
count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
@@ -1704,6 +1735,9 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
size_t count;
+ if (!ve_is_super(get_exec_env()))
+ return nodelist_scnprintf(page, PAGE_SIZE, cs->ve_mems_allowed);
+
mutex_lock(&callback_mutex);
count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
@@ -1751,6 +1785,10 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
struct cpuset *cs = cgroup_cs(cont);
cpuset_filetype_t type = cft->private;
+
+ if (!ve_is_super(get_exec_env()))
+ return 0;
+
switch (type) {
case FILE_CPU_EXCLUSIVE:
return is_cpu_exclusive(cs);
@@ -1782,6 +1820,10 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
{
struct cpuset *cs = cgroup_cs(cont);
cpuset_filetype_t type = cft->private;
+
+ if (!ve_is_super(get_exec_env()))
+ return 0;
+
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
return cs->relax_domain_level;
@@ -1909,10 +1951,17 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
kfree(cs);
return ERR_PTR(-ENOMEM);
}
+ if (!alloc_cpumask_var(&cs->ve_cpus_allowed, GFP_KERNEL)) {
+ free_cpumask_var(cs->cpus_allowed);
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+ }
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->ve_cpus_allowed);
+ nodes_clear(cs->ve_mems_allowed);
fmeter_init(&cs->fmeter);
INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
@@ -2000,6 +2049,7 @@ static void cpuset_css_free(struct cgroup *cont)
struct cpuset *cs = cgroup_cs(cont);
free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->ve_cpus_allowed);
kfree(cs);
}
@@ -2029,10 +2079,15 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.ve_cpus_allowed, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_clear(top_cpuset.ve_cpus_allowed);
+ nodes_clear(top_cpuset.ve_mems_allowed);
+
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
--
2.1.4