[Devel] [PATCH RHEL7 COMMIT] sched: Revert "SCHED: rework cputime accounting (v2)"

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jun 4 05:58:28 PDT 2015


The commit has been pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.9
------>
commit ed523ec92064c7b792fcbfea3a01cf9f1e80dd63
Author: Vladimir Davydov <vdavydov at parallels.com>
Date:   Thu Jun 4 16:58:28 2015 +0400

    sched: Revert "SCHED: rework cputime accounting (v2)"
    
    This reverts commit 6071473d0440fcfd128f3243dfb82d19f6aef668.
    
    The above-mentioned commit dramatically complicates porting of the CPU
    accounting patches from RH6, so revert it. The next patch will fix CPU
    accounting once again.
    
    Related to https://jira.sw.ru/browse/PSBM-33642
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Conflicts:
    	kernel/sched/core.c
---
 drivers/iommu/amd_iommu.c   |  4 +-
 include/linux/fairsched.h   |  3 +-
 include/linux/kernel_stat.h |  1 +
 include/linux/sched.h       |  3 --
 kernel/sched/core.c         | 92 +++++++++++++++++++++++++--------------------
 kernel/sched/cputime.c      | 15 --------
 kernel/sched/sched.h        |  1 +
 7 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index c1eefe2..6dc6594 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4227,7 +4227,7 @@ static int set_affinity(struct irq_data *data, const struct cpumask *mask,
 	return 0;
 }
 
-static int amd_iommu_free_irq(int irq)
+static int free_irq(int irq)
 {
 	struct irq_2_irte *irte_info;
 	struct irq_cfg *cfg;
@@ -4352,7 +4352,7 @@ struct irq_remap_ops amd_iommu_irq_ops = {
 	.enable_faulting	= amd_iommu_enable_faulting,
 	.setup_ioapic_entry	= setup_ioapic_entry,
 	.set_affinity		= set_affinity,
-	.free_irq		= amd_iommu_free_irq,
+	.free_irq		= free_irq,
 	.compose_msi_msg	= compose_msi_msg,
 	.msi_alloc_irq		= msi_alloc_irq,
 	.msi_setup_irq		= msi_setup_irq,
diff --git a/include/linux/fairsched.h b/include/linux/fairsched.h
index 12bbc5b..e242c0d 100644
--- a/include/linux/fairsched.h
+++ b/include/linux/fairsched.h
@@ -18,6 +18,8 @@
 
 #ifdef __KERNEL__
 
+struct kernel_cpustat;
+
 #ifdef CONFIG_VZ_FAIRSCHED
 
 #define FSCHWEIGHT_MAX		((1 << 16) - 1)
@@ -79,7 +81,6 @@ static inline int fairsched_get_cpu_stat(const char *name, struct kernel_cpustat
 
 #endif /* CONFIG_VZ_FAIRSCHED */
 
-struct kernel_cpustat;
 void cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat);
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a63a497..d105ab3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -6,6 +6,7 @@
 #include <linux/percpu.h>
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
 #include <linux/vtime.h>
 #include <asm/irq.h>
 #include <asm/cputime.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e62dc2b..f4a5e3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -53,7 +53,6 @@ struct sched_param {
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
 #include <linux/ve_proto.h>
-#include <linux/kernel_stat.h>
 
 #include <asm/processor.h>
 
@@ -976,8 +975,6 @@ struct sched_avg {
 
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
-	u64			cpustat[NR_STATS];
-
 	u64			wait_start;
 	u64			wait_max;
 	u64			wait_count;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0e8c921..50273af 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7394,6 +7394,7 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
+	root_task_group.cpustat = alloc_percpu(struct kernel_cpustat);
 	root_task_group.taskstats = alloc_percpu(struct taskstats);
 
 #ifdef CONFIG_SMP
@@ -7694,6 +7695,7 @@ static void free_sched_group(struct task_group *tg)
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
+	free_percpu(tg->cpustat);
 	free_percpu(tg->taskstats);
 	kfree(tg);
 }
@@ -7713,6 +7715,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	tg->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!tg->cpustat)
+		goto err;
+
 	tg->taskstats = alloc_percpu(struct taskstats);
 	if (!tg->taskstats)
 		goto err;
@@ -8661,16 +8667,34 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static void __task_group_get_cpu_stat(struct task_group *tg, int cpu,
-				      struct kernel_cpustat *kcpustat)
+static u64 cpu_cgroup_usage_cpu(struct task_group *tg, int i)
+{
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SCHEDSTATS)
+	/* root_task_group has not sched entities */
+	if (tg == &root_task_group)
+		return cpu_rq(i)->rq_cpu_time;
+
+	return tg->se[i]->sum_exec_runtime;
+#else
+	return 0;
+#endif
+}
+
+static void cpu_cgroup_update_stat(struct task_group *tg, int i)
 {
 #if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
-	struct sched_entity *se = tg->se[cpu];
-	u64 now = cpu_clock(cpu);
+	struct sched_entity *se = tg->se[i];
+	struct kernel_cpustat *kcpustat = per_cpu_ptr(tg->cpustat, i);
+	u64 now = cpu_clock(i);
 	u64 delta, idle, iowait;
 
+	/* root_task_group has not sched entities */
+	if (tg == &root_task_group)
+		return;
+
 	iowait = se->statistics.iowait_sum;
 	idle = se->statistics.sum_sleep_runtime;
+	kcpustat->cpustat[CPUTIME_STEAL] = se->statistics.wait_sum;
 
 	if (idle > iowait)
 		idle -= iowait;
@@ -8691,28 +8715,13 @@ static void __task_group_get_cpu_stat(struct task_group *tg, int cpu,
 			kcpustat->cpustat[CPUTIME_STEAL] += delta;
 	}
 
-	kcpustat->cpustat[CPUTIME_USER] = se->statistics.cpustat[CPUTIME_USER];
-	kcpustat->cpustat[CPUTIME_NICE] = se->statistics.cpustat[CPUTIME_NICE];
-	kcpustat->cpustat[CPUTIME_SYSTEM] =
-		se->statistics.cpustat[CPUTIME_SYSTEM];
 	kcpustat->cpustat[CPUTIME_IDLE] =
 		max(kcpustat->cpustat[CPUTIME_IDLE], idle);
 	kcpustat->cpustat[CPUTIME_IOWAIT] =
 		max(kcpustat->cpustat[CPUTIME_IOWAIT], iowait);
-	kcpustat->cpustat[CPUTIME_STEAL] = se->statistics.wait_sum;
-	kcpustat->cpustat[CPUTIME_USED] = se->sum_exec_runtime;
-#endif
-}
 
-static void task_group_get_cpu_stat(struct task_group *tg, int cpu,
-				    struct kernel_cpustat *kcpustat)
-{
-	if (tg == &root_task_group) {
-		memcpy(kcpustat, &kcpustat_cpu(cpu), sizeof(*kcpustat));
-		return;
-	}
-	memset(kcpustat, 0, sizeof(*kcpustat));
-	__task_group_get_cpu_stat(tg, cpu, kcpustat);
+	kcpustat->cpustat[CPUTIME_USED] = cpu_cgroup_usage_cpu(tg, i);
+#endif
 }
 
 int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
@@ -8724,7 +8733,7 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
 	u64 user, nice, system, idle, iowait, steal;
 	struct timespec boottime;
 	struct task_group *tg = cgroup_tg(cgrp);
-	struct kernel_cpustat st;
+	struct kernel_cpustat *kcpustat;
 	unsigned long tg_nr_running = 0;
 	unsigned long tg_nr_iowait = 0;
 	unsigned long long tg_nr_switches = 0;
@@ -8736,14 +8745,16 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
 	user = nice = system = idle = iowait = steal = 0;
 
 	for_each_possible_cpu(i) {
-		task_group_get_cpu_stat(tg, i, &st);
+		kcpustat = per_cpu_ptr(tg->cpustat, i);
 
-		user	+= st.cpustat[CPUTIME_USER];
-		nice	+= st.cpustat[CPUTIME_NICE];
-		system	+= st.cpustat[CPUTIME_SYSTEM];
-		idle	+= st.cpustat[CPUTIME_IDLE];
-		iowait	+= st.cpustat[CPUTIME_IOWAIT];
-		steal	+= st.cpustat[CPUTIME_STEAL];
+		cpu_cgroup_update_stat(tg, i);
+
+		user += kcpustat->cpustat[CPUTIME_USER];
+		nice += kcpustat->cpustat[CPUTIME_NICE];
+		system += kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle += kcpustat->cpustat[CPUTIME_IDLE];
+		iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal += kcpustat->cpustat[CPUTIME_STEAL];
 
 		/* root task group has autogrouping, so this doesn't hold */
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8770,15 +8781,14 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
 		for_each_online_cpu(j) {
 			if (j % nr_ve_vcpus != i)
 				continue;
-
-			task_group_get_cpu_stat(tg, i, &st);
-
-			user	+= st.cpustat[CPUTIME_USER];
-			nice	+= st.cpustat[CPUTIME_NICE];
-			system	+= st.cpustat[CPUTIME_SYSTEM];
-			idle	+= st.cpustat[CPUTIME_IDLE];
-			iowait	+= st.cpustat[CPUTIME_IOWAIT];
-			steal	+= st.cpustat[CPUTIME_STEAL];
+			kcpustat = per_cpu_ptr(tg->cpustat, j);
+
+			user += kcpustat->cpustat[CPUTIME_USER];
+			nice += kcpustat->cpustat[CPUTIME_NICE];
+			system += kcpustat->cpustat[CPUTIME_SYSTEM];
+			idle += kcpustat->cpustat[CPUTIME_IDLE];
+			iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
+			steal += kcpustat->cpustat[CPUTIME_STEAL];
 		}
 		seq_printf(p,
 			"cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
@@ -8845,12 +8855,12 @@ void cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat)
 	memset(kstat, 0, sizeof(struct kernel_cpustat));
 
 	for_each_possible_cpu(i) {
-		struct kernel_cpustat st;
+		struct kernel_cpustat *st = per_cpu_ptr(tg->cpustat, i);
 
-		task_group_get_cpu_stat(tg, i, &st);
+		cpu_cgroup_update_stat(tg, i);
 
 		for (j = 0; j < NR_STATS; j++)
-			kstat->cpustat[j] += st.cpustat[j];
+			kstat->cpustat[j] += st->cpustat[j];
 	}
 }
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3ad9fa8..ba4bfc0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -112,19 +112,6 @@ static int irqtime_account_si_update(void)
 
 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 
-static inline void update_stats_account_cpu_time(struct task_struct *p,
-						 int index, u64 tmp)
-{
-#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
-	struct sched_entity *se = &p->se;
-
-	do {
-		se->statistics.cpustat[index] += tmp;
-		se = se->parent;
-	} while (se);
-#endif
-}
-
 static inline void task_group_account_field(struct task_struct *p, int index,
 					    u64 tmp)
 {
@@ -136,8 +123,6 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 */
 	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
 
-	update_stats_account_cpu_time(p, index, tmp);
-
 	cpuacct_account_field(p, index, tmp);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e4f92a5..e0c03d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -167,6 +167,7 @@ struct task_group {
 	struct autogroup *autogroup;
 #endif
 
+	struct kernel_cpustat __percpu *cpustat;
 	struct taskstats __percpu *taskstats;
 	unsigned long avenrun[3];	/* loadavg data */
 	struct timespec start_time;



More information about the Devel mailing list