[Devel] [PATCH 5/5] expose per-taskgroup schedstats in cgroup
Glauber Costa
glommer at parallels.com
Thu Feb 2 06:19:32 PST 2012
This patch aims at exposing stat information per-cgroup, such as:
* idle time,
* iowait time,
* steal time,
and friends. The ultimate goal is to be able to present a per-container view of
/proc/stat inside a container. With this patch, everything needed to do that
is in place, except for the number of context switches and the number of
tasks.
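For illustration, reading the new per-cgroup file added by this patch gives
one line per online cpu, with the fields idle, iowait, steal, nr_iowait,
nr_switches and nr_running (mount point, group name and values below are
made up):

	# cat /sys/fs/cgroup/cpu/mygroup/cpu.schedstat_percpu
	cpu0 4241 13 0 0 0 2
	cpu1 4250 2 0 0 0 1

nr_switches is always reported as 0 for now (see the TODO in
cpu_schedstats_show()).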
I achieve that by hooking into the schedstats framework: although the overhead
of schedstats is open to discussion, I am not adding anything new, but reusing
what is already there. The exception is that the data is now computed and
stored for non-task se's as well, instead of being skipped by the
entity_is_task() branches. However, I expect this cost to be minimal compared
to the alternative of adding new hierarchy walks; the existing walks are kept
intact.
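To make the connection with the code explicit, the idle figure for a non-root
group is essentially the group se's accumulated sleep time minus any rt
runtime on that cpu, converted to clock_t. This is just a condensed
restatement of tg_idle() below, not additional code:

	/* idle for a non-root group, per cpu, in clock_t */
	idle = cputime_to_clock_t(nsec_to_tick(cfs_read_sleep(tg->se[cpu]) -
					       tg->rt_rq[cpu]->exec_clock));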
Note that in this first version I am using clock_t units, which is quite
proc-centric. That made my testing easier, but I am happy to switch to
whatever units you guys would prefer.
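For reference, assuming the usual HZ=1000 / USER_HZ=100 configuration, and
assuming nsec_to_tick() (from earlier in this series) simply converts
nanoseconds to scheduler ticks, the conversion chain used by the new helpers
works out as:

	2,500,000,000 ns -> nsec_to_tick -> 2500 ticks
	                 -> cputime_to_clock_t -> 250 clock_t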
Signed-off-by: Glauber Costa <glommer at parallels.com>
---
kernel/sched/core.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 45 ++++++++++++++++++++
kernel/sched/sched.h | 3 +
3 files changed, 162 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 013ca9c..fc2b9ed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7975,6 +7975,107 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SCHEDSTATS
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#define fair_rq(field, tg, i) tg->cfs_rq[i]->field
+#define fair_se(field, tg, i) tg->se[i]->statistics.field
+#else
+#define fair_rq(field, tg, i) 0
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+#define rt_rq(field, tg, i) tg->rt_rq[i]->field
+#else
+#define rt_rq(field, tg, i) 0
+#endif
+
+static u64 tg_nr_running(struct task_group *tg, int cpu)
+{
+ /*
+ * because of autogrouped groups in root_task_group, the
+ * following does not hold.
+ */
+ if (tg != &root_task_group)
+ return rt_rq(rt_nr_running, tg, cpu) + fair_rq(nr_running, tg, cpu);
+
+ return cpu_rq(cpu)->nr_running;
+}
+
+static u64 tg_idle(struct task_group *tg, int cpu)
+{
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
+ u64 val;
+
+ if (tg != &root_task_group) {
+ val = cfs_read_sleep(tg->se[cpu]);
+ /* If we have rt tasks running, we're not really idle */
+ val -= rt_rq(exec_clock, tg, cpu);
+ val = nsec_to_tick(val);
+ } else
+ val = cpustat[CPUTIME_IDLE];
+
+ return cputime_to_clock_t(val);
+}
+
+static u64 tg_steal(struct task_group *tg, int cpu)
+{
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
+ u64 val = cpustat[CPUTIME_STEAL];
+
+ if (tg != &root_task_group)
+ val = nsec_to_tick(cfs_read_wait(tg->se[cpu]));
+ else
+ val = cpustat[CPUTIME_STEAL];
+
+ return cputime_to_clock_t(val);
+}
+
+static u64 tg_nr_iowait(struct task_group *tg, int cpu)
+{
+ if (tg != &root_task_group)
+ return atomic_read(&tg->se[cpu]->nr_iowait);
+
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
+}
+
+static u64 tg_iowait(struct task_group *tg, int cpu)
+{
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
+ u64 val = 0;
+
+ if (tg != &root_task_group)
+ val = nsec_to_tick(cfs_read_iowait(tg->se[cpu]));
+ else
+ val = cpustat[CPUTIME_IOWAIT];
+
+ return cputime_to_clock_t(val);
+}
+
+static int cpu_schedstats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ int cpu;
+ /*
+ * TODO: nr_switches is one of the statistics we're interested in, but
+ * it is potentially too heavy on the scheduler.
+ */
+ u64 nr_switches = 0;
+
+ for_each_online_cpu(cpu) {
+ seq_printf(m,
+ "cpu%d %llu %llu %llu %llu %llu %llu\n",
+ cpu, tg_idle(tg, cpu), tg_iowait(tg, cpu), tg_steal(tg, cpu),
+ tg_nr_iowait(tg, cpu), nr_switches,
+ tg_nr_running(tg, cpu)
+ );
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SCHEDSTATS */
+
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -7982,6 +8083,19 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+/*
+ * In theory, those could be done using the rt tasks as a basis
+ * as well. Since we're interested in figures like idle, iowait, etc
+ * for the whole cgroup, the results should be the same.
+ * But that only complicates the code, and I doubt anyone using !FAIR_GROUP_SCHED
+ * is terribly interested in those.
+ */
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .name = "schedstat_percpu",
+ .read_seq_string = cpu_schedstats_show,
+ },
+#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e301ba4..5305bb1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -721,6 +721,41 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
+#ifdef CONFIG_SCHEDSTATS
+u64 cfs_read_sleep(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ u64 value = se->statistics.sum_sleep_runtime;
+
+ if (!se->statistics.sleep_start)
+ return value;
+
+ return value + rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+}
+
+u64 cfs_read_iowait(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ u64 value = se->statistics.iowait_sum;
+
+ if (!se->statistics.block_start)
+ return value;
+
+ return value + rq_of(cfs_rq)->clock - se->statistics.block_start;
+}
+
+u64 cfs_read_wait(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ u64 value = se->statistics.wait_sum;
+
+ if (!se->statistics.wait_start)
+ return value;
+
+ return value + rq_of(cfs_rq)->clock - se->statistics.wait_start;
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
@@ -1046,6 +1081,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
+ else if (atomic_read(&se->nr_iowait)) {
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
+ }
}
#endif
}
@@ -1199,6 +1238,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_of(cfs_rq)->clock;
+ } else {
+ if (atomic_read(&se->nr_iowait))
+ se->statistics.block_start = rq_of(cfs_rq)->clock;
+ else
+ se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+
}
#endif
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 53d13dd..7ec2482 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1156,6 +1156,9 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
extern void unthrottle_offline_cfs_rqs(struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+extern u64 cfs_read_sleep(struct sched_entity *se);
+extern u64 cfs_read_iowait(struct sched_entity *se);
+extern u64 cfs_read_wait(struct sched_entity *se);
#ifdef CONFIG_NO_HZ
enum rq_nohz_flag_bits {
--
1.7.7.4