[Devel] [PATCH vz9 04/27] ve/sched/stat: Introduce functions to calculate vcpustat data
Nikita Yushchenko
nikita.yushchenko at virtuozzo.com
Wed Oct 6 11:57:23 MSK 2021
From: Konstantin Khorenko <khorenko at virtuozzo.com>
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
+++
sched: Uninline css_tg()
Compilation with a custom config fails:
kernel/ve/ve.c: In function ‘ve_get_cpu_avenrun’:
kernel/ve/ve.c:1679:27: error: inlining failed in call to always_inline ‘css_tg’: function body not available
inline struct task_group *css_tg(struct cgroup_subsys_state *css);
^~~~~~
kernel/ve/ve.c:1690:7: note: called from here
tg = css_tg(css);
^~~~~~~~~~~
We may remove the "inline" attribute, as the compiler is clever enough
to do the inlining itself in kernel/sched/core.c.
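For illustration, a minimal two-file userspace sketch of the failure mode
(file and function names are made up, this is not kernel code). On configs
where the kernel's "inline" expands to __always_inline, the bodyless
declaration in kernel/ve/ve.c behaves like user.c below:

/* defs.c (hypothetical): the only translation unit that has the body */
struct s { int x; };

static inline __attribute__((always_inline)) int get_x(struct s *p)
{
	return p->x;
}

/* user.c (hypothetical): only a bodyless always_inline declaration is
 * visible here, so gcc cannot inline the call and errors out with
 * "inlining failed in call to always_inline" */
struct s { int x; };

inline __attribute__((always_inline)) int get_x(struct s *p);

int use(struct s *p)
{
	return get_x(p);
}

Dropping "inline" from the definition turns css_tg() into an ordinary
external function, so callers outside kernel/sched/core.c can declare it
extern and call it.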
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
Cherry-picked from vz8 commit 0b5495c8980d ("ve/sched/stat: Introduce
functions to calculate vcpustat data").
Ported the CT boot timestamp calculation to time namespaces.
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
kernel/sched/core.c | 2 +-
kernel/sched/cpuacct.c | 379 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 380 insertions(+), 1 deletion(-)
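Reviewer note (not part of the changelog): the per-vcpu accounting added
below folds physical cpu j into vcpu j % nr_vcpus and, in two passes over
the vcpus, caps each vcpu's delta at max_usage while collecting the excess
and handing it out to vcpus that still have room. A standalone userspace
sketch of just that part, with made-up numbers and a single scalar per cpu
instead of the full kernel_cpustat breakdown:

/*
 * Standalone illustration, not kernel code: the pcpu-to-vcpu folding and
 * two-pass excess redistribution done by cpu_cgroup_update_vcpustat().
 */
#include <stdio.h>

#define NR_PCPUS 8
#define NR_VCPUS 2

int main(void)
{
	/* usage delta of each physical cpu since the last update (made up) */
	unsigned long long pcpu_delta[NR_PCPUS] = { 50, 10, 40, 5, 30, 0, 20, 5 };
	unsigned long long vcpu[NR_VCPUS] = { 0, 0 };
	unsigned long long max_usage = 60;	/* cap per vcpu for this interval */
	unsigned long long rem = 0;		/* excess collected on the first pass */
	int first_pass, exceeds, i, j;

	for (first_pass = 1; first_pass >= 0; first_pass--) {
		for (i = 0; i < NR_VCPUS; i++) {
			unsigned long long delta = 0;

			/* vcpu i sums the pcpus j with j % NR_VCPUS == i */
			for (j = i; j < NR_PCPUS; j += NR_VCPUS)
				delta += pcpu_delta[j];

			exceeds = delta >= max_usage;
			/* first pass: only vcpus over the cap, to fill rem;
			 * second pass: the rest, which may consume rem */
			if (exceeds != first_pass)
				continue;

			if (delta > max_usage) {
				rem += delta - max_usage;
				delta = max_usage;
			} else if (rem) {
				unsigned long long take = max_usage - delta;

				if (take > rem)
					take = rem;
				delta += take;
				rem -= take;
			}
			vcpu[i] += delta;
		}
	}

	for (i = 0; i < NR_VCPUS; i++)
		printf("vcpu%d: %llu\n", i, vcpu[i]);
	return 0;
}

With these numbers it prints 60 for both vcpus: part of vcpu0's excess over
the cap tops vcpu1 up from its own 20.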
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 79a6f6808a7c..f1689ac77af1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9718,7 +9718,7 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, tsk, &rf);
}
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+struct task_group *css_tg(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct task_group, css) : NULL;
}
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 893eece65bfd..871b6f8ccb0d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,8 @@
* Based on the work by Paul Menage (menage at google.com) and Balbir Singh
* (balbir at in.ibm.com).
*/
+#include <linux/kernel_stat.h>
+#include <linux/ve.h>
#include <asm/irq_regs.h>
#include "sched.h"
@@ -374,3 +376,380 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
.legacy_cftypes = files,
.early_init = true,
};
+
+extern struct task_group *css_tg(struct cgroup_subsys_state *css);
+
+static struct task_group *ve_root_tg(struct task_group *tg) {
+ struct cgroup *cg;
+
+ if (!tg)
+ return NULL;
+
+ cg = cgroup_get_ve_root1(tg->css.cgroup);
+ return cg ? css_tg(&cg->self) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+ unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+ tg = ve_root_tg(tg);
+ if (tg)
+ cpu_rate = tg->cpu_rate;
+#endif
+ return cpu_rate;
+}
+
+static unsigned int tg_nr_cpus(struct task_group *tg)
+{
+ unsigned int nr_cpus = 0;
+ unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+ tg = ve_root_tg(tg);
+ if (tg)
+ nr_cpus = tg->nr_cpus;
+#endif
+ if (!nr_cpus || nr_cpus > max_nr_cpus)
+ nr_cpus = max_nr_cpus;
+
+ return nr_cpus;
+}
+
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int cpu)
+{
+ return per_cpu_ptr(css_ca(css)->cpustat, cpu);
+}
+
+static void cpu_cgroup_update_stat(struct cgroup_subsys_state *cpu_css,
+ struct cgroup_subsys_state *cpuacct_css,
+ int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+ struct task_group *tg = css_tg(cpu_css);
+ struct sched_entity *se = tg->se[i];
+ u64 *cpustat = cpuacct_cpustat(cpuacct_css, i)->cpustat;
+ u64 now = cpu_clock(i);
+ u64 delta, idle, iowait, steal;
+
+ /* root_task_group has no sched entities */
+ if (tg == &root_task_group)
+ return;
+
+ iowait = se->statistics.iowait_sum;
+ idle = se->statistics.sum_sleep_runtime;
+ steal = se->statistics.wait_sum;
+
+ if (idle > iowait)
+ idle -= iowait;
+ else
+ idle = 0;
+
+ if (se->statistics.sleep_start) {
+ delta = now - se->statistics.sleep_start;
+ if ((s64)delta > 0)
+ idle += delta;
+ } else if (se->statistics.block_start) {
+ delta = now - se->statistics.block_start;
+ if ((s64)delta > 0)
+ iowait += delta;
+ } else if (se->statistics.wait_start) {
+ delta = now - se->statistics.wait_start;
+ if ((s64)delta > 0)
+ steal += delta;
+ }
+
+ cpustat[CPUTIME_IDLE] = max(cpustat[CPUTIME_IDLE], idle);
+ cpustat[CPUTIME_IOWAIT] = max(cpustat[CPUTIME_IOWAIT], iowait);
+ cpustat[CPUTIME_STEAL] = steal;
+#endif
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+ struct kernel_cpustat *rem, int ind,
+ u64 cur_usage, u64 target_usage,
+ u64 rem_usage)
+{
+ s64 scaled_val;
+ u32 scale_pct = 0;
+
+ /* distribute the delta among USER, NICE, and SYSTEM proportionally */
+ if (cur_usage < target_usage) {
+ if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+ scale_pct = div64_u64(100 * rem->cpustat[ind],
+ rem_usage);
+ } else {
+ if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+ scale_pct = div64_u64(100 * cur->cpustat[ind],
+ cur_usage);
+ }
+
+ scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+ cur->cpustat[ind] += scaled_val;
+ if ((s64)cur->cpustat[ind] < 0)
+ cur->cpustat[ind] = 0;
+
+ rem->cpustat[ind] -= scaled_val;
+ if ((s64)rem->cpustat[ind] < 0)
+ rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+ int ind, u64 cur_idle, u64 target_idle)
+{
+ /* distribute target_idle between IDLE and IOWAIT proportionally to
+ * what we initially had on this vcpu */
+ if ((s64)cur_idle > 0) {
+ u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+ cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+ } else {
+ cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
+ }
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+ struct kernel_cpustat *rem,
+ u64 max_usage)
+{
+ u64 cur_usage, target_usage, rem_usage;
+ u64 cur_idle, target_idle;
+
+ cur_usage = kernel_cpustat_total_usage(cur);
+ rem_usage = kernel_cpustat_total_usage(rem);
+
+ target_usage = min(cur_usage + rem_usage,
+ max_usage);
+
+ if (cur_usage != target_usage) {
+ fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+ cur_usage, target_usage, rem_usage);
+ fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+ cur_usage, target_usage, rem_usage);
+ fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+ cur_usage, target_usage, rem_usage);
+ }
+
+ cur_idle = kernel_cpustat_total_idle(cur);
+ target_idle = max_usage - target_usage;
+
+ if (cur_idle != target_idle) {
+ calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+ cur_idle, target_idle);
+ calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+ cur_idle, target_idle);
+ }
+
+ /* do not show steal time inside ve */
+ cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
+static void cpu_cgroup_update_vcpustat(struct cgroup_subsys_state *cpu_css,
+ struct cgroup_subsys_state *cpuacct_css)
+{
+ int i, j;
+ int nr_vcpus;
+ int vcpu_rate;
+ ktime_t now;
+ u64 max_usage;
+ struct kernel_cpustat stat_delta, stat_rem;
+ struct task_group *tg = css_tg(cpu_css);
+ int first_pass = 1;
+
+ spin_lock(&tg->vcpustat_lock);
+
+ now = ktime_get();
+ nr_vcpus = tg_nr_cpus(tg);
+ vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
+ if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+ vcpu_rate = MAX_CPU_RATE;
+
+ if (!ktime_to_ns(tg->vcpustat_last_update)) {
+ /* on the first read initialize vcpu i stat as a sum of stats
+ * over pcpus j such that j % nr_vcpus == i */
+ for (i = 0; i < nr_vcpus; i++) {
+ for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+ if (!cpu_possible(j))
+ continue;
+ kernel_cpustat_add(tg->vcpustat + i,
+ cpuacct_cpustat(cpuacct_css, j),
+ tg->vcpustat + i);
+ }
+ }
+ goto out_update_last;
+ }
+
+ max_usage = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+ max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+ /* don't update stats too often, to avoid calculation errors */
+ if (max_usage < 10)
+ goto out_unlock;
+
+ /* temporarily copy per cpu usage delta to tg->cpustat_last */
+ for_each_possible_cpu(i)
+ kernel_cpustat_sub(cpuacct_cpustat(cpuacct_css, i),
+ tg->cpustat_last + i,
+ tg->cpustat_last + i);
+
+ /* proceed to calculating per vcpu delta */
+ kernel_cpustat_zero(&stat_rem);
+
+again:
+ for (i = 0; i < nr_vcpus; i++) {
+ int exceeds_max;
+
+ kernel_cpustat_zero(&stat_delta);
+ for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+ if (!cpu_possible(j))
+ continue;
+ kernel_cpustat_add(&stat_delta,
+ tg->cpustat_last + j, &stat_delta);
+ }
+
+ exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+ max_usage;
+ /*
+ * On the first pass calculate delta for vcpus with usage >
+ * max_usage in order to accumulate excess in stat_rem.
+ *
+ * Once the remainder is accumulated, proceed to the rest of
+ * vcpus so that it will be distributed among them.
+ */
+ if (exceeds_max != first_pass)
+ continue;
+
+ fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+ kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+ tg->vcpustat + i);
+ }
+
+ if (first_pass) {
+ first_pass = 0;
+ goto again;
+ }
+out_update_last:
+ for_each_possible_cpu(i)
+ tg->cpustat_last[i] = *cpuacct_cpustat(cpuacct_css, i);
+ tg->vcpustat_last_update = now;
+out_unlock:
+ spin_unlock(&tg->vcpustat_lock);
+}
+
+int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
+ struct cgroup_subsys_state *cpuacct_css,
+ struct seq_file *p)
+{
+ int i;
+ u64 user, nice, system, idle, iowait, steal;
+ struct time_namespace *time_ns;
+ struct timespec64 boottime;
+ struct task_group *tg = css_tg(cpu_css);
+ bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
+ int nr_vcpus = tg_nr_cpus(tg);
+ struct kernel_cpustat *kcpustat;
+ unsigned long tg_nr_running = 0;
+ unsigned long tg_nr_iowait = 0;
+
+ time_ns = ve_get_time_ns(get_exec_env());
+ if (time_ns) {
+ getboottime64(&boottime);
+ /* time_ns->offsets.boottime is (ve_uptime - host_uptime), i.e.
+ * negative for a ve created on this host. Subtract it from the
+ * timestamp of the host's boot to get the timestamp of the ve's
+ * boot. */
+ boottime = timespec64_sub(boottime, time_ns->offsets.boottime);
+ put_time_ns(time_ns);
+ } else {
+ /* for a not-yet-started ve, use the current time as the timestamp
+ * of the ve's boot */
+ ktime_get_real_ts64(&boottime);
+ }
+
+ for_each_possible_cpu(i) {
+ cpu_cgroup_update_stat(cpu_css, cpuacct_css, i);
+
+ /* root task group has autogrouping, so this doesn't hold */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ tg_nr_running += tg->cfs_rq[i]->h_nr_running;
+ tg_nr_iowait += tg->cfs_rq[i]->nr_iowait;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ tg_nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+ }
+
+ if (virt)
+ cpu_cgroup_update_vcpustat(cpu_css, cpuacct_css);
+
+ user = nice = system = idle = iowait = steal = 0;
+
+ for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+ if (!virt && !cpu_possible(i))
+ continue;
+
+ kcpustat = virt ? tg->vcpustat + i :
+ cpuacct_cpustat(cpuacct_css, i);
+
+ user += kcpustat->cpustat[CPUTIME_USER];
+ nice += kcpustat->cpustat[CPUTIME_NICE];
+ system += kcpustat->cpustat[CPUTIME_SYSTEM];
+ idle += kcpustat->cpustat[CPUTIME_IDLE];
+ iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
+ steal += kcpustat->cpustat[CPUTIME_STEAL];
+ }
+ /* Don't scare CT users with high steal time */
+ if (!ve_is_super(get_exec_env()))
+ steal = 0;
+
+ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 %llu\n",
+ (unsigned long long)nsec_to_clock_t(user),
+ (unsigned long long)nsec_to_clock_t(nice),
+ (unsigned long long)nsec_to_clock_t(system),
+ (unsigned long long)nsec_to_clock_t(idle),
+ (unsigned long long)nsec_to_clock_t(iowait),
+ virt ? 0ULL :
+ (unsigned long long)nsec_to_clock_t(steal));
+
+ for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+ if (!virt && !cpu_online(i))
+ continue;
+ kcpustat = virt ? tg->vcpustat + i :
+ cpuacct_cpustat(cpuacct_css, i);
+
+ user = kcpustat->cpustat[CPUTIME_USER];
+ nice = kcpustat->cpustat[CPUTIME_NICE];
+ system = kcpustat->cpustat[CPUTIME_SYSTEM];
+ idle = kcpustat->cpustat[CPUTIME_IDLE];
+ iowait = kcpustat->cpustat[CPUTIME_IOWAIT];
+ steal = kcpustat->cpustat[CPUTIME_STEAL];
+ /* Don't scare CT users with high steal time */
+ if (!ve_is_super(get_exec_env()))
+ steal = 0;
+
+ seq_printf(p,
+ "cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
+ i,
+ (unsigned long long)nsec_to_clock_t(user),
+ (unsigned long long)nsec_to_clock_t(nice),
+ (unsigned long long)nsec_to_clock_t(system),
+ (unsigned long long)nsec_to_clock_t(idle),
+ (unsigned long long)nsec_to_clock_t(iowait),
+ virt ? 0ULL :
+ (unsigned long long)nsec_to_clock_t(steal));
+ }
+ seq_printf(p, "intr 0");
+
+ seq_printf(p,
+ "\nctxt %llu\n"
+ "btime %llu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+ "procs_blocked %lu\n",
+ nr_context_switches(),
+ (unsigned long long)boottime.tv_sec,
+ total_forks,
+ tg_nr_running,
+ tg_nr_iowait);
+
+ return 0;
+}
--
2.30.2