[Devel] [PATCH vz9 04/27] ve/sched/stat: Introduce functions to calculate vcpustat data
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Wed Oct 6 12:15:23 MSK 2021
On 06.10.2021 11:57, Nikita Yushchenko wrote:
> From: Konstantin Khorenko <khorenko at virtuozzo.com>
>
> Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
> Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
>
> +++
> sched: Uninline css_tg()
>
> Compilation with a custom config fails:
>
> kernel/ve/ve.c: In function ‘ve_get_cpu_avenrun’:
> kernel/ve/ve.c:1679:27: error: inlining failed in call to always_inline ‘css_tg’: function body not available
> inline struct task_group *css_tg(struct cgroup_subsys_state *css);
> ^~~~~~
> kernel/ve/ve.c:1690:7: note: called from here
> tg = css_tg(css);
> ^~~~~~~~~~~
>
> We may remove "inline" attribute, as compiler is clever enough
> to make itself inlining in kernel/sched/sched.c.
>
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> Reviewed-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
>
> Cherry-picked from vz8 commit 0b5495c8980d ("ve/sched/stat: Introduce
> functions to calculate vcpustat data").
>
> Ported code that calculates CT boot timestamp to time namespaces.
>
Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
> ---
> kernel/sched/core.c | 2 +-
> kernel/sched/cpuacct.c | 379 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 380 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 79a6f6808a7c..f1689ac77af1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9718,7 +9718,7 @@ void sched_move_task(struct task_struct *tsk)
> task_rq_unlock(rq, tsk, &rf);
> }
>
> -static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
> +struct task_group *css_tg(struct cgroup_subsys_state *css)
> {
> return css ? container_of(css, struct task_group, css) : NULL;
> }
> diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
> index 893eece65bfd..871b6f8ccb0d 100644
> --- a/kernel/sched/cpuacct.c
> +++ b/kernel/sched/cpuacct.c
> @@ -5,6 +5,8 @@
> * Based on the work by Paul Menage (menage at google.com) and Balbir Singh
> * (balbir at in.ibm.com).
> */
> +#include <linux/kernel_stat.h>
> +#include <linux/ve.h>
> #include <asm/irq_regs.h>
> #include "sched.h"
>
> @@ -374,3 +376,380 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
> .legacy_cftypes = files,
> .early_init = true,
> };
> +
> +extern struct task_group *css_tg(struct cgroup_subsys_state *css);
> +
> +static struct task_group *ve_root_tg(struct task_group *tg) {
> + struct cgroup *cg;
> +
> + if (!tg)
> + return NULL;
> +
> + cg = cgroup_get_ve_root1(tg->css.cgroup);
> + return cg ? css_tg(&cg->self) : NULL;
> +}
> +
> +unsigned int tg_cpu_rate(struct task_group *tg)
> +{
> + unsigned int cpu_rate = 0;
> +#ifdef CONFIG_CFS_CPULIMIT
> + tg = ve_root_tg(tg);
> + if (tg)
> + cpu_rate = tg->cpu_rate;
> +#endif
> + return cpu_rate;
> +}
> +
> +static unsigned int tg_nr_cpus(struct task_group *tg)
> +{
> + unsigned int nr_cpus = 0;
> + unsigned int max_nr_cpus = num_online_cpus();
> +
> +#ifdef CONFIG_CFS_CPULIMIT
> + tg = ve_root_tg(tg);
> + if (tg)
> + nr_cpus = tg->nr_cpus;
> +#endif
> + if (!nr_cpus || nr_cpus > max_nr_cpus)
> + nr_cpus = max_nr_cpus;
> +
> + return nr_cpus;
> +}
> +
> +struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int cpu)
> +{
> + return per_cpu_ptr(css_ca(css)->cpustat, cpu);
> +}
> +
> +static void cpu_cgroup_update_stat(struct cgroup_subsys_state *cpu_css,
> + struct cgroup_subsys_state *cpuacct_css,
> + int i)
> +{
> +#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
> + struct task_group *tg = css_tg(cpu_css);
> + struct sched_entity *se = tg->se[i];
> + u64 *cpustat = cpuacct_cpustat(cpuacct_css, i)->cpustat;
> + u64 now = cpu_clock(i);
> + u64 delta, idle, iowait, steal;
> +
> + /* root_task_group has no sched entities */
> + if (tg == &root_task_group)
> + return;
> +
> + iowait = se->statistics.iowait_sum;
> + idle = se->statistics.sum_sleep_runtime;
> + steal = se->statistics.wait_sum;
> +
> + if (idle > iowait)
> + idle -= iowait;
> + else
> + idle = 0;
> +
> + if (se->statistics.sleep_start) {
> + delta = now - se->statistics.sleep_start;
> + if ((s64)delta > 0)
> + idle += delta;
> + } else if (se->statistics.block_start) {
> + delta = now - se->statistics.block_start;
> + if ((s64)delta > 0)
> + iowait += delta;
> + } else if (se->statistics.wait_start) {
> + delta = now - se->statistics.wait_start;
> + if ((s64)delta > 0)
> + steal += delta;
> + }
> +
> + cpustat[CPUTIME_IDLE] = max(cpustat[CPUTIME_IDLE], idle);
> + cpustat[CPUTIME_IOWAIT] = max(cpustat[CPUTIME_IOWAIT], iowait);
> + cpustat[CPUTIME_STEAL] = steal;
> +#endif
> +}
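
[A side note for readers of this hunk, no change requested: idle is taken
as sum_sleep_runtime minus iowait_sum so that iowait time is not counted
twice, steal is the group entity's wait_sum (runnable but not running),
and the interval currently in flight (sleep_start/block_start/wait_start)
is added on top. The max() against the previously published values keeps
the IDLE/IOWAIT counters monotonic between reads.]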
> +
> +static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
> + struct kernel_cpustat *rem, int ind,
> + u64 cur_usage, u64 target_usage,
> + u64 rem_usage)
> +{
> + s64 scaled_val;
> + u32 scale_pct = 0;
> +
> + /* distribute the delta among USER, NICE, and SYSTEM proportionally */
> + if (cur_usage < target_usage) {
> + if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
> + scale_pct = div64_u64(100 * rem->cpustat[ind],
> + rem_usage);
> + } else {
> + if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
> + scale_pct = div64_u64(100 * cur->cpustat[ind],
> + cur_usage);
> + }
> +
> + scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
> +
> + cur->cpustat[ind] += scaled_val;
> + if ((s64)cur->cpustat[ind] < 0)
> + cur->cpustat[ind] = 0;
> +
> + rem->cpustat[ind] -= scaled_val;
> + if ((s64)rem->cpustat[ind] < 0)
> + rem->cpustat[ind] = 0;
> +}
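
To make the proportional redistribution above concrete, here is a minimal
userspace sketch of the same arithmetic (the layout and names are mine,
not the kernel's; three fields stand in for USER/NICE/SYSTEM):

	#include <stdio.h>
	#include <stdint.h>

	static void fixup_field(int64_t *cur, int64_t *rem,
				uint64_t cur_total, uint64_t target_total,
				uint64_t rem_total)
	{
		uint32_t pct = 0;
		int64_t scaled;

		if (cur_total < target_total) {
			if (rem_total > 0)	/* grow cur: take a share of rem */
				pct = 100 * (uint64_t)(*rem) / rem_total;
		} else {
			if (cur_total > 0)	/* shrink cur: give back its own share */
				pct = 100 * (uint64_t)(*cur) / cur_total;
		}

		scaled = (int64_t)pct * ((int64_t)target_total - (int64_t)cur_total) / 100;

		*cur += scaled;
		if (*cur < 0)
			*cur = 0;
		*rem -= scaled;
		if (*rem < 0)
			*rem = 0;
	}

	int main(void)
	{
		/* this vcpu used 60 (30+10+20), the remainder bucket holds 40
		 * (20+10+10), the target is 80: the missing 20 is pulled from
		 * rem proportionally to each field's share of it */
		int64_t cur[3] = { 30, 10, 20 }, rem[3] = { 20, 10, 10 };
		int i;

		for (i = 0; i < 3; i++)
			fixup_field(&cur[i], &rem[i], 60, 80, 40);

		/* prints cur: 40 15 25 (sums to 80) and rem: 10 5 5 */
		printf("cur: %lld %lld %lld\n",
		       (long long)cur[0], (long long)cur[1], (long long)cur[2]);
		printf("rem: %lld %lld %lld\n",
		       (long long)rem[0], (long long)rem[1], (long long)rem[2]);
		return 0;
	}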
> +
> +static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
> + int ind, u64 cur_idle, u64 target_idle)
> +{
> + /* distribute target_idle between IDLE and IOWAIT proportionally to
> + * what we initially had on this vcpu */
> + if ((s64)cur_idle > 0) {
> + u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
> + cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
> + } else {
> + cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
> + }
> +}
> +
> +static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
> + struct kernel_cpustat *rem,
> + u64 max_usage)
> +{
> + u64 cur_usage, target_usage, rem_usage;
> + u64 cur_idle, target_idle;
> +
> + cur_usage = kernel_cpustat_total_usage(cur);
> + rem_usage = kernel_cpustat_total_usage(rem);
> +
> + target_usage = min(cur_usage + rem_usage,
> + max_usage);
> +
> + if (cur_usage != target_usage) {
> + fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
> + cur_usage, target_usage, rem_usage);
> + fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
> + cur_usage, target_usage, rem_usage);
> + fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
> + cur_usage, target_usage, rem_usage);
> + }
> +
> + cur_idle = kernel_cpustat_total_idle(cur);
> + target_idle = max_usage - target_usage;
> +
> + if (cur_idle != target_idle) {
> + calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
> + cur_idle, target_idle);
> + calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
> + cur_idle, target_idle);
> + }
> +
> + /* do not show steal time inside ve */
> + cur->cpustat[CPUTIME_STEAL] = 0;
> +}
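
In other words: if over the accounting interval a vcpu may consume
max_usage = 100 units (after scaling by vcpu_rate), has accumulated
cur_usage = 70 and the remainder bucket holds rem_usage = 50, then
target_usage = min(70 + 50, 100) = 100, so USER/NICE/SYSTEM grow by the
30 units taken from the remainder, and target_idle = 100 - 100 = 0, so
IDLE/IOWAIT are scaled down to zero. (Numbers are made up; the real code
operates on nanoseconds.)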
> +
> +static void cpu_cgroup_update_vcpustat(struct cgroup_subsys_state *cpu_css,
> + struct cgroup_subsys_state *cpuacct_css)
> +{
> + int i, j;
> + int nr_vcpus;
> + int vcpu_rate;
> + ktime_t now;
> + u64 max_usage;
> + struct kernel_cpustat stat_delta, stat_rem;
> + struct task_group *tg = css_tg(cpu_css);
> + int first_pass = 1;
> +
> + spin_lock(&tg->vcpustat_lock);
> +
> + now = ktime_get();
> + nr_vcpus = tg_nr_cpus(tg);
> + vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
> + if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
> + vcpu_rate = MAX_CPU_RATE;
> +
> + if (!ktime_to_ns(tg->vcpustat_last_update)) {
> + /* on the first read initialize vcpu i stat as a sum of stats
> + * over pcpus j such that j % nr_vcpus == i */
> + for (i = 0; i < nr_vcpus; i++) {
> + for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
> + if (!cpu_possible(j))
> + continue;
> + kernel_cpustat_add(tg->vcpustat + i,
> + cpuacct_cpustat(cpuacct_css, j),
> + tg->vcpustat + i);
> + }
> + }
> + goto out_update_last;
> + }
> +
> + max_usage = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
> + max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
> + /* do not update the stats too often to avoid calculation errors */
> + if (max_usage < 10)
> + goto out_unlock;
> +
> + /* temporarily copy per cpu usage delta to tg->cpustat_last */
> + for_each_possible_cpu(i)
> + kernel_cpustat_sub(cpuacct_cpustat(cpuacct_css, i),
> + tg->cpustat_last + i,
> + tg->cpustat_last + i);
> +
> + /* proceed to calculating per vcpu delta */
> + kernel_cpustat_zero(&stat_rem);
> +
> +again:
> + for (i = 0; i < nr_vcpus; i++) {
> + int exceeds_max;
> +
> + kernel_cpustat_zero(&stat_delta);
> + for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
> + if (!cpu_possible(j))
> + continue;
> + kernel_cpustat_add(&stat_delta,
> + tg->cpustat_last + j, &stat_delta);
> + }
> +
> + exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
> + max_usage;
> + /*
> + * On the first pass calculate delta for vcpus with usage >
> + * max_usage in order to accumulate excess in stat_rem.
> + *
> + * Once the remainder is accumulated, proceed to the rest of
> + * vcpus so that it will be distributed among them.
> + */
> + if (exceeds_max != first_pass)
> + continue;
> +
> + fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
> + kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
> + tg->vcpustat + i);
> + }
> +
> + if (first_pass) {
> + first_pass = 0;
> + goto again;
> + }
> +out_update_last:
> + for_each_possible_cpu(i)
> + tg->cpustat_last[i] = *cpuacct_cpustat(cpuacct_css, i);
> + tg->vcpustat_last_update = now;
> +out_unlock:
> + spin_unlock(&tg->vcpustat_lock);
> +}
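
The pcpu -> vcpu folding used in both loops of this function is a plain
modulo mapping, and the two passes just make sure that whatever a busy
vcpu accumulated above max_usage lands in stat_rem first and is handed
out to the under-loaded vcpus on the second pass. A standalone sketch
(userspace, names are mine) that prints which physical cpus feed each
vcpu:

	#include <stdio.h>

	int main(void)
	{
		int nr_pcpus = 8, nr_vcpus = 3;
		int i, j;

		/* vcpu i accumulates the stats of pcpus j with j % nr_vcpus == i */
		for (i = 0; i < nr_vcpus; i++) {
			printf("vcpu%d <-", i);
			for (j = i; j < nr_pcpus; j += nr_vcpus)
				printf(" cpu%d", j);
			printf("\n");
		}
		/*
		 * prints:
		 *   vcpu0 <- cpu0 cpu3 cpu6
		 *   vcpu1 <- cpu1 cpu4 cpu7
		 *   vcpu2 <- cpu2 cpu5
		 */
		return 0;
	}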
> +
> +int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
> + struct cgroup_subsys_state *cpuacct_css,
> + struct seq_file *p)
> +{
> + int i;
> + u64 user, nice, system, idle, iowait, steal;
> + struct time_namespace *time_ns;
> + struct timespec64 boottime;
> + struct task_group *tg = css_tg(cpu_css);
> + bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
> + int nr_vcpus = tg_nr_cpus(tg);
> + struct kernel_cpustat *kcpustat;
> + unsigned long tg_nr_running = 0;
> + unsigned long tg_nr_iowait = 0;
> +
> + time_ns = ve_get_time_ns(get_exec_env());
> + if (time_ns) {
> + getboottime64(&boottime);
> + /* time_ns->offsets.boottime is (ve_uptime - host_uptime), i.e.
> + * negative for ve created on this host. Shall subtract that
> + * from the timestamp of host's boot to get the timestamp of
> + * ve's boot */
> + boottime = timespec64_sub(boottime, time_ns->offsets.boottime);
> + put_time_ns(time_ns);
> + } else {
> + /* for not yet started ve, use current time as the timestamp of
> + * ve's boot */
> + ktime_get_real_ts64(&boottime);
> + }
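
To make the sign convention concrete: if the host booted at epoch second
1000 and the CT was started 300 seconds later, offsets.boottime is -300
(ve_uptime - host_uptime), so the computed boot time is 1000 - (-300) =
1300, i.e. the moment the CT was started. (Made-up numbers, only
illustrating the arithmetic in the comment above.)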
> +
> + for_each_possible_cpu(i) {
> + cpu_cgroup_update_stat(cpu_css, cpuacct_css, i);
> +
> + /* root task group has autogrouping, so this doesn't hold */
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + tg_nr_running += tg->cfs_rq[i]->h_nr_running;
> + tg_nr_iowait += tg->cfs_rq[i]->nr_iowait;
> +#endif
> +#ifdef CONFIG_RT_GROUP_SCHED
> + tg_nr_running += tg->rt_rq[i]->rt_nr_running;
> +#endif
> + }
> +
> + if (virt)
> + cpu_cgroup_update_vcpustat(cpu_css, cpuacct_css);
> +
> + user = nice = system = idle = iowait = steal = 0;
> +
> + for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
> + if (!virt && !cpu_possible(i))
> + continue;
> +
> + kcpustat = virt ? tg->vcpustat + i :
> + cpuacct_cpustat(cpuacct_css, i);
> +
> + user += kcpustat->cpustat[CPUTIME_USER];
> + nice += kcpustat->cpustat[CPUTIME_NICE];
> + system += kcpustat->cpustat[CPUTIME_SYSTEM];
> + idle += kcpustat->cpustat[CPUTIME_IDLE];
> + iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
> + steal += kcpustat->cpustat[CPUTIME_STEAL];
> + }
> + /* Don't scare CT users with high steal time */
> + if (!ve_is_super(get_exec_env()))
> + steal = 0;
> +
> + seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 %llu\n",
> + (unsigned long long)nsec_to_clock_t(user),
> + (unsigned long long)nsec_to_clock_t(nice),
> + (unsigned long long)nsec_to_clock_t(system),
> + (unsigned long long)nsec_to_clock_t(idle),
> + (unsigned long long)nsec_to_clock_t(iowait),
> + virt ? 0ULL :
> + (unsigned long long)nsec_to_clock_t(steal));
> +
> + for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
> + if (!virt && !cpu_online(i))
> + continue;
> + kcpustat = virt ? tg->vcpustat + i :
> + cpuacct_cpustat(cpuacct_css, i);
> +
> + user = kcpustat->cpustat[CPUTIME_USER];
> + nice = kcpustat->cpustat[CPUTIME_NICE];
> + system = kcpustat->cpustat[CPUTIME_SYSTEM];
> + idle = kcpustat->cpustat[CPUTIME_IDLE];
> + iowait = kcpustat->cpustat[CPUTIME_IOWAIT];
> + steal = kcpustat->cpustat[CPUTIME_STEAL];
> + /* Don't scare CT users with high steal time */
> + if (!ve_is_super(get_exec_env()))
> + steal = 0;
> +
> + seq_printf(p,
> + "cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
> + i,
> + (unsigned long long)nsec_to_clock_t(user),
> + (unsigned long long)nsec_to_clock_t(nice),
> + (unsigned long long)nsec_to_clock_t(system),
> + (unsigned long long)nsec_to_clock_t(idle),
> + (unsigned long long)nsec_to_clock_t(iowait),
> + virt ? 0ULL :
> + (unsigned long long)nsec_to_clock_t(steal));
> + }
> + seq_printf(p, "intr 0");
> +
> + seq_printf(p,
> + "\nctxt %llu\n"
> + "btime %llu\n"
> + "processes %lu\n"
> + "procs_running %lu\n"
> + "procs_blocked %lu\n",
> + nr_context_switches(),
> + (unsigned long long)boottime.tv_sec,
> + total_forks,
> + tg_nr_running,
> + tg_nr_iowait);
> +
> + return 0;
> +}
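
For reference, inside a CT with two vcpus this ends up producing
/proc/stat-style output along these lines (all values below are made up):

	cpu 1200 0 300 98500 100 0 0 0
	cpu0 600 0 150 49250 50 0 0 0
	cpu1 600 0 150 49250 50 0 0 0
	intr 0
	ctxt 123456
	btime 1633507200
	processes 4242
	procs_running 1
	procs_blocked 0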
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.