[Devel] [PATCH RHEL COMMIT] ve/proc/loadavg: Virtualize /proc/loadavg in Containers

Konstantin Khorenko khorenko at virtuozzo.com
Fri Oct 1 19:38:45 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit fedfe80301da8a2c7cda4475bae8a601e7089fd3
Author: Konstantin Khorenko <khorenko at virtuozzo.com>
Date:   Fri Oct 1 19:38:45 2021 +0300

    ve/proc/loadavg: Virtualize /proc/loadavg in Containers
    
    The patch is based on following vz7 commits:
      ecdce58b214c ("sched: Export per task_group statistics_work")
      a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs")
      5f2a49a05629 ("sched/ve: Use cfs_rq::h_nr_running to count loadavg")
    
    vz8 rebase notes:
    1) cpu cgroup vz specific file "proc.loadavg" has been dropped
    2) the "nr_running" field in /proc/loadavg inside a CT includes running
       realtime tasks (although they are not allowed to run inside a CT)
       and tasks in D state (as it does on the Host)
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    (cherry picked from vz8 commit e0012c83f2a8 ("ve/proc/loadavg:
    Virtualize /proc/loadavg in Containers"))
    
    Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/proc/loadavg.c   | 10 ++++++++++
 include/linux/ve.h  |  8 ++++++++
 kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/ve/ve.c      | 16 ++++++++++++++++
 4 files changed, 74 insertions(+)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index c651c6a2d285..32148d6f66ef 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,10 +9,20 @@
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
+#include <linux/ve.h>
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
 	unsigned long avnrun[3];
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = ve_show_loadavg(ve, m);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	get_avenrun(avnrun, FIXED_1/200, 0);
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 19a590bc86d4..95dcd99267df 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -150,4 +150,12 @@ static inline void monotonic_ve_to_abs(clockid_t which_clock,
 
 #endif	/* CONFIG_VE */
 
+struct seq_file;
+
+#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
+#else
+static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+#endif
+
 #endif /* _LINUX_VE_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62a31f1b9cc9..4c5eb09b4888 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,8 @@ __read_mostly int sysctl_resched_latency_warn_ms = 100;
 __read_mostly int sysctl_resched_latency_warn_once = 1;
 #endif /* CONFIG_SCHED_DEBUG */
 
+#include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */
+
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
@@ -10538,6 +10540,44 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
+			    struct seq_file *p)
+{
+	struct cgroup *cgrp = css->cgroup;
+	struct task_group *tg = css_tg(css);
+	unsigned long avnrun[3];
+	int nr_running = 0;
+	int i;
+
+	avnrun[0] = tg->avenrun[0] + FIXED_1/200;
+	avnrun[1] = tg->avenrun[1] + FIXED_1/200;
+	avnrun[2] = tg->avenrun[2] + FIXED_1/200;
+
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		nr_running += tg->cfs_rq[i]->h_nr_running;
+		/*
+		 * We do not export nr_unint to parent task groups
+		 * like we do for h_nr_running, as it gives additional
+		 * overhead for activate/deactivate operations. So, we
+		 * don't account child cgroup unint tasks here.
+		 */
+		nr_running += tg->cfs_rq[i]->nr_unint;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	seq_printf(p, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running, cgroup_task_count(cgrp),
+		idr_get_cursor(&task_active_pid_ns(current)->idr));
+	return 0;
+}
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index f3df12f8638b..178aa658b50b 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1095,3 +1095,19 @@ static int __init ve_subsys_init(void)
 	return 0;
 }
 late_initcall(ve_subsys_init);
+
+#ifdef CONFIG_CGROUP_SCHED
+int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
+			    struct seq_file *p);
+
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgrp_id);
+	err = cpu_cgroup_proc_loadavg(css, p);
+	css_put(css);
+	return err;
+}
+#endif /* CONFIG_CGROUP_SCHED */


More information about the Devel mailing list