[Devel] [PATCH RHEL8 COMMIT] ve/proc/loadavg: Virtualize /proc/loadavg in Containers

Konstantin Khorenko khorenko at virtuozzo.com
Wed Oct 28 19:21:32 MSK 2020


The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.14
------>
commit 41f6bed2b424afbd51364f262ed7473b706f24b5
Author: Konstantin Khorenko <khorenko at virtuozzo.com>
Date:   Wed Oct 21 16:01:31 2020 +0300

    ve/proc/loadavg: Virtualize /proc/loadavg in Containers
    
    The patch is based on the following vz7 commits:
      ecdce58b214c ("sched: Export per task_group statistics_work")
      a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs")
      5f2a49a05629 ("sched/ve: Use cfs_rq::h_nr_running to count loadavg")
    
    vz8 rebase notes:
    1) the vz-specific cpu cgroup file "proc.loadavg" has been dropped
    2) the "nr_running" field in /proc/loadavg inside a CT includes running
       realtime tasks (although they are not allowed to run inside a CT)
       and tasks in D state (like on the Host); see the illustrative
       output below
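    
    For illustration, reading /proc/loadavg inside a CT after this
    patch returns per-CT values, e.g. (the numbers are made up):
    
      # cat /proc/loadavg
      0.05 0.10 0.07 1/52 4325
    
    i.e. the 1/5/15 min load averages of the CT's cpu cgroup, the CT's
    nr_running/nr_tasks, and the idr cursor of the CT's pid namespace
    (roughly the most recently allocated pid there).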
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
---
 fs/proc/loadavg.c   | 10 ++++++++++
 include/linux/ve.h  |  8 ++++++++
 kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/ve/ve.c      | 16 ++++++++++++++++
 4 files changed, 74 insertions(+)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 40467c3ade86..b884a1a59a3d 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,10 +9,20 @@
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
+#include <linux/ve.h>
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
 	unsigned long avnrun[3];
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = ve_show_loadavg(ve, m);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	get_avenrun(avnrun, FIXED_1/200, 0);
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index ec7dc522ac1f..0341bb915923 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -176,4 +176,12 @@ static inline void monotonic_ve_to_abs(clockid_t which_clock,
 
 #endif	/* CONFIG_VE */
 
+struct seq_file;
+
+#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
+#else
+static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+#endif
+
 #endif /* _LINUX_VE_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6100bf3f625..0116742de578 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -41,6 +41,8 @@ const_debug unsigned int sysctl_sched_features =
 #undef SCHED_FEAT
 #endif
 
+#include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */
+
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
@@ -7134,6 +7136,44 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
+			    struct seq_file *p)
+{
+	struct cgroup *cgrp = css->cgroup;
+	struct task_group *tg = css_tg(css);
+	unsigned long avnrun[3];
+	int nr_running = 0;
+	int i;
+
+	avnrun[0] = tg->avenrun[0] + FIXED_1/200;
+	avnrun[1] = tg->avenrun[1] + FIXED_1/200;
+	avnrun[2] = tg->avenrun[2] + FIXED_1/200;
+
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		nr_running += tg->cfs_rq[i]->h_nr_running;
+		/*
+		 * We do not export nr_unint to parent task groups
+		 * like we do for h_nr_running, as that would add
+		 * overhead to activate/deactivate operations. So we
+		 * do not account child cgroups' unint tasks here.
+		 */
+		nr_running += tg->cfs_rq[i]->nr_unint;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	seq_printf(p, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running, cgroup_task_count(cgrp),
+		idr_get_cursor(&task_active_pid_ns(current)->idr));
+	return 0;
+}
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 43e37b27e887..193fdb95daab 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1147,3 +1147,19 @@ int vz_security_protocol_check(struct net *net, int protocol)
 	}
 }
 EXPORT_SYMBOL_GPL(vz_security_protocol_check);
+
+#ifdef CONFIG_CGROUP_SCHED
+int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
+			    struct seq_file *p);
+
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgrp_id);
+	err = cpu_cgroup_proc_loadavg(css, p);
+	css_put(css);
+	return err;
+}
+#endif /* CONFIG_CGROUP_SCHED */
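
For reference, a minimal user-space sketch of the fixed-point load
formatting used above (LOAD_INT/LOAD_FRAC/FIXED_1 are copied from
include/linux/sched/loadavg.h; the sample value is made up):

	#include <stdio.h>

	#define FSHIFT		11			/* bits of fractional precision */
	#define FIXED_1		(1 << FSHIFT)		/* 1.0 in fixed-point */
	#define LOAD_INT(x)	((x) >> FSHIFT)
	#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	int main(void)
	{
		/* ~0.73 in fixed-point: 0.73 * 2048 ~= 1495 */
		unsigned long avnrun = 1495 + FIXED_1 / 200;

		/* prints "0.73" */
		printf("%lu.%02lu\n", LOAD_INT(avnrun), LOAD_FRAC(avnrun));
		return 0;
	}

Adding FIXED_1/200 (~0.005) before printing rounds the second decimal
digit; that is why both get_avenrun() on the host path and
cpu_cgroup_proc_loadavg() above apply the same offset.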

