[Devel] [PATCH RHEL COMMIT] vzstat: Add vzstat module and kstat interfaces

Konstantin Khorenko khorenko at virtuozzo.com
Fri Sep 24 15:15:55 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 86d9e9f5332fa691e8b361a606424d14dcc2ff53
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Fri Sep 24 15:15:55 2021 +0300

    vzstat: Add vzstat module and kstat interfaces
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    +++
    vzstat: account cpu total time properly in mm performance stats
    
    /proc/vz/mmperf occasionally accounts/shows wall total time in both
    "Wall_tot_time" and "CPU_tot_time" columns, fix this.
    
    mFixes: c0a20dd32be6 ("vzstat: Add vzstat module and kstat interfaces")
    https://pmc.acronis.com/browse/VSTOR-16659
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    (cherry-picked from vz7 commit 306162b35d01 ("vzstat: account cpu total time
    properly in mm performance stats"))
    Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
    
    +++
    vzstat: drop double KSTAT_LAT_PCPU_UPDATE declaration
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    mFixes: c0a20dd32be6 ("vzstat: Add vzstat module and kstat interfaces")
    
    (cherry-picked from vz7 commit 8242e706656b ("vzstat: drop double
    KSTAT_LAT_PCPU_UPDATE declaration"))
    Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
    
    +++
    ve/proc: increment position in "next" seq_operations callback
    
    "next" callback of struct "seq_operations" should move the iterator
    forward to the next position in the sequence.
    
    Some Virtuozzo-specific proc files skip this increment,
    which results in incorrect file output.
    
    https://bugs.openvz.org/browse/OVZ-7158
    
    Signed-off-by: Vasily Averin <vvs at virtuozzo.com>
    
    (cherry-picked from vz7 commit 0a458a0598b4 ("ve/proc: increment position in
    "next" seq_operations callback"))
    
    https://jira.sw.ru/browse/PSBM-127849
    Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
    
    (cherry picked from vz8 commit 4a5af6067ead98fa516ffcd7404308f67af3ee2a)
    Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
 include/linux/vzstat.h  |  28 ++
 kernel/ve/Makefile      |   4 +-
 kernel/ve/vzstat.c      | 705 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/ve/vzstat_core.c | 103 +++++++
 4 files changed, 839 insertions(+), 1 deletion(-)

diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
index 32f1132404c1..36408f06a2e3 100644
--- a/include/linux/vzstat.h
+++ b/include/linux/vzstat.h
@@ -48,4 +48,32 @@ extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
 extern spinlock_t kstat_glb_lock;
 
 extern void kstat_init(void);
+
+#ifdef CONFIG_VE
+extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
+			   u64 cpu_time);
+
+#define KSTAT_PERF_ENTER(name)				\
+	u64 start, sleep_time;				\
+	/* snapshot wall clock and sleep time; 'name' is unused here */ \
+	start = ktime_to_ns(ktime_get());		\
+	sleep_time = current->se.statistics->sum_sleep_runtime;
+
+#define KSTAT_PERF_LEAVE(name)				\
+	start = ktime_to_ns(ktime_get()) - start;	/* wall ns since ENTER */ \
+	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
+	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time); /* cpu = wall - slept */
+
+extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur);
+extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
+
+#else /* !CONFIG_VE */
+#define KSTAT_PERF_ADD(ptr, real_time, cpu_time)
+#define KSTAT_PERF_ENTER(name)
+#define KSTAT_PERF_LEAVE(name)
+#define KSTAT_LAT_PCPU_ADD(p, dur)
+#define KSTAT_LAT_PCPU_UPDATE(p)
+#endif /* CONFIG_VE */
+
+
 #endif /* __VZSTAT_H__ */
diff --git a/kernel/ve/Makefile b/kernel/ve/Makefile
index e0c23ca9a867..6219357803fb 100644
--- a/kernel/ve/Makefile
+++ b/kernel/ve/Makefile
@@ -6,10 +6,12 @@
 # Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved.
 #
 
-obj-$(CONFIG_VE) = ve.o hooks.o veowner.o
+obj-$(CONFIG_VE) = ve.o hooks.o veowner.o vzstat_core.o
 
 obj-$(CONFIG_VZ_DEV) += vzdev.o
 obj-$(CONFIG_VZ_EVENT) += vzevent.o
 
 obj-$(CONFIG_VE_CALLS) += vzmon.o
 vzmon-objs = vecalls.o
+
+obj-$(CONFIG_VE_CALLS) += vzstat.o
diff --git a/kernel/ve/vzstat.c b/kernel/ve/vzstat.c
new file mode 100644
index 000000000000..9c1287a94a05
--- /dev/null
+++ b/kernel/ve/vzstat.c
@@ -0,0 +1,705 @@
+/*
+ *  kernel/ve/vzstat.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *  Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#include <linux/sched/loadavg.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/sched/stat.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/interrupt.h>
+#include <linux/mmzone.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/veowner.h>
+#include <linux/swap.h>
+
+#include <linux/vzstat.h>
+
+/* local variables */
+static struct task_struct *vzstat_thread_tsk;
+
+static const char * const alloc_descr[KSTAT_ALLOCSTAT_NR] = {	/* row labels, indexed like alloc_lat[] */
+	"alocatomic:",
+	"aloclow:",
+	"alochigh:",
+	"aloclowmp:",
+	"alochighmp:"
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * Kernel protection: kernel code checksumming
+ * ------------------------------------------------------------------------
+ */
+#ifdef CONFIG_VE_KERNEL_CSUM
+
+#ifdef __x86_64__
+/* skip init_level4_pgt */
+#define KERNEL_PROT_START	((unsigned long)(&_stext) + 0x2000)
+#else
+#define KERNEL_PROT_START	((unsigned long)(&_stext))
+#endif
+#define KERNEL_PROT_END		((unsigned long)(&_etext))
+#define CSALIGN(value, size)	((value + (size - 1)) & ~(size - 1))
+
+void kernel_text_csum_check(void)	/* checksum kernel text and compare with the first run */
+{
+#define CSUM_NR	2
+	static unsigned long text_csum[CSUM_NR], text_csumed, csum_time;
+	unsigned long start, end, ptr, csum[CSUM_NR];
+	int i;
+
+	if (jiffies - csum_time < 60*HZ)	/* rate limit: at most one pass per 60 seconds */
+		return;
+
+	csum_time = jiffies;
+	for (i = 0; i < CSUM_NR; i++) csum[i] = 0;
+	start = CSALIGN(KERNEL_PROT_START, sizeof(csum[0]));
+	end = CSALIGN(KERNEL_PROT_END, sizeof(csum[0]));
+
+	for (ptr = start; ptr < end; ptr += sizeof(csum[0])) {
+		unsigned long word = *(unsigned long*)ptr;	/* own name: must not shadow 'i' */
+		csum[0] = csum[0] + word;
+		csum[1] = (csum[1] ^ word) + ((csum[1] << 1) + (csum[1] >> 31));
+		cond_resched();
+	}
+
+	if (!text_csumed) {	/* first pass only records the reference sums */
+		for (i = 0; i < CSUM_NR; i++) text_csum[i] = csum[i];
+		text_csumed = 1;
+		return;
+	}
+	for (i = 0; i < CSUM_NR; i++)
+		if (text_csum[i] != csum[i]) {
+			printk(KERN_EMERG "Kernel checksum %d changed "
+				"(csum%d=%08lx, onboot csum%d=%08lx)\n",
+				i, i, csum[i], i, text_csum[i]);
+			kernel_text_csum_broken++;
+		}
+}
+
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * Latency update and show functions
+ * ------------------------------------------------------------------------
+ */
+static void update_alloc_latency(void)	/* refresh alloc, swap_in and page_in latency stats */
+{
+	int i;
+
+	spin_lock_irq(&kstat_glb_lock);	/* held across all KSTAT_LAT_PCPU_UPDATE() calls below */
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		KSTAT_LAT_PCPU_UPDATE(&kstat_glob.alloc_lat[i]);
+	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.swap_in);
+	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.page_in);
+	spin_unlock_irq(&kstat_glb_lock);
+}
+
+static void lastlat_seq_show(struct seq_file *m,	/* one row: name, maxlat, totlat, count */
+		const char *name,
+		struct kstat_lat_snap_struct *snap)
+{
+	seq_printf(m, "%-11s %20Lu %20Lu %20lu\n", name,
+			snap->maxlat, snap->totlat, snap->count);
+}
+
+static void avglat_seq_show(struct seq_file *m,	/* one row: name, avg[0], avg[1], avg[2] */
+		const char *name,
+		u64 *avg)
+{
+	seq_printf(m, "%-11s %20Lu %20Lu %20Lu\n", name,
+			avg[0], avg[1], avg[2]);
+}
+
+static int latency_seq_show(struct seq_file *m, void *v)	/* ->show of latency_seq_op */
+{
+	int i;
+
+	if (!v)	/* nothing to show without a record */
+		return 0;
+
+	seq_puts(m, "Version: 2.5\n");
+
+	seq_puts(m, "\nLatencies:\n");	/* first table: the 'last' snapshots */
+	seq_printf(m, "%-11s %20s %20s %20s\n",
+			"Type", "Lat", "Total_lat", "Calls");
+	lastlat_seq_show(m, "scheduling:", &kstat_glob.sched_lat.last);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		lastlat_seq_show(m, alloc_descr[i],
+				&kstat_glob.alloc_lat[i].last);
+	lastlat_seq_show(m, "swap_in:", &kstat_glob.swap_in.last);
+	lastlat_seq_show(m, "page_in:", &kstat_glob.page_in.last);
+
+	seq_puts(m, "\nAverages:\n");	/* second table: the three 'avg' slots, same row order */
+	seq_printf(m, "%-11s %20s %20s %20s\n",
+			"Type", "Avg1", "Avg5", "Avg15");
+	avglat_seq_show(m, "scheduling:", kstat_glob.sched_lat.avg);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		avglat_seq_show(m, alloc_descr[i],
+				kstat_glob.alloc_lat[i].avg);
+	avglat_seq_show(m, "swap_in:", kstat_glob.swap_in.avg);
+	avglat_seq_show(m, "page_in:", kstat_glob.page_in.avg);
+
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * General system info: processes, memory, VE
+ * ------------------------------------------------------------------------
+ */
+static void update_memory(void)	/* update per-zone-type free/active/inactive averages */
+{
+	pg_data_t *pgdat;
+	struct zone *zone;
+	struct kstat_zone_avg *zone_avg;
+	unsigned type;
+	unsigned long nr_free, nr_active, nr_inactive, *p;
+	unsigned present;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		present = 0;
+		nr_free = 0;
+		nr_active = 0;
+		nr_inactive = 0;
+
+		for_each_online_pgdat (pgdat) {	/* sum this zone type over all online nodes */
+			zone = pgdat->node_zones + type;
+			if (!zone->present_pages)
+				continue;
+
+			present++;	/* zone_page_state() takes zone_stat_item: use NR_ZONE_* LRU items */
+			nr_free += zone_page_state(zone, NR_FREE_PAGES);
+			nr_active +=  zone_page_state(zone, NR_ZONE_ACTIVE_ANON) +
+				zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
+			nr_inactive += zone_page_state(zone, NR_ZONE_INACTIVE_ANON) +
+				zone_page_state(zone, NR_ZONE_INACTIVE_FILE);
+		}
+
+		if (!present)	/* no node has pages in this zone type: keep old averages */
+			continue;
+
+		zone_avg = &kstat_glob.zone_avg[type];
+
+		p = zone_avg->free_pages_avg;	/* slots 0/1/2 use EXP_1/EXP_5/EXP_15 */
+		p[0] = calc_load(p[0], EXP_1,  nr_free);
+		p[1] = calc_load(p[1], EXP_5,  nr_free);
+		p[2] = calc_load(p[2], EXP_15, nr_free);
+
+		p = zone_avg->nr_active_avg;
+		p[0] = calc_load(p[0], EXP_1,  nr_active);
+		p[1] = calc_load(p[1], EXP_5,  nr_active);
+		p[2] = calc_load(p[2], EXP_15, nr_active);
+
+		p = zone_avg->nr_inactive_avg;
+		p[0] = calc_load(p[0], EXP_1,  nr_inactive);
+		p[1] = calc_load(p[1], EXP_5,  nr_inactive);
+		p[2] = calc_load(p[2], EXP_15, nr_inactive);
+	}
+}
+
+static void mem_avg_show(struct seq_file *m, void *v)	/* print kstat_glob.zone_avg[]; 'v' is unused */
+{
+	unsigned type;
+	pg_data_t *pgdat;
+	struct zone *zone;
+	struct kstat_zone_avg *zone_avg;
+	unsigned present;
+	int zone_id;
+
+	zone_id = 0;	/* counts printed zones only, so the output numbering has no gaps */
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		present = 0;
+
+		for_each_online_pgdat (pgdat) {	/* find the first node where this zone has pages */
+			zone = pgdat->node_zones + type;
+			if (zone->present_pages) {
+				present++;
+				break;
+			}
+		}
+		if (!present)
+			continue;
+
+		zone_avg = &kstat_glob.zone_avg[type];
+		seq_printf(m, "ZONE%u %s averages: "
+			"active %lu %lu %lu, "
+			"inactive %lu %lu %lu, "
+			"free %lu %lu %lu\n",
+			zone_id++,
+			zone->name,	/* 'zone' is the one the search loop stopped on */
+			zone_avg->nr_active_avg[0],
+			zone_avg->nr_active_avg[1],
+			zone_avg->nr_active_avg[2],
+			zone_avg->nr_inactive_avg[0],
+			zone_avg->nr_inactive_avg[1],
+			zone_avg->nr_inactive_avg[2],
+			zone_avg->free_pages_avg[0],
+			zone_avg->free_pages_avg[1],
+			zone_avg->free_pages_avg[2]);
+	}
+}
+
+static void task_counts_seq_show(struct seq_file *m, void *v)	/* VE count and task-state counts; 'v' unused */
+{
+	unsigned long _nr_running, _nr_sleeping, _nr_unint,
+				_nr_zombie, _nr_dead, _nr_stopped;
+	unsigned long avg[3], seq;
+
+	_nr_running = nr_running();
+	_nr_unint = nr_uninterruptible();
+	_nr_sleeping = nr_sleeping();
+	_nr_zombie = nr_zombie;
+	_nr_dead = atomic_read(&nr_dead);
+	_nr_stopped = 0; /* Broken since 3.10 */
+
+	do {	/* seqcount read side: retry until the copy of nr_unint_avg is consistent */
+		seq = read_seqcount_begin(&kstat_glob.nr_unint_avg_seq);
+		memcpy(avg, kstat_glob.nr_unint_avg, sizeof(avg));
+	} while (read_seqcount_retry(&kstat_glob.nr_unint_avg_seq, seq));
+
+	seq_printf(m, "VEs: %d\n", nr_ve);
+	seq_printf(m, "Processes: R %lu, S %lu, D %lu, "
+		"Z %lu, T %lu, X %lu\n",
+			_nr_running,
+			_nr_sleeping,
+			_nr_unint,
+			_nr_zombie,
+			_nr_stopped,	/* printed under "T"; always 0, see above */
+			_nr_dead);
+	seq_printf(m, "Processes avg: unint %lu %lu %lu\n",	/* averages are shifted down by FSHIFT */
+			avg[0] >> FSHIFT, avg[1] >> FSHIFT, avg[2] >> FSHIFT);
+}
+
+static void cycles_per_jiffy_show(struct seq_file *m, void *v)	/* 'v' is unused */
+{
+	/* Now all time slices are measured in nanoseconds */
+	seq_printf(m, "cycles_per_jiffy: %llu\n", ((u64) jiffies_to_usecs(1)) * 1000);	/* ns per jiffy */
+}
+
+static void jiffies_per_second_show(struct seq_file *m, void *v)	/* prints HZ; 'v' is unused */
+{
+	seq_printf(m, "jiffies_per_second: %u\n", HZ);
+}
+
+static void kernel_text_csum_seq_show(struct seq_file *m, void *v)	/* 'v' is unused */
+{
+	seq_printf(m, "kernel_text_csum_broken: %d\n", 0);	/* constant 0: no counter is read here */
+}
+
+static void swap_cache_seq_show(struct seq_file *m, void *v)	/* print swap_cache_info counters; 'v' unused */
+{
+	struct swap_cache_info *swpcache;
+
+	swpcache = &swap_cache_info;	/* the global object, not a copy */
+	seq_printf(m, "Swap cache: add %lu, del %lu, find %lu/%lu\n",
+			swpcache->add_total,
+			swpcache->del_total,
+			swpcache->find_success,
+			swpcache->find_total);
+}
+
+/*
+ * Declare a special structure to store summarized statistics. 'struct zone'
+ * is not used because of its tremendous size.
+ */
+struct zonestat {	/* per zone type, summed over all online nodes */
+	const char *name;	/* zone name; NULL until a named zone is seen */
+	unsigned long free_pages;	/* sum of NR_FREE_PAGES */
+	unsigned long nr_free[MAX_ORDER];	/* sum of free_area[order].nr_free */
+	unsigned long pages_min;	/* sum of min_wmark_pages() */
+	unsigned long pages_low;	/* sum of low_wmark_pages() */
+	unsigned long pages_high;	/* sum of high_wmark_pages() */
+	unsigned long nr_active;	/* active anon + active file */
+	unsigned long nr_inactive;	/* inactive anon + inactive file */
+	unsigned long present_pages;	/* sum of zone->present_pages */
+};
+
+/*
+ * Show information about all memory zones.
+ */
+static void mem_free_areas_show_zonestat(struct seq_file *m,	/* one output line per non-empty zone */
+						struct zonestat *zstat)	/* array of MAX_NR_ZONES entries */
+{
+	unsigned int order;
+	unsigned type;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		struct zonestat *zone = &zstat[type];
+
+		if (!zone->name)	/* entry was never filled in */
+			continue;
+
+		/* Skip empty zones */
+		if (!zone->present_pages)
+			continue;
+
+		seq_printf(m, "%s free %lu (", zone->name, zone->free_pages);
+		for (order = 0; order < MAX_ORDER; order++)	/* "<free blocks>*<pages per block>" */
+			seq_printf(m, "%lu*%lu ", zone->nr_free[order],
+								1UL << order);
+
+		seq_printf(m, ") min %lu low %lu high %lu "
+			"active %lu inactive %lu size %lu\n",
+				zone->pages_min,
+				zone->pages_low,
+				zone->pages_high,
+				zone->nr_active,
+				zone->nr_inactive,
+				zone->present_pages);
+	}
+}
+
+/*
+ * Scan all registered pgdat's (i.e. memory nodes) and summarize
+ * values for identical zones.
+ */
+static void mem_free_areas_show(struct seq_file *m, void *v)	/* 'v' is unused */
+{
+	pg_data_t *pgdat;
+	struct zonestat zones[MAX_NR_ZONES];
+	struct zonestat *zdst;
+	struct zone *zsrc;
+	int type, order;
+
+	memset(zones, 0, sizeof(zones));	/* sums start at 0, names at NULL */
+
+	for_each_online_pgdat (pgdat) {
+		for (type = 0; type < MAX_NR_ZONES; type++) {
+			unsigned long flags;
+
+			zdst = &zones[type];
+			zsrc = pgdat->node_zones + type;
+			if (!zsrc || !zsrc->name)
+				continue;
+
+			if (!zdst->name)
+				zdst->name = zsrc->name;
+			else if (strcmp(zsrc->name, zdst->name))
+				/* This shouldn't happen! */
+				printk(KERN_WARNING "Warning: names mismatch for "
+					"zone %d: %s != %s\n",
+					type, zsrc->name, zdst->name);
+
+			spin_lock_irqsave(&zsrc->lock, flags);	/* free_area[] is read under zone->lock */
+			for (order = 0; order < MAX_ORDER; order++)
+				zdst->nr_free[order] += zsrc->free_area[order].nr_free;
+			spin_unlock_irqrestore(&zsrc->lock, flags);
+			/* zone_page_state() takes zone_stat_item: use the NR_ZONE_* LRU items */
+			zdst->nr_active     += zone_page_state(zsrc, NR_ZONE_ACTIVE_ANON) +
+						zone_page_state(zsrc, NR_ZONE_ACTIVE_FILE);
+			zdst->nr_inactive   += zone_page_state(zsrc, NR_ZONE_INACTIVE_ANON) +
+						zone_page_state(zsrc, NR_ZONE_INACTIVE_FILE);
+			zdst->pages_min     += min_wmark_pages(zsrc);
+			zdst->pages_low     += low_wmark_pages(zsrc);
+			zdst->pages_high    += high_wmark_pages(zsrc);
+			zdst->present_pages += zsrc->present_pages;
+			zdst->free_pages    += zone_page_state(zsrc, NR_FREE_PAGES);
+		}
+	}
+	mem_free_areas_show_zonestat(m, zones);
+}
+
+static void mem_fails_show(struct seq_file *m, void *v)	/* print summed alloc_fails; 'v' is unused */
+{
+	int i, cpu;
+	unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
+
+	memset(alloc_fails, 0, sizeof(alloc_fails));
+	for_each_online_cpu(cpu)	/* only online CPUs contribute to the sums */
+		for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+			alloc_fails[i] += kstat_glob.alloc_fails[cpu][i];
+
+	seq_puts(m, "\nMemory fails:\n");
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)	/* one row per alloc_descr[] label */
+		seq_printf(m, "%-11s %20lu\n", alloc_descr[i],
+				alloc_fails[i]);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Memory management profiling
+ * ------------------------------------------------------------------------
+ */
+static void KSTAT_PERF_UPDATE(struct kstat_perf_pcpu_struct *p)	/* rebuild p->last from per-cpu p->cur */
+{
+	unsigned i, cpu;
+	struct kstat_perf_pcpu_snap_struct snap, *cur;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		do {	/* seqcount read side: retry until 'snap' is a consistent copy */
+			i = read_seqcount_begin(&cur->lock);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(&cur->lock, i));
+
+		if (p->last.wall_maxdur < snap.wall_maxdur)	/* maxima: largest over all CPUs */
+			p->last.wall_maxdur = snap.wall_maxdur;
+		if (p->last.cpu_maxdur < snap.cpu_maxdur)
+			p->last.cpu_maxdur = snap.cpu_maxdur;
+		cur->wall_maxdur = cur->cpu_maxdur = 0;	/* only maxima are reset; written outside the seqcount */
+
+		p->last.count += snap.count;	/* counts and totals: summed over CPUs, never reset here */
+		p->last.wall_tottime += snap.wall_tottime;
+		p->last.cpu_tottime += snap.cpu_tottime;
+	}
+}
+
+static void update_mmperf(void)	/* refresh 'last' for every counter shown by mmperf_seq_show() */
+{
+	KSTAT_PERF_UPDATE(&kstat_glob.ttfp);
+	KSTAT_PERF_UPDATE(&kstat_glob.cache_reap);
+	KSTAT_PERF_UPDATE(&kstat_glob.refill_inact);
+	KSTAT_PERF_UPDATE(&kstat_glob.shrink_icache);
+	KSTAT_PERF_UPDATE(&kstat_glob.shrink_dcache);
+}
+
+static void perf_seq_show(struct seq_file *m,	/* one row of p->last, CPU column before wall column */
+		const char *name,
+		struct kstat_perf_pcpu_struct *p)
+{
+	seq_printf(m, "%-14s %10lu %20Lu %20Lu %20Lu %20Lu\n",
+			name,
+			p->last.count,
+			p->last.cpu_maxdur,
+			p->last.wall_maxdur,
+			p->last.cpu_tottime,
+			p->last.wall_tottime);
+}
+
+static int mmperf_seq_show(struct seq_file *m, void *v)	/* ->show of mmperf_seq_op */
+{
+	if (!v)	/* nothing to show without a record */
+		return 0;
+	seq_puts(m, "Version: 2.5.1\n");
+	seq_printf(m, "%-14s %10s %20s %20s %20s %20s\n",	/* header; order matches perf_seq_show() */
+			"Type",
+			"Count",
+			"CPU_max_dur",
+			"Wall_max_dur",
+			"CPU_tot_time",
+			"Wall_tot_time");
+	perf_seq_show(m, "ttfp:", &kstat_glob.ttfp);
+	perf_seq_show(m, "cache_reap:", &kstat_glob.cache_reap);
+	perf_seq_show(m, "refill_inact:", &kstat_glob.refill_inact);
+	perf_seq_show(m, "shrink_icache:", &kstat_glob.shrink_icache);
+	perf_seq_show(m, "shrink_dcache:", &kstat_glob.shrink_dcache);
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Main loop
+ * ------------------------------------------------------------------------
+ */
+static int vzstat_mon_loop(void* data)	/* kthread body: refresh stats, sleep LOAD_FREQ, repeat */
+{
+	while (!kthread_should_stop()) {
+		try_to_freeze();
+#ifdef CONFIG_VE_KERNEL_CSUM
+		kernel_text_csum_check();
+#endif
+		update_alloc_latency();
+		update_memory();
+		update_mmperf();
+
+		set_current_state(TASK_INTERRUPTIBLE);	/* set before the check so a stop wakeup is not lost */
+		if (!kthread_should_stop())
+			schedule_timeout(LOAD_FREQ);
+		__set_current_state(TASK_RUNNING);	/* never leave the loop in a sleeping state */
+	}
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * default sequential files methods
+ * ------------------------------------------------------------------------
+ */
+static void *empty_seq_start(struct seq_file *m, loff_t *pos)	/* yields one record, at position 0 */
+{
+	if (*pos == 0)
+		return (void*)1;	/* dummy non-NULL token; the show callbacks only test it for NULL */
+	else
+		return NULL;
+}
+
+static void *empty_seq_next(struct seq_file *m, void *v, loff_t *pos)	/* no second record */
+{
+	(*pos)++;	/* still advance the position, as a ->next callback must */
+	return NULL;
+}
+
+static void empty_seq_stop(struct seq_file *m, void *v)	/* nothing to release */
+{
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/latency sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static const struct seq_operations latency_seq_op = {	/* single-record iterator + latency show */
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= latency_seq_show,
+};
+
+static int latency_open(struct inode *inode, struct file *file)	/* returns seq_open()'s result */
+{
+	return seq_open(file, &latency_seq_op);
+}
+
+static const struct proc_ops proc_latency_operations = {	/* used for the "latency" proc entry */
+	.proc_open = latency_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = seq_release,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/stats sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static int stats_seq_show(struct seq_file *m, void *v)	/* ->show of stats_seq_op */
+{
+	if (!v)	/* nothing to show without a record */
+		return 0;
+	seq_puts(m, "Version: 2.6\n");
+	cycles_per_jiffy_show(m, v);
+	jiffies_per_second_show(m, v);
+	seq_puts(m, "\nLoad info:\n");
+	task_counts_seq_show(m, v);
+	seq_puts(m, "\nMemory info:\n");
+	kernel_text_csum_seq_show(m, v);
+	swap_cache_seq_show(m, v);
+	mem_free_areas_show(m, v);
+	mem_avg_show(m, v);
+	mem_fails_show(m, v);	/* prints its own "Memory fails:" heading */
+	return 0;
+}
+
+static const struct seq_operations stats_seq_op = {	/* single-record iterator + stats show */
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= stats_seq_show,
+};
+
+static int stats_open(struct inode *inode, struct file *file)	/* returns seq_open()'s result */
+{
+	return seq_open(file, &stats_seq_op);
+}
+
+static const struct proc_ops proc_stats_operations = {	/* used for the "stats" proc entry */
+	.proc_open = stats_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = seq_release,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/mmperf sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static const struct seq_operations mmperf_seq_op = {	/* single-record iterator + mmperf show */
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= mmperf_seq_show,
+};
+
+static int mmperf_open(struct inode *inode, struct file *file)	/* returns seq_open()'s result */
+{
+	return seq_open(file, &mmperf_seq_op);
+}
+
+static const struct proc_ops proc_mmperf_operations = {	/* used for the "mmperf" proc entry */
+	.proc_open = mmperf_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = seq_release,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * module init/exit code
+ * ------------------------------------------------------------------------
+ */
+
+int __init vzstat_mon_init(void)	/* create three proc entries, then start the "vzstat" kthread */
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("latency", S_IRUGO, proc_vz_dir, &proc_latency_operations);
+	if (entry == NULL) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_lat;
+	}
+
+	entry = proc_create("stats", S_IRUGO, proc_vz_dir, &proc_stats_operations);
+	if (!entry) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_stat;
+	}
+
+	entry = proc_create("mmperf", S_IRUGO, proc_vz_dir, &proc_mmperf_operations);
+	if (!entry) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_perf;
+	}
+
+	vzstat_thread_tsk = kthread_run(vzstat_mon_loop, NULL, "vzstat");
+	if (IS_ERR(vzstat_thread_tsk))	/* NOTE(review): PTR_ERR() is dropped, -EBUSY is returned instead */
+		goto fail_thread;
+
+	printk(KERN_INFO "VZSTAT: initialized successfully\n");
+
+	return 0;
+
+fail_thread:	/* unwind in reverse creation order; each label falls through to the next */
+	remove_proc_entry("mmperf", proc_vz_dir);
+fail_perf:
+	remove_proc_entry("stats", proc_vz_dir);
+fail_stat:
+	remove_proc_entry("latency", proc_vz_dir);
+fail_lat:
+	return -EBUSY;	/* the same error code for every failure above */
+}
+
+void __exit vzstat_mon_exit(void)	/* undo vzstat_mon_init() */
+{
+	kthread_stop(vzstat_thread_tsk);	/* stop the updater thread before removing the files */
+
+	remove_proc_entry("mmperf", proc_vz_dir);
+	remove_proc_entry("stats", proc_vz_dir);
+	remove_proc_entry("latency", proc_vz_dir);
+}
+
+module_init(vzstat_mon_init);
+module_exit(vzstat_mon_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel at openvz.org>");
diff --git a/kernel/ve/vzstat_core.c b/kernel/ve/vzstat_core.c
new file mode 100644
index 000000000000..26fe8fba2e5b
--- /dev/null
+++ b/kernel/ve/vzstat_core.c
@@ -0,0 +1,103 @@
+/*
+ *  kernel/ve/vzstat_core.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *  Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#include <linux/sched/loadavg.h>
+#include <linux/vzstat.h>
+#include <linux/sched.h>
+
+void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time, u64 cpu_time)	/* account one sample */
+{
+	struct kstat_perf_pcpu_snap_struct *cur = get_cpu_ptr(ptr->cur);	/* this CPU's slot; paired with put_cpu_ptr() */
+
+	write_seqcount_begin(&cur->lock);	/* write side of the seqcount that KSTAT_PERF_UPDATE() reads */
+	cur->count++;
+	if (cur->wall_maxdur < real_time)
+		cur->wall_maxdur = real_time;
+	cur->wall_tottime += real_time;
+	if (cur->cpu_maxdur < cpu_time)
+		cur->cpu_maxdur = cpu_time;
+	cur->cpu_tottime += cpu_time;
+	write_seqcount_end(&cur->lock);
+	put_cpu_ptr(cur);
+}
+
+/*
+ * Must be called with interrupts disabled, so that no interrupt can hit
+ * while a lock or seqcount is write-held; this avoids a 3-way deadlock:
+ *
+ * timer interrupt:
+ *	write_seqlock(&xtime_lock);
+ *	 spin_lock_irqsave(&kstat_glb_lock);
+ *
+ * update_schedule_latency():
+ *	spin_lock_irq(&kstat_glb_lock);
+ *	 read_seqcount_begin(&cur->lock)
+ *
+ * some-interrupt during KSTAT_LAT_PCPU_ADD()
+ *   KSTAT_LAT_PCPU_ADD()
+ *    write_seqcount_begin(&cur->lock);
+ *     <interrupt>
+ *      ktime_get()
+ *       read_seqcount_begin(&xtime_lock);
+ */
+void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur)	/* account one latency sample 'dur' */
+{
+	struct kstat_lat_pcpu_snap_struct *cur;
+	seqcount_t *seq;
+
+	cur = this_cpu_ptr(p->cur);	/* this CPU's counters */
+	seq = this_cpu_ptr(&kstat_pcpu_seq);	/* per-cpu seqcount, not specific to 'p' */
+
+	write_seqcount_begin(seq);
+	cur->count++;
+	if (cur->maxlat < dur)
+		cur->maxlat = dur;
+	cur->totlat += dur;
+	write_seqcount_end(seq);
+}
+
+/*
+ * Move current statistics to last, clear last.
+ * Serialization is the caller's responsibility.
+ */
+void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)	/* rebuild p->last, then update p->avg[] */
+{
+	struct kstat_lat_pcpu_snap_struct snap, *cur;
+	unsigned i, cpu;
+	seqcount_t *seq;
+	u64 m;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		seq = per_cpu_ptr(&kstat_pcpu_seq, cpu);
+		do {	/* seqcount read side: retry until 'snap' is a consistent copy */
+			i = read_seqcount_begin(seq);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(seq, i));
+		/*
+		 * The read above and this update of maxlat are not atomic,
+		 * but this is OK, since it happens rarely and losing
+		 * a couple of peaks is not essential. xemul
+		 */
+		cur->maxlat = 0;
+
+		p->last.count += snap.count;	/* count and totlat are summed over CPUs */
+		p->last.totlat += snap.totlat;
+		if (p->last.maxlat < snap.maxlat)	/* maxlat is the largest over all CPUs */
+			p->last.maxlat = snap.maxlat;
+	}
+
+	m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);	/* larger of the two maxima */
+	p->avg[0] = calc_load(p->avg[0], EXP_1, m);
+	p->avg[1] = calc_load(p->avg[1], EXP_5, m);
+	p->avg[2] = calc_load(p->avg[2], EXP_15, m);
+	/* reset max_snap to calculate it correctly next time */
+	p->max_snap = 0;
+}
+EXPORT_SYMBOL(KSTAT_LAT_PCPU_UPDATE);


More information about the Devel mailing list