[Devel] [PATCH RHEL8 COMMIT] ve/page_alloc, kstat: account allocation latencies per-task and per-thread
Konstantin Khorenko
khorenko at virtuozzo.com
Mon May 24 20:10:44 MSK 2021
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.32
------>
commit f221a0258c4affb96e91a44037ef78636ed492a6
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Mon May 24 19:38:26 2021 +0300
ve/page_alloc, kstat: account allocation latencies per-task and per-thread
Vstorage wants per-process allocation latencies:
- total accumulated latency (total time spent inside the kernel allocator)
- total alloc attempts (so that average latency can be calculated)
This adds /proc/<pid>/vz_latency file which outputs the numbers:
Type Total_lat Calls
allocatomic: 0 1334
alloc: 8000000 36643
allocmp: 0 919
https://jira.sw.ru/browse/PSBM-81395
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Cc: Pavel Borzenkov <pborzenkov at virtuozzo.com>
(cherry-picked from vz7 commit 6d9a9210395e ("ve/page_alloc, kstat: account
allocation latencies per-task"))
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
+++
ve/kstat/alloc_lat: Initialize alloc_lat to zero at start
It seems that 'struct task_struct' is not zero-initialized after
allocation, so we need to initialize alloc_lat explicitly.
https://jira.sw.ru/browse/PSBM-81395
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
(cherry-picked from vz7 commit 82ddc4c43f2d ("ve/kstat/alloc_lat: Initialize
alloc_lat to zero at start"))
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
+++
ve/fs/proc: Make per-thread and per-process allocation latencies.
Follow-up for 6d9a9210395e ("ve/page_alloc, kstat: account allocation latencies per-task")
Make per-thread and per-process allocation latencies:
- /proc/<pid>/vz_latency - cumulative for a thread group
- /proc/<pid>/task/<tid>/vz_latency - thread-specific
During allocation we collect per-thread latency. When a thread dies,
it adds its own latencies to the shared task->signal->alloc_lat array.
/proc/<pid>/vz_latency sums the allocation latencies of all live threads
plus the latencies of already dead threads from task->signal->alloc_lat.
https://jira.sw.ru/browse/PSBM-81395
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Cc: Pavel Borzenkov <Pavel.Borzenkov at acronis.com>
Rebase to vz8:
- signal_struct has moved from sched.h to sched/signal.h, so the alloc_lat field is now added there
(cherry-picked from vz7 commit c4cb66d5e706 ("ve/fs/proc: Make per-thread and
per-process allocation latencies."))
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
+++
vz_latency: don't account allocations in interrupts to random tasks
When we are in interrupt context, 'current' is just some random task. We
shouldn't account the latency of atomic allocations made there to that task.
Use the in_task() macro to identify task context, and account per-task
latency only if we are in task context.
https://jira.sw.ru/browse/PSBM-87797
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Reviewed-by: Denis V. Lunev <den at openvz.org>
(cherry-picked from vz7 commit 3ed23cb6c686 ("vz_latency: don't account
allocations in interrupts to random tasks"))
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
fs/proc/base.c | 79 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/sched.h | 5 +++
include/linux/sched/signal.h | 3 ++
kernel/exit.c | 16 +++++++++
kernel/fork.c | 4 +++
mm/page_alloc.c | 6 ++++
6 files changed, 113 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f931b1c0f902..96db35fa0443 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -56,6 +56,7 @@
#include <linux/stat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
+#include <linux/kstat.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
@@ -524,6 +525,78 @@ static const struct file_operations proc_lstats_operations = {
#endif
+#ifdef CONFIG_VE
+/*
+ * Emit one row of a vz_latency table: the left-aligned label @name, then
+ * the accumulated latency (snap->totlat) and the call count (snap->count)
+ * taken from @snap.
+ */
+static void lastlat_seq_show(struct seq_file *m,
+			     const char *name,
+			     struct kstat_lat_snap_struct *snap)
+{
+	/*
+	 * "%llu" is the ISO C spelling of the unsigned long long conversion
+	 * that the former "%Lu" asked for; the printed text is unchanged.
+	 */
+	seq_printf(m, "%-12s %20llu %20lu\n", name,
+		   snap->totlat, snap->count);
+}
+/*
+ * Row labels and the matching allocation-type indices for the vz_latency
+ * files. The two tables are parallel: alloc_descr[i] is the label printed
+ * for the counters stored at index alloc_types[i]. Keep them the same
+ * length and in the same order.
+ *
+ * Both the strings and the pointer array are const so that the whole
+ * label table is read-only.
+ */
+static const char * const alloc_descr[] = {
+	"allocatomic:",
+	"alloc:",
+	"allocmp:",
+};
+static const int alloc_types[] = {
+	KSTAT_ALLOCSTAT_ATOMIC,
+	KSTAT_ALLOCSTAT_LOW,
+	KSTAT_ALLOCSTAT_LOW_MP,
+};
+
+/*
+ * Show handler for the per-thread vz_latency file: prints the header line
+ * and then one row per entry of alloc_types[], read from task->alloc_lat[].
+ * Always returns 0.
+ */
+static int proc_tid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task)
+{
+	int idx = 0;
+
+	seq_printf(m, "%-12s %20s %20s\n",
+		   "Type", "Total_lat", "Calls");
+
+	while (idx < ARRAY_SIZE(alloc_types)) {
+		struct kstat_lat_snap_struct *snap;
+
+		/* alloc_types[] maps the table row to its kstat slot */
+		snap = &task->alloc_lat[alloc_types[idx]];
+		lastlat_seq_show(m, alloc_descr[idx], snap);
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Show handler for the thread-group vz_latency file.
+ *
+ * For every entry of alloc_types[] this prints the sum of
+ *   - the per-task counters of @task and of every other thread reached by
+ *     walking its thread group, and
+ *   - task->signal->alloc_lat[], the accumulator that kstat_add_dying()
+ *     adds an exiting thread's counters to.
+ *
+ * Always returns 0.
+ */
+static int proc_tgid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
+			    struct pid *pid, struct task_struct *task)
+{
+	u64 lat[ARRAY_SIZE(alloc_types)] = { 0 };
+	u64 count[ARRAY_SIZE(alloc_types)] = { 0 };
+	unsigned long flags;
+	int i;
+
+	if (lock_task_sighand(task, &flags)) {
+		struct task_struct *t = task;
+
+		/*
+		 * Sample every thread (@task included) and the group
+		 * accumulator under siglock. kstat_add_dying() updates the
+		 * accumulator under the same lock, so the snapshot cannot
+		 * interleave with it.
+		 */
+		do {
+			for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
+				lat[i] += t->alloc_lat[alloc_types[i]].totlat;
+				count[i] += t->alloc_lat[alloc_types[i]].count;
+			}
+		} while_each_thread(task, t);
+
+		/* Use @task directly rather than the loop cursor. */
+		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
+			lat[i] += task->signal->alloc_lat[alloc_types[i]].totlat;
+			count[i] += task->signal->alloc_lat[alloc_types[i]].count;
+		}
+		unlock_task_sighand(task, &flags);
+	} else {
+		/*
+		 * lock_task_sighand() failed: fall back to the counters of
+		 * @task alone, which is what was reported in this case before.
+		 */
+		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
+			lat[i] = task->alloc_lat[alloc_types[i]].totlat;
+			count[i] = task->alloc_lat[alloc_types[i]].count;
+		}
+	}
+
+	seq_printf(m, "%-12s %20s %20s\n",
+		   "Type", "Total_lat", "Calls");
+
+	/* lat[] and count[] are u64, hence %llu for both columns */
+	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
+		seq_printf(m, "%-12s %20llu %20llu\n", alloc_descr[i],
+			   lat[i], count[i]);
+
+	return 0;
+}
+#endif
+
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3048,6 +3121,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+#ifdef CONFIG_VE
+ ONE("vz_latency", S_IRUGO, proc_tgid_vz_lat),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3429,6 +3505,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+#ifdef CONFIG_VE
+ ONE("vz_latency", S_IRUGO, proc_tid_vz_lat),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76cc39785a56..72b3f40623b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -32,6 +32,7 @@
#include <linux/posix-timers.h>
#include <linux/rseq.h>
#include <linux/rh_kabi.h>
+#include <linux/kstat.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -1124,6 +1125,10 @@ struct task_struct {
struct tlbflush_unmap_batch tlb_ubc;
+#ifdef CONFIG_VE
+ struct kstat_lat_snap_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
+#endif
+
RH_KABI_REPLACE(struct rcu_head rcu, union {
refcount_t rcu_users;
struct rcu_head rcu;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 57cf2c5e4a70..adfe9cc9bd88 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -211,6 +211,9 @@ struct signal_struct {
#ifdef CONFIG_TASKSTATS
struct taskstats *stats;
#endif
+#ifdef CONFIG_VE
+ struct kstat_lat_snap_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
+#endif
#ifdef CONFIG_AUDIT
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
diff --git a/kernel/exit.c b/kernel/exit.c
index 0922ca937c11..8fde663ad874 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -723,6 +723,20 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+/*
+ * Fold the allocation-latency counters of the exiting thread @tsk into the
+ * thread-group accumulator tsk->signal->alloc_lat[].
+ *
+ * The per-task counters are cleared once they have been folded.
+ * proc_tgid_vz_lat() adds up the accumulator and the counters of every
+ * thread it finds while walking the group, and an exiting thread can still
+ * be found by that walk after this function has run; without the reset the
+ * same numbers would be reported twice. As a consequence, the per-thread
+ * vz_latency file of such a thread only shows what was accounted after the
+ * fold.
+ *
+ * Both updates happen under siglock, the lock the reader takes.
+ * Compiles to an empty function when CONFIG_VE is not set.
+ */
+void kstat_add_dying(struct task_struct *tsk)
+{
+#ifdef CONFIG_VE
+	int i;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+		tsk->signal->alloc_lat[i].totlat += tsk->alloc_lat[i].totlat;
+		tsk->signal->alloc_lat[i].count += tsk->alloc_lat[i].count;
+
+		/* Now part of the group totals; do not count it again. */
+		tsk->alloc_lat[i].totlat = 0;
+		tsk->alloc_lat[i].count = 0;
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+#endif
+}
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
@@ -791,6 +805,8 @@ void __noreturn do_exit(long code)
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+ } else {
+ kstat_add_dying(tsk);
}
acct_collect(code, group_dead);
if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index c996de548127..61175de283cf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -919,6 +919,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
+#ifdef CONFIG_VE
+ memset(tsk->alloc_lat, 0, sizeof(tsk->alloc_lat));
+#endif
+
/*
* One for the user space visible state that goes away when reaped.
* One for the scheduler.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f5c6d156fbf..e430fda431ae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4536,6 +4536,12 @@ static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
local_irq_save(flags);
cpu = smp_processor_id();
KSTAT_LAT_PCPU_ADD(&kstat_glob.alloc_lat[ind], time);
+
+ if (in_task()) {
+ current->alloc_lat[ind].totlat += time;
+ current->alloc_lat[ind].count++;
+ }
+
if (!page)
kstat_glob.alloc_fails[cpu][ind]++;
local_irq_restore(flags);
More information about the Devel
mailing list