[Devel] [PATCH RHEL7 COMMIT] oom: resurrect berserker mode
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Oct 15 06:53:03 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.8.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.8.6
------>
commit e651315e4475767b41a7e028c6127b25c5754312
Author: Vladimir Davydov <vdavydov at parallels.com>
Date: Thu Oct 15 17:53:03 2015 +0400
oom: resurrect berserker mode
Patchset description: oom enhancements - part 2
- Patches 1-2 prepare memcg for upcoming changes in oom design.
- Patch 3 reworks oom locking design so that the executioner waits for
the victim to exit. This is necessary to increase oom kill rate, which is
essential for berserker mode.
- Patch 4 drops unused OOM_SCAN_ABORT
- Patch 5 introduces oom timeout.
https://jira.sw.ru/browse/PSBM-38581
- Patch 6 makes oom fairer when it comes to selecting a victim among
different containers.
https://jira.sw.ru/browse/PSBM-37915
- Patch 7 prepares oom for introducing berserker mode
- Patch 8 resurrects oom berserker mode, which is supposed to cope with
actively forking processes.
https://jira.sw.ru/browse/PSBM-17930
https://jira.sw.ru/browse/PSBM-26973
Changes in v3:
- rework oom_trylock (patch 3)
- select exiting process instead of aborting oom scan so as not to keep
busy-waiting for an exiting process to exit (patches 3, 4)
- cleanup oom timeout handling + fix stuck process trace dumped
multiple times on timeout (patch 5)
- set max_overdraft to ULONG_MAX on selected processes (patch 6)
- rework oom berserker process selection logic (patches 7, 8)
Changes in v2:
- s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
- propagate victim to the context that initiated oom in oom_unlock
(patch 6)
- always set oom_end on releasing oom context (patch 6)
Vladimir Davydov (8):
memcg: add mem_cgroup_get/put helpers
memcg: add lock for protecting memcg->oom_notify list
oom: rework locking design
oom: introduce oom timeout
oom: drop OOM_SCAN_ABORT
oom: rework logic behind memory.oom_guarantee
oom: pass points and overdraft to oom_kill_process
oom: resurrect berserker mode
Reviewed-by: Kirill Tkhai <ktkhai at odin.com>
=========================================
This patch description:
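The logic behind the OOM berserker is the same as in PCS6: if processes
are killed by the oom killer too often (a new oom starts less than sysctl
vm.oom_relaxation, 1 sec by default, after the previous one ended), we
increase "rage" (min -10, max 20); otherwise "rage" is reset to its minimum.
If "rage" >= 0, we additionally kill up to 1 << "rage" of the youngest
processes that are at least as bad as the victim.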
https://jira.sw.ru/browse/PSBM-17930
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
include/linux/oom.h | 3 ++
kernel/sysctl.c | 7 ++++
mm/oom_kill.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 116 insertions(+)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6ea83b2..acf58fc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -35,7 +35,9 @@ struct oom_context {
struct task_struct *victim;
bool marked;
unsigned long oom_start;
+ unsigned long oom_end;
unsigned long overdraft;
+ int rage;
wait_queue_head_t waitq;
};
@@ -126,4 +128,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
+extern int sysctl_oom_relaxation;
#endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 976f48c..9c081e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1184,6 +1184,13 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "oom_relaxation",
+ .data = &sysctl_oom_relaxation,
+ .maxlen = sizeof(sysctl_oom_relaxation),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
.procname = "overcommit_ratio",
.data = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d8a89c0..6d16154 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,13 +42,18 @@
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
static DEFINE_SPINLOCK(oom_context_lock);
#define OOM_TIMEOUT (5 * HZ)
+#define OOM_BASE_RAGE -10
+#define OOM_MAX_RAGE 20
+
#ifndef CONFIG_MEMCG
struct oom_context oom_ctx = {
+ .rage = OOM_BASE_RAGE,
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
};
#endif
@@ -59,6 +64,8 @@ void init_oom_context(struct oom_context *ctx)
ctx->victim = NULL;
ctx->marked = false;
ctx->oom_start = 0;
+ ctx->oom_end = 0;
+ ctx->rage = OOM_BASE_RAGE;
init_waitqueue_head(&ctx->waitq);
}
@@ -67,6 +74,7 @@ static void __release_oom_context(struct oom_context *ctx)
ctx->owner = NULL;
ctx->victim = NULL;
ctx->marked = false;
+ ctx->oom_end = jiffies;
wake_up_all(&ctx->waitq);
}
@@ -690,6 +698,102 @@ void oom_unlock(struct mem_cgroup *memcg)
mem_cgroup_put(victim_memcg);
}
+/*
+ * Kill up to (1 << rage) more tasks if oom happens too often in this context.
+ */
+static void oom_berserker(unsigned long points, unsigned long overdraft,
+ unsigned long totalpages, struct mem_cgroup *memcg,
+ nodemask_t *nodemask)
+{
+ static DEFINE_RATELIMIT_STATE(berserker_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct oom_context *ctx;
+ struct task_struct *p;
+ int rage;
+ int killed = 0;
+
+ spin_lock(&oom_context_lock);
+ ctx = mem_cgroup_oom_context(memcg);
+ if (ctx->owner != current) {
+ /* Someone else owns the context now (lost it on timeout): bail out */
+ spin_unlock(&oom_context_lock);
+ return;
+ }
+ /*
+ * Bump rage (up to OOM_MAX_RAGE) if this oom started within oom_relaxation
+ * of the previous oom's end in this context, reset it to base otherwise.
+ *
+ * previous oom this oom (unfinished)
+ * ++++++++++++----------------------------++++++++
+ * ^ ^
+ * oom_end <<oom_relaxation>> oom_start
+ */
+ if (time_after(ctx->oom_start, ctx->oom_end + sysctl_oom_relaxation))
+ ctx->rage = OOM_BASE_RAGE;
+ else if (ctx->rage < OOM_MAX_RAGE)
+ ctx->rage++;
+ rage = ctx->rage;
+ spin_unlock(&oom_context_lock);
+
+ if (rage < 0)
+ return;
+
+ /*
+ * So, we are in rage. Walk the task list backwards (youngest first) and
+ * kill up to (1 << rage) tasks that are at least as bad as the victim.
+ */
+ read_lock(&tasklist_lock);
+ list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+ unsigned long tsk_points;
+ unsigned long tsk_overdraft;
+
+ if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+ fatal_signal_pending(p) || p->flags & PF_EXITING ||
+ oom_unkillable_task(p, memcg, nodemask))
+ continue;
+
+ tsk_points = oom_badness(p, memcg, nodemask, totalpages,
+ &tsk_overdraft);
+ if (tsk_overdraft < overdraft)
+ continue;
+
+ /*
+ * oom_badness never returns a negative value, even if
+ * oom_score_adj would make badness so; instead it
+ * returns 1. So we do not kill a task with badness <= 1
+ * if the victim has badness > 1, so as not to risk
+ * killing protected tasks.
+ */
+ if (tsk_points <= 1 && points > 1)
+ continue;
+
+ /*
+ * Consider tasks as equally bad if they have equal scores
+ * normalized to per mille of totalpages; skip less bad ones.
+ */
+ if (tsk_points * 1000 / totalpages <
+ points * 1000 / totalpages)
+ continue;
+
+ if (__ratelimit(&berserker_rs)) {
+ task_lock(p);
+ pr_err("Rage kill process %d (%s)\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
+ }
+
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ mem_cgroup_note_oom_kill(memcg, p);
+
+ if (++killed >= 1 << rage)
+ break;
+ }
+ read_unlock(&tasklist_lock);
+
+ pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
@@ -805,6 +909,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mem_cgroup_note_oom_kill(memcg, victim);
put_task_struct(victim);
+
+ oom_berserker(points, overdraft, totalpages, memcg, nodemask);
}
#undef K
More information about the Devel
mailing list