[Devel] [PATCH rh7 6/6] oom: resurrect berserker mode

Vladimir Davydov vdavydov at virtuozzo.com
Thu Jan 28 07:00:15 PST 2016


From: Vladimir Davydov <vdavydov at parallels.com>

Patchset description: oom enhancements - part 2

 - Patches 1-2 prepare memcg for upcoming changes in oom design.
 - Patch 3 reworks oom locking design so that the executioner waits for
   victim to exit. This is necessary to increase oom kill rate, which is
   essential for berserker mode.
 - Patch 4 drops unused OOM_SCAN_ABORT.
 - Patch 5 introduces oom timeout.
   https://jira.sw.ru/browse/PSBM-38581
 - Patch 6 makes oom fairer when it comes to selecting a victim among
   different containers.
   https://jira.sw.ru/browse/PSBM-37915
 - Patch 7 prepares oom for introducing berserker mode.
 - Patch 8 resurrects oom berserker mode, which is supposed to cope with
   actively forking processes.
   https://jira.sw.ru/browse/PSBM-17930

https://jira.sw.ru/browse/PSBM-26973

Changes in v3:
 - rework oom_trylock (patch 3)
 - select exiting process instead of aborting oom scan so as not to keep
   busy-waiting for an exiting process to exit (patches 3, 4)
 - cleanup oom timeout handling + fix stuck process trace dumped
   multiple times on timeout (patch 5)
 - set max_overdraft to ULONG_MAX on selected processes (patch 6)
 - rework oom berserker process selection logic (patches 7, 8)

Changes in v2:
 - s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
 - propagate victim to the context that initiated oom in oom_unlock
   (patch 6)
 - always set oom_end on releasing oom context (patch 6)

Vladimir Davydov (8):
  memcg: add mem_cgroup_get/put helpers
  memcg: add lock for protecting memcg->oom_notify list
  oom: rework locking design
  oom: introduce oom timeout
  oom: drop OOM_SCAN_ABORT
  oom: rework logic behind memory.oom_guarantee
  oom: pass points and overdraft to oom_kill_process
  oom: resurrect berserker mode

Reviewed-by: Kirill Tkhai <ktkhai at odin.com>

=========================================
This patch description:

The logic behind the OOM berserker is the same as in PCS6: if the oom
killer fires again within sysctl vm.oom_relaxation of the previous kill
in the same context (1 sec by default), we increase that context's
"rage" (starting at -10, capped at 20), otherwise we reset it. Whenever
rage >= 0, we additionally kill up to 1 << rage of the youngest
processes that are at least as bad as the current victim.

https://jira.sw.ru/browse/PSBM-17930
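
For illustration, a minimal user-space sketch (not part of the patch; the
helper name and the millisecond interface are made up) of how the rage
counter evolves and how it maps to the number of extra kills:

	#include <stdio.h>

	#define OOM_BASE_RAGE	-10
	#define OOM_MAX_RAGE	20

	/* Escalate if the previous oom kill was recent, reset otherwise. */
	static int update_rage(int rage, int oom_interval_ms, int relaxation_ms)
	{
		if (oom_interval_ms > relaxation_ms)
			return OOM_BASE_RAGE;
		return rage < OOM_MAX_RAGE ? rage + 1 : rage;
	}

	int main(void)
	{
		int rage = OOM_BASE_RAGE;
		int i;

		/* Eleven oom kills in quick succession (interval < vm.oom_relaxation). */
		for (i = 0; i < 11; i++)
			rage = update_rage(rage, 100, 1000);

		/* rage is now 1, so the berserker kills up to 1 << 1 = 2 extra tasks. */
		printf("rage=%d, extra kills=%d\n", rage, rage >= 0 ? 1 << rage : 0);
		return 0;
	}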

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/oom.h |   3 ++
 kernel/sysctl.c     |   7 ++++
 mm/oom_kill.c       | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+)
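
Note (illustration only, not part of the patch): in the selection loop
below, a candidate counts as "as bad as" the victim when the two badness
scores are equal after being normalized to thousandths of totalpages, so
tasks within the same 0.1%-of-RAM bucket are treated as equally bad. A
stand-alone sketch with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalpages = 1000000;	/* ~4G of 4K pages */
		unsigned long points = 500300;		/* victim's badness */
		unsigned long tsk_points = 500100;	/* candidate's badness */

		/* Both normalize to 500, so the candidate is eligible for a
		 * rage kill even though its raw score is slightly lower. */
		printf("victim=%lu candidate=%lu candidate_eligible=%d\n",
		       points * 1000 / totalpages,
		       tsk_points * 1000 / totalpages,
		       tsk_points * 1000 / totalpages >= points * 1000 / totalpages);
		return 0;
	}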

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6ea83b260aab..acf58fc0ce55 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -35,7 +35,9 @@ struct oom_context {
 	struct task_struct *victim;
 	bool marked;
 	unsigned long oom_start;
+	unsigned long oom_end;
 	unsigned long overdraft;
+	int rage;
 	wait_queue_head_t waitq;
 };
 
@@ -126,4 +128,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
+extern int sysctl_oom_relaxation;
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e218c23a083b..a32154d9e257 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1202,6 +1202,13 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "oom_relaxation",
+		.data		= &sysctl_oom_relaxation,
+		.maxlen		= sizeof(sysctl_oom_relaxation),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
 		.maxlen		= sizeof(sysctl_overcommit_ratio),
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a56c9a942beb..2402fcceda6e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,13 +42,18 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
 
 static DEFINE_SPINLOCK(oom_context_lock);
 
 #define OOM_TIMEOUT	(5 * HZ)
 
+#define OOM_BASE_RAGE	-10
+#define OOM_MAX_RAGE	20
+
 #ifndef CONFIG_MEMCG
 struct oom_context oom_ctx = {
+	.rage		= OOM_BASE_RAGE,
 	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
 };
 #endif
@@ -59,6 +64,8 @@ void init_oom_context(struct oom_context *ctx)
 	ctx->victim = NULL;
 	ctx->marked = false;
 	ctx->oom_start = 0;
+	ctx->oom_end = 0;
+	ctx->rage = OOM_BASE_RAGE;
 	init_waitqueue_head(&ctx->waitq);
 }
 
@@ -67,6 +74,7 @@ static void __release_oom_context(struct oom_context *ctx)
 	ctx->owner = NULL;
 	ctx->victim = NULL;
 	ctx->marked = false;
+	ctx->oom_end = jiffies;
 	wake_up_all(&ctx->waitq);
 }
 
@@ -690,6 +698,102 @@ void oom_unlock(struct mem_cgroup *memcg)
 	mem_cgroup_put(victim_memcg);
 }
 
+/*
+ * Kill more processes if oom happens too often in this context.
+ */
+static void oom_berserker(unsigned long points, unsigned long overdraft,
+			  unsigned long totalpages, struct mem_cgroup *memcg,
+			  nodemask_t *nodemask)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct oom_context *ctx;
+	struct task_struct *p;
+	int rage;
+	int killed = 0;
+
+	spin_lock(&oom_context_lock);
+	ctx = mem_cgroup_oom_context(memcg);
+	if (ctx->owner != current) {
+		/* Lost ownership on timeout */
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+	/*
+	 * Increase rage if oom happened recently in this context, reset
+	 * rage otherwise.
+	 *
+	 * previous oom                            this oom (unfinished)
+	 * ++++++++++++----------------------------++++++++
+	 *            ^                            ^
+	 *         oom_end  <<oom_relaxation>>  oom_start
+	 */
+	if (time_after(ctx->oom_start, ctx->oom_end + sysctl_oom_relaxation))
+		ctx->rage = OOM_BASE_RAGE;
+	else if (ctx->rage < OOM_MAX_RAGE)
+		ctx->rage++;
+	rage = ctx->rage;
+	spin_unlock(&oom_context_lock);
+
+	if (rage < 0)
+		return;
+
+	/*
+	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
+	 * as bad as the victim.
+	 */
+	read_lock(&tasklist_lock);
+	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+		unsigned long tsk_points;
+		unsigned long tsk_overdraft;
+
+		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+		    fatal_signal_pending(p) || p->flags & PF_EXITING ||
+		    oom_unkillable_task(p, memcg, nodemask))
+			continue;
+
+		tsk_points = oom_badness(p, memcg, nodemask, totalpages,
+					 &tsk_overdraft);
+		if (tsk_overdraft < overdraft)
+			continue;
+
+		/*
+		 * oom_badness never returns a negative value, even if
+		 * oom_score_adj would make badness so, instead it
+		 * returns 1. So we do not kill task with badness 1 if
+		 * the victim has badness > 1 so as not to risk killing
+		 * protected tasks.
+		 */
+		if (tsk_points <= 1 && points > 1)
+			continue;
+
+		/*
+		 * Consider tasks as equally bad if they have equal
+		 * normalized scores.
+		 */
+		if (tsk_points * 1000 / totalpages <
+		    points * 1000 / totalpages)
+			continue;
+
+		if (__ratelimit(&berserker_rs)) {
+			task_lock(p);
+			pr_err("Rage kill process %d (%s)\n",
+			       task_pid_nr(p), p->comm);
+			task_unlock(p);
+		}
+
+		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		mem_cgroup_note_oom_kill(memcg, p);
+
+		if (++killed >= 1 << rage)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
@@ -808,6 +912,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
 	mem_cgroup_note_oom_kill(memcg, victim);
 	put_task_struct(victim);
+
+	oom_berserker(points, overdraft, totalpages, memcg, nodemask);
 }
 #undef K
 
-- 
2.1.4


