[Devel] [PATCH rh7 v2 6/6] oom: resurrect berserker mode
Vladimir Davydov
vdavydov at parallels.com
Fri Sep 11 08:18:06 PDT 2015
The logic behind the OOM berserker is the same as in PCS6: if processes
are killed by the OOM killer too often (i.e. the interval between kills
is less than sysctl vm.oom_relaxation, 1 sec by default), we increase
the "rage" counter (clamped to min -10, max 20), and whenever "rage" >= 0
we additionally kill the 1 << "rage" youngest processes that are about
as bad as the original victim.
https://jira.sw.ru/browse/PSBM-17930
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
include/linux/memcontrol.h | 20 ++++++
include/linux/oom.h | 3 +
kernel/sysctl.c | 7 ++
mm/memcontrol.c | 11 ----
mm/oom_kill.c | 160 ++++++++++++++++++++++++++++++++++++++++++++-
5 files changed, 189 insertions(+), 12 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c856425ab7a..76a7dc8192fb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -78,6 +78,19 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page);
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg);
+
+static inline
+bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
+ rcu_read_unlock();
+ return ret;
+}
+
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
@@ -286,6 +299,13 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm
return NULL;
}
+static inline
+bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
{
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 4e12187663ed..445f6242ec9e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -35,7 +35,9 @@ struct oom_context {
struct task_struct *owner;
struct task_struct *victim;
unsigned long oom_start;
+ unsigned long oom_end;
unsigned long overdraft;
+ int rage;
wait_queue_head_t waitq;
};
@@ -125,4 +127,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
+extern int sysctl_oom_relaxation;
#endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 976f48c09748..9c081e3f350f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1184,6 +1184,13 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "oom_relaxation",
+ .data = &sysctl_oom_relaxation,
+ .maxlen = sizeof(sysctl_oom_relaxation),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
.procname = "overcommit_ratio",
.data = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 41fb41b16664..2b87dbc5c0cd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1474,17 +1474,6 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
- struct mem_cgroup *memcg)
-{
- bool ret;
-
- rcu_read_lock();
- ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
- rcu_read_unlock();
- return ret;
-}
-
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
int ret;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4290d6665429..9c990571713e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,13 +42,18 @@
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
static DEFINE_SPINLOCK(oom_context_lock);
#define OOM_TIMEOUT (5 * HZ)
+#define OOM_BASE_RAGE -10
+#define OOM_MAX_RAGE 20
+
#ifndef CONFIG_MEMCG
struct oom_context oom_ctx = {
+ .rage = OOM_BASE_RAGE,
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
};
#endif
@@ -58,6 +63,8 @@ void init_oom_context(struct oom_context *ctx)
ctx->owner = NULL;
ctx->victim = NULL;
ctx->oom_start = 0;
+ ctx->oom_end = 0;
+ ctx->rage = OOM_BASE_RAGE;
init_waitqueue_head(&ctx->waitq);
}
@@ -485,6 +492,7 @@ void mark_oom_victim(struct task_struct *tsk)
*/
void exit_oom_victim(void)
{
+ unsigned long now = jiffies;
struct mem_cgroup *iter;
struct oom_context *ctx;
@@ -499,13 +507,143 @@ void exit_oom_victim(void)
ctx = mem_cgroup_oom_context(iter);
if (ctx->victim == current) {
ctx->victim = NULL;
- if (!ctx->owner)
+ if (!ctx->owner) {
+ ctx->oom_end = now;
wake_up_all(&ctx->waitq);
+ }
}
} while ((iter = mem_cgroup_iter(NULL, iter, NULL)));
spin_unlock(&oom_context_lock);
}
+static void oom_berserker(struct task_struct *victim, unsigned long totalpages,
+ struct mem_cgroup *root_memcg, nodemask_t *nodemask)
+{
+ static DEFINE_RATELIMIT_STATE(berserker_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ unsigned long now = jiffies;
+ struct mem_cgroup *memcg, *memcg_to_put = NULL, *target_memcg = NULL;
+ struct oom_context *ctx;
+ struct task_struct *p;
+ unsigned long victim_overdraft;
+ unsigned long victim_points;
+ int rage = -1;
+ int killed = 0;
+
+ victim_points = oom_badness(victim, root_memcg, nodemask, totalpages,
+ &victim_overdraft);
+
+ /*
+ * Get the victim cgroup.
+ */
+ p = find_lock_task_mm(victim);
+ if (p) {
+ memcg = try_get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
+ /*
+ * The victim could have been moved to another cgroup
+ * since it was selected. If so, assume it belonged to
+ * root_memcg.
+ */
+ if (root_memcg &&
+ !mem_cgroup_same_or_subtree(root_memcg, memcg)) {
+ mem_cgroup_put(memcg);
+ memcg = root_memcg;
+ } else
+ memcg_to_put = memcg;
+ } else {
+ /*
+ * The victim has already freed its memory, so we can't
+ * get its score and hence should not start berserker,
+ * because the latter relies on it.
+ */
+ return;
+ }
+
+ /*
+ * Update berserker rage on each oom kill. Select oom context
+ * with the maximal positive rage if any.
+ */
+ spin_lock(&oom_context_lock);
+ while (1) {
+ ctx = mem_cgroup_oom_context(memcg);
+ if (time_after(now, ctx->oom_end + sysctl_oom_relaxation))
+ ctx->rage = OOM_BASE_RAGE;
+ else if (ctx->rage < OOM_MAX_RAGE)
+ ctx->rage++;
+ if (ctx->rage >= rage) {
+ target_memcg = memcg;
+ rage = ctx->rage;
+ }
+ if (memcg == root_memcg)
+ break;
+ memcg = parent_mem_cgroup(memcg);
+ /*
+ * Break the loop if there is no parent (i.e. we've just
+ * done with the root cgroup). Needed for the system
+ * wide oom case (root_memcg equals NULL).
+ */
+ if (!memcg)
+ break;
+ }
+ spin_unlock(&oom_context_lock);
+
+ if (rage < 0)
+ goto out;
+
+ /*
+ * So, we are in rage. Kill (1 << rage) youngest tasks that are
+ * as bad as the victim.
+ */
+ read_lock(&tasklist_lock);
+ list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+ unsigned long overdraft;
+ unsigned long points;
+
+ if (p == victim || !p->mm ||
+ fatal_signal_pending(p) || p->flags & PF_EXITING ||
+ oom_unkillable_task(p, target_memcg, nodemask))
+ continue;
+
+ points = oom_badness(p, target_memcg, nodemask, totalpages,
+ &overdraft);
+ if (overdraft < victim_overdraft)
+ continue;
+
+ /*
+ * Consider tasks as equally bad if their score values
+ * (basically mem+swap usage in pages) differ by less
+ * than 1/4th.
+ */
+ if (overdraft == victim_overdraft && points < victim_points &&
+ 4 * (victim_points - points) >= victim_points)
+ continue;
+
+ /* Normalize score for reporting */
+ points = points * 1000 / totalpages;
+
+ if (__ratelimit(&berserker_rs)) {
+ task_lock(p);
+ pr_err("Rage kill process %d (%s) score %lu\n",
+ task_pid_nr(p), p->comm, points);
+ task_unlock(p);
+ }
+
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ mem_cgroup_note_oom_kill(target_memcg, p);
+
+ if (++killed >= 1 << rage)
+ break;
+ }
+ read_unlock(&tasklist_lock);
+
+ pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+out:
+ if (memcg_to_put)
+ mem_cgroup_put(memcg_to_put);
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
@@ -617,6 +755,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
rcu_read_unlock();
+ oom_berserker(victim, totalpages, memcg, nodemask);
+
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mem_cgroup_note_oom_kill(memcg, victim);
put_task_struct(victim);
@@ -699,6 +839,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
show_stack(p, NULL);
ctx->owner = ctx->victim = NULL;
+ ctx->oom_end = now;
wake_up_all(&ctx->waitq);
}
} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
@@ -734,6 +875,7 @@ void oom_unlock(struct mem_cgroup *memcg)
unsigned long now = jiffies;
unsigned long timeout = 0;
struct mem_cgroup *iter, *victim_memcg = NULL;
+ struct task_struct *victim = NULL;
struct oom_context *ctx;
DEFINE_WAIT(wait);
@@ -752,6 +894,7 @@ void oom_unlock(struct mem_cgroup *memcg)
* It's our responsibility to wake up blocked
* processes then.
*/
+ ctx->oom_end = now;
wake_up_all(&ctx->waitq);
continue;
}
@@ -766,6 +909,8 @@ void oom_unlock(struct mem_cgroup *memcg)
timeout = ctx->oom_start + OOM_TIMEOUT - now;
BUG_ON(timeout == 0);
+ victim = ctx->victim;
+
/*
* Remember victim memcg so that we can wait for victim
* to exit below.
@@ -776,6 +921,19 @@ void oom_unlock(struct mem_cgroup *memcg)
prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+ /*
+ * Propagate victim up to the context that initiated oom for
+ * oom_end to be updated in all relevant contexts when the
+ * victim exits (see exit_oom_victim).
+ */
+ for (iter = victim_memcg; iter; iter = parent_mem_cgroup(iter)) {
+ ctx = mem_cgroup_oom_context(iter);
+ if (!ctx->victim)
+ ctx->victim = victim;
+ if (iter == memcg)
+ break;
+ }
+
spin_unlock(&oom_context_lock);
if (timeout > 0) {
--
2.1.4
More information about the Devel
mailing list