[Devel] [PATCH 16/17] oom: resurrect berserker mode

Vladimir Davydov vdavydov at parallels.com
Fri Aug 14 10:03:40 PDT 2015


The logic behind the OOM berserker is the same as in PCS6: if OOM
kills happen in the same memcg too often (less than sysctl
vm.oom_relaxation apart, 1 sec by default), we increase its "rage"
(min -10, max 20), and whenever "rage" >= 0 we additionally kill
1 << "rage" of the youngest processes that are about as bad as the
original victim.
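
For illustration in this posting only, here is a minimal userspace
sketch of the rage accounting described above; rage_update() and the
plain millisecond clock are made-up stand-ins for the patch's
oom_berserker() entry and the jiffies arithmetic, not kernel code:

/*
 * Userspace sketch of the berserker rage accounting (illustration
 * only): one call per OOM kill, returns the rage-kill budget.
 */
#include <stdio.h>

#define OOM_BASE_RAGE	-10
#define OOM_MAX_RAGE	20

static int oom_relaxation_ms = 1000;	/* vm.oom_relaxation, 1 sec default */

struct oom_context {
	int oom_rage;
	long last_oom_ms;
};

static long rage_update(struct oom_context *ctx, long now_ms)
{
	if (now_ms > ctx->last_oom_ms + oom_relaxation_ms)
		ctx->oom_rage = OOM_BASE_RAGE;	/* kills are rare: calm down */
	else if (ctx->oom_rage < OOM_MAX_RAGE)
		ctx->oom_rage++;		/* kills are frequent: enrage */
	ctx->last_oom_ms = now_ms;

	return ctx->oom_rage < 0 ? 0 : 1L << ctx->oom_rage;
}

int main(void)
{
	struct oom_context ctx = { OOM_BASE_RAGE, -oom_relaxation_ms - 1 };
	long t, allowed;

	/* Twelve kills 100 ms apart: rage climbs from -10 past 0. */
	for (t = 0; t <= 1100; t += 100) {
		allowed = rage_update(&ctx, t);
		printf("t=%4ldms rage=%3d rage kills allowed=%ld\n",
		       t, ctx.oom_rage, allowed);
	}
	return 0;
}

Note that the kernel side compares times with time_after(), which is
safe against jiffies wraparound; the plain comparison above is a
simplification.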

https://jira.sw.ru/browse/PSBM-17930

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/memcontrol.h |  8 +++++
 include/linux/oom.h        |  6 ++++
 kernel/sysctl.c            |  7 +++++
 mm/memcontrol.c            |  7 +++++
 mm/oom_kill.c              | 77 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 105 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 99f0f74be0af..46fe040605b1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct page_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_context;
 
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
@@ -120,6 +121,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
 extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
 				     struct task_struct *task);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -344,6 +346,12 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline struct oom_context *
+mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	return NULL;
+}
+
 static inline void
 mem_cgroup_note_oom_kill(struct mem_cgroup *memcg, struct task_struct *task)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index f16e35bee7d9..42bf3e84e1d1 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -31,6 +31,11 @@ enum oom_scan_t {
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
+struct oom_context {
+	int oom_rage;
+	unsigned long last_oom;
+};
+
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
@@ -95,4 +100,5 @@ extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_timeout;
+extern int sysctl_oom_relaxation;
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5a3ff6cb15fc..650755cd09ff 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1191,6 +1191,13 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
 	{
+		.procname	= "oom_relaxation",
+		.data		= &sysctl_oom_relaxation,
+		.maxlen		= sizeof(sysctl_oom_relaxation),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
 		.maxlen		= sizeof(sysctl_overcommit_ratio),
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 86c0500bf508..8bf4022d18cb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -307,6 +307,8 @@ struct mem_cgroup {
 	bool		oom_lock;
 	atomic_t	under_oom;
 
+	struct oom_context oom_ctx;
+
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -1637,6 +1639,11 @@ unsigned long mem_cgroup_oom_score(struct mem_cgroup *memcg)
 	return score > 0 ? score : 1;
 }
 
+struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	return &memcg->oom_ctx;
+}
+
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e55193df00c9..6dec59129bba 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,8 +44,13 @@ int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 int sysctl_oom_timeout = 5 * HZ;
+int sysctl_oom_relaxation = HZ;
+static struct oom_context global_oom_ctx;
 static DEFINE_SPINLOCK(zone_scan_lock);
 
+#define OOM_BASE_RAGE	-10
+#define OOM_MAX_RAGE	20
+
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -369,6 +374,76 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	return chosen;
 }
 
+static void oom_berserker(struct task_struct *victim, unsigned long totalpages,
+			  struct mem_cgroup *memcg, nodemask_t *nodemask)
+{
+	unsigned long now = jiffies;
+	unsigned long victim_ub_score, ub_score;
+	unsigned long victim_points, points;
+	struct oom_context *ctx;
+	struct task_struct *p;
+	int killed = 0;
+
+	ctx = !memcg ? &global_oom_ctx : mem_cgroup_oom_context(memcg);
+
+	/* Update oom rage on each oom kill */
+	if (time_after(now, ctx->last_oom + sysctl_oom_relaxation))
+		ctx->oom_rage = OOM_BASE_RAGE;
+	else if (ctx->oom_rage < OOM_MAX_RAGE)
+		ctx->oom_rage++;
+	ctx->last_oom = now;
+
+	if (ctx->oom_rage < 0)
+		return;
+
+	victim_points = oom_badness(victim, NULL, NULL, totalpages);
+
+	/* Only take into account ub oom score on global oom */
+	victim_ub_score = !memcg ? ub_oom_score(victim) : 0;
+
+	/* When enraged, we kill youngest tasks that are as bad as the victim */
+	read_lock(&tasklist_lock);
+	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+		if (!p->mm || p->mm == victim->mm ||
+		    test_tsk_thread_flag(p, TIF_MEMDIE) ||
+		    fatal_signal_pending(p) ||
+		    oom_unkillable_task(p, memcg, nodemask))
+			continue;
+
+		ub_score = !memcg ? ub_oom_score(p) : 0;
+		if (ub_score < victim_ub_score)
+			continue;
+
+		points = oom_badness(p, memcg, nodemask, totalpages);
+		/*
+		 * Consider tasks as equally bad if their scores (basically
+		 * mem+swap usage in pages) differ by less than 10%.
+		 */
+		if (points < victim_points &&
+		    100 * (victim_points - points) / (victim_points + 1) > 10)
+			continue;
+
+		points = points * 1000 / totalpages;
+
+		if (printk_ratelimit()) {
+			task_lock(p);
+			pr_err("Rage kill process %d (%s) score %lu\n",
+			       task_pid_nr(p), p->comm, points);
+			task_unlock(p);
+		}
+
+		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		mem_cgroup_note_oom_kill(memcg, p);
+
+		if (++killed >= (1 << ctx->oom_rage))
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	pr_err("OOM killer in rage %d: %d tasks killed\n",
+	       ctx->oom_rage, killed);
+}
+
 /**
  * dump_tasks - dump current memory state of all system tasks
  * @memcg: current's memory controller, if constrained
@@ -540,6 +615,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 		K(get_mm_counter(victim->mm, MM_FILEPAGES)));
 	task_unlock(victim);
 
+	oom_berserker(victim, totalpages, memcg, nodemask);
+
 	/*
 	 * Kill all user processes sharing victim->mm in other thread groups, if
 	 * any.  They don't get access to memory reserves, though, to avoid
-- 
2.1.4
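
Appendix for reviewers, not part of the patch: a userspace check of
the two integer-arithmetic steps in oom_berserker(), namely the
"equally bad" filter that spares tasks whose badness falls more than
10% below the victim's, and the 0..1000 normalization printed in the
rage-kill message. about_as_bad() and all sample numbers are made up
for illustration.

#include <stdio.h>

/* Nonzero if a candidate is within 10% of the victim's badness. */
static int about_as_bad(unsigned long points, unsigned long victim_points)
{
	if (points >= victim_points)
		return 1;
	/* "+ 1" mirrors the patch and avoids dividing by zero */
	return 100 * (victim_points - points) / (victim_points + 1) <= 10;
}

int main(void)
{
	unsigned long totalpages = 1UL << 20;	/* e.g. 4 GB of 4 KB pages */
	unsigned long victim_points = 100000;	/* victim badness, in pages */
	unsigned long candidates[] = { 120000, 95000, 91000, 85000, 50000 };
	unsigned long i;

	for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++)
		printf("candidate %6lu: %s, score %lu\n",
		       candidates[i],
		       about_as_bad(candidates[i], victim_points)
				? "rage-killable" : "spared",
		       candidates[i] * 1000 / totalpages);	/* 0..1000 */
	return 0;
}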