[Devel] [PATCH rh7 3/6] mm: ignore oom_score_adj of containerized tasks on global OOM

Vladimir Davydov vdavydov at parallels.com
Wed Jun 3 07:56:50 PDT 2015


When the system reaches the point where it is so utterly out of memory
that it is time to sacrifice someone, it should dismiss container users'
preferences. It should just go and kill the fattest task in order to
free room for others to breathe.

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 fs/proc/base.c      |  7 +++++--
 include/linux/oom.h | 10 ++++++++++
 mm/oom_kill.c       | 10 ++++++++++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e3bbb84520b..3f2b4d96fd25 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -449,13 +449,16 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long totalpages = totalram_pages + total_swap_pages;
 	unsigned long points = 0;
+	struct mem_cgroup *memcg = NULL;
 
-	if (!ve_is_super(get_exec_env()))
+	if (!ve_is_super(get_exec_env())) {
 		totalpages = min(totalpages, mem_cgroup_total_pages(true));
+		memcg = OOM_BADNESS_DUMMY_MEMCG;
+	}
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL, totalpages) *
+		points = oom_badness(task, memcg, NULL, totalpages) *
 						1000 / totalpages;
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 17100d02e8d3..04f4f579c36c 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -52,6 +52,16 @@ static inline bool oom_task_origin(const struct task_struct *p)
 /* linux/mm/oom_group.c */
 extern int get_task_oom_score_adj(struct task_struct *t);
 
+/*
+ * oom_score_adj must be 0 for containerized tasks on system-wide OOM, so
+ * oom_badness treats it as 0 if memcg == NULL. However, we need to show
+ * real oom_badness when /proc/PID/oom_score is read from inside a container.
+ * Since procuring the memcg corresponding to a container is rather tricky, we
+ * pass OOM_BADNESS_DUMMY_MEMCG instead, which will make oom_badness act as if
+ * it was called on local OOM, but without dereferencing the memcg ptr.
+ */
+#define OOM_BADNESS_DUMMY_MEMCG		((struct mem_cgroup *)1UL)
+
 extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 			  const nodemask_t *nodemask, unsigned long totalpages);
 extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7201f7c39e3e..797103067d19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -149,9 +149,14 @@ static bool oom_unkillable_task(struct task_struct *p,
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 			  const nodemask_t *nodemask, unsigned long totalpages)
 {
+	bool global;
 	long points;
 	long adj;
 
+	global = !memcg;
+	if (memcg == OOM_BADNESS_DUMMY_MEMCG)
+		memcg = NULL;
+
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
 
@@ -160,6 +165,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 		return 0;
 
 	adj = (long)p->signal->oom_score_adj;
+#ifdef CONFIG_VE
+	/* Ignore oom_score_adj of containerized tasks on system-wide OOM */
+	if (global && p->task_ve != &ve0)
+		adj = 0;
+#endif
 	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
 		return 0;
-- 
2.1.4




More information about the Devel mailing list