[Devel] [PATCH rh7 4/6] oom: rework logic behind memory.oom_guarantee

Vladimir Davydov vdavydov at virtuozzo.com
Thu Jan 28 07:00:13 PST 2016


From: Vladimir Davydov <vdavydov at parallels.com>

Patchset description: oom enhancements - part 2

 - Patches 1-2 prepare memcg for upcoming changes in oom design.
 - Patch 3 reworks oom locking design so that the executioner waits for
   victim to exit. This is necessary to increase oom kill rate, which is
   essential for berserker mode.
 - Patch 4 drops unused OOM_SCAN_ABORT
 - Patch 5 introduces oom timeout.
   https://jira.sw.ru/browse/PSBM-38581
 - Patch 6 makes oom fairer when it comes to selecting a victim among
   different containers.
   https://jira.sw.ru/browse/PSBM-37915
 - Patch 7 prepares oom for introducing berserker mode
 - Patch 8 resurrects oom berserker mode, which is supposed to cope with
   actively forking processes.
   https://jira.sw.ru/browse/PSBM-17930

https://jira.sw.ru/browse/PSBM-26973

Changes in v3:
 - rework oom_trylock (patch 3)
 - select exiting process instead of aborting oom scan so as not to keep
   busy-waiting for an exiting process to exit (patches 3, 4)
 - cleanup oom timeout handling + fix stuck process trace dumped
   multiple times on timeout (patch 5)
 - set max_overdraft to ULONG_MAX on selected processes (patch 6)
 - rework oom berserker process selection logic (patches 7, 8)

Changes in v2:
 - s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
 - propagate victim to the context that initiated oom in oom_unlock
   (patch 6)
 - always set oom_end on releasing oom context (patch 6)

Vladimir Davydov (8):
  memcg: add mem_cgroup_get/put helpers
  memcg: add lock for protecting memcg->oom_notify list
  oom: rework locking design
  oom: introduce oom timeout
  oom: drop OOM_SCAN_ABORT
  oom: rework logic behind memory.oom_guarantee
  oom: pass points and overdraft to oom_kill_process
  oom: resurrect berserker mode

Reviewed-by: Kirill Tkhai <ktkhai at odin.com>

=========================================
This patch description:

Currently, memory.oom_guarantee works as a threshold: we first select
processes in cgroups whose usage is below oom guarantee, and only if
there is no eligible process in such cgroups, we disregard oom guarantee
configuration and iterate over all processes. Although simple to
implement, such a behavior is unfair: we do not differentiate between
cgroups that only slightly above their guarantee and those who exceed it
significantly.

This patch therefore reworks the way how memory.oom_guarantee affects
oom killer behavior. First of all, it reverts old logic, which was
introduced by commit e94e18346f74c ("memcg: add oom_guarantee"), leaving
hunks bringing the memory.oom_guarantee knob intact. Then it implements
a new approach of selecting oom victim that works as follows.

Now a task is selected by oom killer iff (a) the memory cgroup which the
process resides in has the greatest overdraft of all cgroups eligible
for scan and (b) the process has the greatest score among all processes
which reside in cgroups with the greatest overdraft. A cgroup's
overdraft is defined as

  (U-G)/(L-G), if U<G,
  0,           otherwise

  G - memory.oom_guarantee
  L - memory.memsw.limit_in_bytes
  U - memory.memsw.usage_in_bytes

https://jira.sw.ru/browse/PSBM-37915

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>

Conflicts:
	mm/memcontrol.c
---
 fs/proc/base.c             |  2 +-
 include/linux/memcontrol.h |  6 ++--
 include/linux/oom.h        | 24 +++++++++++--
 mm/memcontrol.c            | 86 ++++++++++++++--------------------------------
 mm/oom_kill.c              | 61 ++++++++++++++++++++++----------
 5 files changed, 92 insertions(+), 87 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d07eb7191dcc..6d2faa08cd4e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -609,7 +609,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL, totalpages) *
+		points = oom_badness(task, NULL, NULL, totalpages, NULL) *
 						1000 / totalpages;
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index badd4acfae64..27b3c5681c9c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -125,7 +125,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
 extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
-extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
+extern unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg);
 extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
 				     struct task_struct *task);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -396,9 +396,9 @@ mem_cgroup_oom_context(struct mem_cgroup *memcg)
 	return &oom_ctx;
 }
 
-static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
+static inline unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
 {
-	return false;
+	return 0;
 }
 
 static inline void
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6d4a94f15f2a..9117d1df385e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -35,6 +35,7 @@ struct oom_context {
 	struct task_struct *victim;
 	bool marked;
 	unsigned long oom_start;
+	unsigned long overdraft;
 	wait_queue_head_t waitq;
 };
 
@@ -64,8 +65,25 @@ extern int get_task_oom_score_adj(struct task_struct *t);
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
-extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages);
+extern unsigned long oom_badness(struct task_struct *p,
+		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		unsigned long totalpages, unsigned long *overdraft);
+
+static inline bool oom_worse(unsigned long points, unsigned long overdraft,
+		unsigned long *chosen_points, unsigned long *max_overdraft)
+{
+	if (overdraft > *max_overdraft) {
+		*max_overdraft = overdraft;
+		*chosen_points = points;
+		return true;
+	}
+	if (overdraft == *max_overdraft && points > *chosen_points) {
+		*chosen_points = points;
+		return true;
+	}
+	return false;
+}
+
 extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     unsigned int points, unsigned long totalpages,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
@@ -79,7 +97,7 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill, bool ignore_memcg_guarantee);
+		bool force_kill);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6d79d961fd3e..5be42a770b9d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1621,51 +1621,6 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
 	return true;
 }
 
-static bool __mem_cgroup_below_oom_guarantee(struct mem_cgroup *root,
-					     struct mem_cgroup *memcg)
-{
-	if (mem_cgroup_disabled())
-		return false;
-
-	if (memcg == root_mem_cgroup)
-		return false;
-
-	if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
-					memcg->oom_guarantee)
-		return false;
-
-	while (memcg != root) {
-		memcg = parent_mem_cgroup(memcg);
-		if (!memcg)
-			break;
-
-		if (memcg == root_mem_cgroup)
-			break;
-
-		if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
-						memcg->oom_guarantee)
-			return false;
-	}
-	return true;
-}
-
-bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
-{
-	struct mem_cgroup *memcg = NULL;
-	bool ret = false;
-
-	p = find_lock_task_mm(p);
-	if (p) {
-		memcg = try_get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	}
-	if (memcg) {
-		ret = __mem_cgroup_below_oom_guarantee(root_mem_cgroup, memcg);
-		css_put(&memcg->css);
-	}
-	return ret;
-}
-
 #ifdef CONFIG_CLEANCACHE
 bool mem_cgroup_cleancache_disabled(struct page *page)
 {
@@ -1727,6 +1682,22 @@ struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
 	return &memcg->oom_ctx;
 }
 
+unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
+{
+	unsigned long long guarantee, limit, usage;
+	unsigned long score;
+
+	guarantee = ACCESS_ONCE(memcg->oom_guarantee);
+	limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+
+	if (limit >= RESOURCE_MAX || guarantee >= limit || usage <= guarantee)
+		return 0;
+
+	score = div64_u64(1000 * (usage - guarantee), limit - guarantee);
+	return score > 0 ? score : 1;
+}
+
 unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long long limit;
@@ -2024,11 +1995,12 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
 	struct mem_cgroup *iter;
+	unsigned long max_overdraft = 0;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
+	unsigned long overdraft;
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
-	bool ignore_memcg_guarantee = false;
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -2042,25 +2014,21 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
 	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
-retry:
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct cgroup *cgroup = iter->css.cgroup;
 		struct cgroup_iter it;
 		struct task_struct *task;
 
-		if (!ignore_memcg_guarantee &&
-		    __mem_cgroup_below_oom_guarantee(memcg, iter))
-			continue;
-
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
 			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false, true)) {
+							false)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = ULONG_MAX;
+				max_overdraft = ULONG_MAX;
 				get_task_struct(chosen);
 				/* fall through */
 			case OOM_SCAN_CONTINUE:
@@ -2068,25 +2036,21 @@ retry:
 			case OOM_SCAN_OK:
 				break;
 			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (points > chosen_points) {
+			points = oom_badness(task, memcg, NULL, totalpages,
+					     &overdraft);
+			if (oom_worse(points, overdraft, &chosen_points,
+				      &max_overdraft)) {
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
-				chosen_points = points;
 				get_task_struct(chosen);
 			}
 		}
 		cgroup_iter_end(cgroup, &it);
 	}
 
-	if (!chosen) {
-		if (!ignore_memcg_guarantee) {
-			ignore_memcg_guarantee = true;
-			goto retry;
-		}
+	if (!chosen)
 		return;
-	}
 	points = chosen_points * 1000 / totalpages;
 	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 36f6454daace..bea7a2a555b2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -170,6 +170,20 @@ static bool oom_unkillable_task(struct task_struct *p,
 	return false;
 }
 
+static unsigned long mm_overdraft(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+	unsigned long overdraft;
+
+	memcg = try_get_mem_cgroup_from_mm(mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	overdraft = ctx->overdraft;
+	mem_cgroup_put(memcg);
+
+	return overdraft;
+}
+
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
@@ -180,11 +194,15 @@ static bool oom_unkillable_task(struct task_struct *p,
  * task consuming the most memory to avoid subsequent oom failures.
  */
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+			  const nodemask_t *nodemask, unsigned long totalpages,
+			  unsigned long *overdraft)
 {
 	long points;
 	long adj;
 
+	if (overdraft)
+		*overdraft = 0;
+
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
 
@@ -192,6 +210,9 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
+	if (overdraft)
+		*overdraft = mm_overdraft(p->mm);
+
 	adj = get_task_oom_score_adj(p);
 	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
@@ -289,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 
 enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill, bool ignore_memcg_guarantee)
+		bool force_kill)
 {
 	if (oom_unkillable_task(task, NULL, nodemask))
 		return OOM_SCAN_CONTINUE;
@@ -323,10 +344,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
 			return OOM_SCAN_SELECT;
 	}
-
-	if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task))
-		return OOM_SCAN_CONTINUE;
-
 	return OOM_SCAN_OK;
 }
 
@@ -343,36 +360,33 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
-	bool ignore_memcg_guarantee = false;
+	unsigned long max_overdraft = 0;
 
 	rcu_read_lock();
-retry:
 	for_each_process_thread(g, p) {
 		unsigned int points;
+		unsigned long overdraft;
 
 		switch (oom_scan_process_thread(p, totalpages, nodemask,
-					force_kill, ignore_memcg_guarantee)) {
+						force_kill)) {
 		case OOM_SCAN_SELECT:
 			chosen = p;
 			chosen_points = ULONG_MAX;
+			max_overdraft = ULONG_MAX;
 			/* fall through */
 		case OOM_SCAN_CONTINUE:
 			continue;
 		case OOM_SCAN_OK:
 			break;
 		};
-		points = oom_badness(p, NULL, nodemask, totalpages);
-		if (points > chosen_points) {
+		points = oom_badness(p, NULL, nodemask, totalpages,
+				     &overdraft);
+		if (oom_worse(points, overdraft, &chosen_points,
+			      &max_overdraft))
 			chosen = p;
-			chosen_points = points;
-		}
 	}
 	if (chosen)
 		get_task_struct(chosen);
-	else if (!ignore_memcg_guarantee) {
-		ignore_memcg_guarantee = true;
-		goto retry;
-	}
 	rcu_read_unlock();
 
 	*ppoints = chosen_points * 1000 / totalpages;
@@ -526,7 +540,7 @@ static void __wait_oom_context(struct oom_context *ctx)
 bool oom_trylock(struct mem_cgroup *memcg)
 {
 	unsigned long now = jiffies;
-	struct mem_cgroup *iter;
+	struct mem_cgroup *iter, *parent;
 	struct oom_context *ctx;
 
 	spin_lock(&oom_context_lock);
@@ -579,6 +593,15 @@ bool oom_trylock(struct mem_cgroup *memcg)
 		BUG_ON(ctx->victim);
 		ctx->owner = current;
 		ctx->oom_start = now;
+		/*
+		 * Update overdraft of each cgroup under us. This
+		 * information will be used in oom_badness.
+		 */
+		ctx->overdraft = mem_cgroup_overdraft(iter);
+		parent = parent_mem_cgroup(iter);
+		if (parent && iter != memcg)
+			ctx->overdraft = max(ctx->overdraft,
+				mem_cgroup_oom_context(parent)->overdraft);
 	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
 	spin_unlock(&oom_context_lock);
@@ -721,7 +744,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
 			child_points = oom_badness(child, memcg, nodemask,
-								totalpages);
+						   totalpages, NULL);
 			if (child_points > victim_points) {
 				put_task_struct(victim);
 				victim = child;
-- 
2.1.4



More information about the Devel mailing list