[Devel] [PATCH rh7 v2 5/6] oom: rework logic behind memory.oom_guarantee

Vladimir Davydov vdavydov at parallels.com
Fri Sep 11 08:18:05 PDT 2015


Currently, memory.oom_guarantee works as a threshold: we first select
processes in cgroups whose usage is below oom guarantee, and only if
there is no eligible process in such cgroups, we disregard oom guarantee
configuration and iterate over all processes. Although simple to
implement, such a behavior is unfair: we do not differentiate between
cgroups that only slightly above their guarantee and those who exceed it
significantly.

This patch therefore reworks the way how memory.oom_guarantee affects
oom killer behavior. First of all, it reverts old logic, which was
introduced by commit e94e18346f74c ("memcg: add oom_guarantee"), leaving
hunks bringing the memory.oom_guarantee knob intact. Then it implements
a new approach of selecting oom victim that works as follows.

Now a task is selected by oom killer iff (a) the memory cgroup which the
process resides in has the greatest overdraft of all cgroups eligible
for scan and (b) the process has the greatest score among all processes
which reside in cgroups with the greatest overdraft. A cgroup's
overdraft is defined as

  (U-G)/(L-G), if U<G,
  0,           otherwise

  G - memory.oom_guarantee
  L - memory.memsw.limit_in_bytes
  U - memory.memsw.usage_in_bytes

https://jira.sw.ru/browse/PSBM-37915

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 fs/proc/base.c             |  2 +-
 include/linux/memcontrol.h |  6 ++--
 include/linux/oom.h        | 24 +++++++++++--
 mm/memcontrol.c            | 85 +++++++++++++---------------------------------
 mm/oom_kill.c              | 57 ++++++++++++++++++++-----------
 5 files changed, 87 insertions(+), 87 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b5744986e3b3..b5f3a70c19bd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -455,7 +455,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL, totalpages) *
+		points = oom_badness(task, NULL, NULL, totalpages, NULL) *
 						1000 / totalpages;
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 591132758911..0c856425ab7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -122,7 +122,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
 extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
-extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
+extern unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg);
 extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
 				     struct task_struct *task);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -372,9 +372,9 @@ mem_cgroup_oom_context(struct mem_cgroup *memcg)
 	return &oom_ctx;
 }
 
-static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
+static inline unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
 {
-	return false;
+	return 0;
 }
 
 static inline void
diff --git a/include/linux/oom.h b/include/linux/oom.h
index caeda102e7ba..4e12187663ed 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -35,6 +35,7 @@ struct oom_context {
 	struct task_struct *owner;
 	struct task_struct *victim;
 	unsigned long oom_start;
+	unsigned long overdraft;
 	wait_queue_head_t waitq;
 };
 
@@ -64,8 +65,25 @@ extern int get_task_oom_score_adj(struct task_struct *t);
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
-extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages);
+extern unsigned long oom_badness(struct task_struct *p,
+		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		unsigned long totalpages, unsigned long *overdraft);
+
+static inline bool oom_worse(unsigned long points, unsigned long overdraft,
+		unsigned long *chosen_points, unsigned long *max_overdraft)
+{
+	if (overdraft > *max_overdraft) {
+		*max_overdraft = overdraft;
+		*chosen_points = points;
+		return true;
+	}
+	if (overdraft == *max_overdraft && points > *chosen_points) {
+		*chosen_points = points;
+		return true;
+	}
+	return false;
+}
+
 extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     unsigned int points, unsigned long totalpages,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
@@ -79,7 +97,7 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill, bool ignore_memcg_guarantee);
+		bool force_kill);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index de3ad32ecb89..41fb41b16664 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1578,51 +1578,6 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
 	return true;
 }
 
-static bool __mem_cgroup_below_oom_guarantee(struct mem_cgroup *root,
-					     struct mem_cgroup *memcg)
-{
-	if (mem_cgroup_disabled())
-		return false;
-
-	if (memcg == root_mem_cgroup)
-		return false;
-
-	if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
-					memcg->oom_guarantee)
-		return false;
-
-	while (memcg != root) {
-		memcg = parent_mem_cgroup(memcg);
-		if (!memcg)
-			break;
-
-		if (memcg == root_mem_cgroup)
-			break;
-
-		if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
-						memcg->oom_guarantee)
-			return false;
-	}
-	return true;
-}
-
-bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
-{
-	struct mem_cgroup *memcg = NULL;
-	bool ret = false;
-
-	p = find_lock_task_mm(p);
-	if (p) {
-		memcg = try_get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	}
-	if (memcg) {
-		ret = __mem_cgroup_below_oom_guarantee(root_mem_cgroup, memcg);
-		css_put(&memcg->css);
-	}
-	return ret;
-}
-
 void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg,
 			      struct task_struct *task)
 {
@@ -1663,6 +1618,22 @@ struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
 	return &memcg->oom_ctx;
 }
 
+unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
+{
+	unsigned long long guarantee, limit, usage;
+	unsigned long score;
+
+	guarantee = ACCESS_ONCE(memcg->oom_guarantee);
+	limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+
+	if (limit >= RESOURCE_MAX || guarantee >= limit || usage <= guarantee)
+		return 0;
+
+	score = div64_u64(1000 * (usage - guarantee), limit - guarantee);
+	return score > 0 ? score : 1;
+}
+
 unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long long limit;
@@ -1960,11 +1931,12 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
 	struct mem_cgroup *iter;
+	unsigned long max_overdraft = 0;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
+	unsigned long overdraft;
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
-	bool ignore_memcg_guarantee = false;
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -1978,20 +1950,15 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
 	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
-retry:
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct cgroup *cgroup = iter->css.cgroup;
 		struct cgroup_iter it;
 		struct task_struct *task;
 
-		if (!ignore_memcg_guarantee &&
-		    __mem_cgroup_below_oom_guarantee(memcg, iter))
-			continue;
-
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
 			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false, true)) {
+							false)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
@@ -2010,25 +1977,21 @@ retry:
 			case OOM_SCAN_OK:
 				break;
 			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (points > chosen_points) {
+			points = oom_badness(task, memcg, NULL, totalpages,
+					     &overdraft);
+			if (oom_worse(points, overdraft, &chosen_points,
+				      &max_overdraft)) {
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
-				chosen_points = points;
 				get_task_struct(chosen);
 			}
 		}
 		cgroup_iter_end(cgroup, &it);
 	}
 
-	if (!chosen) {
-		if (!ignore_memcg_guarantee) {
-			ignore_memcg_guarantee = true;
-			goto retry;
-		}
+	if (!chosen)
 		return;
-	}
 	points = chosen_points * 1000 / totalpages;
 	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 130318ee3598..4290d6665429 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -163,6 +163,20 @@ static bool oom_unkillable_task(struct task_struct *p,
 	return false;
 }
 
+static unsigned long mm_overdraft(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+	unsigned long overdraft;
+
+	memcg = try_get_mem_cgroup_from_mm(mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	overdraft = ctx->overdraft;
+	mem_cgroup_put(memcg);
+
+	return overdraft;
+}
+
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
@@ -173,7 +187,8 @@ static bool oom_unkillable_task(struct task_struct *p,
  * task consuming the most memory to avoid subsequent oom failures.
  */
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+			  const nodemask_t *nodemask, unsigned long totalpages,
+			  unsigned long *overdraft)
 {
 	long points;
 	long adj;
@@ -185,6 +200,9 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
+	if (overdraft)
+		*overdraft = mm_overdraft(p->mm);
+
 	adj = get_task_oom_score_adj(p);
 	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
@@ -282,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 
 enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill, bool ignore_memcg_guarantee)
+		bool force_kill)
 {
 	if (oom_unkillable_task(task, NULL, nodemask))
 		return OOM_SCAN_CONTINUE;
@@ -316,10 +334,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
 			return OOM_SCAN_ABORT;
 	}
-
-	if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task))
-		return OOM_SCAN_CONTINUE;
-
 	return OOM_SCAN_OK;
 }
 
@@ -336,15 +350,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
-	bool ignore_memcg_guarantee = false;
+	unsigned long max_overdraft = 0;
 
 	rcu_read_lock();
-retry:
 	for_each_process_thread(g, p) {
 		unsigned int points;
+		unsigned long overdraft;
 
 		switch (oom_scan_process_thread(p, totalpages, nodemask,
-					force_kill, ignore_memcg_guarantee)) {
+						force_kill)) {
 		case OOM_SCAN_SELECT:
 			chosen = p;
 			chosen_points = ULONG_MAX;
@@ -357,18 +371,14 @@ retry:
 		case OOM_SCAN_OK:
 			break;
 		};
-		points = oom_badness(p, NULL, nodemask, totalpages);
-		if (points > chosen_points) {
+		points = oom_badness(p, NULL, nodemask, totalpages,
+				     &overdraft);
+		if (oom_worse(points, overdraft, &chosen_points,
+			      &max_overdraft))
 			chosen = p;
-			chosen_points = points;
-		}
 	}
 	if (chosen)
 		get_task_struct(chosen);
-	else if (!ignore_memcg_guarantee) {
-		ignore_memcg_guarantee = true;
-		goto retry;
-	}
 	rcu_read_unlock();
 
 	*ppoints = chosen_points * 1000 / totalpages;
@@ -552,7 +562,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
 			child_points = oom_badness(child, memcg, nodemask,
-								totalpages);
+						   totalpages, NULL);
 			if (child_points > victim_points) {
 				put_task_struct(victim);
 				victim = child;
@@ -652,7 +662,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 bool oom_trylock(struct mem_cgroup *memcg)
 {
 	unsigned long now = jiffies;
-	struct mem_cgroup *iter;
+	struct mem_cgroup *iter, *parent;
 	struct oom_context *ctx;
 	DEFINE_WAIT(wait);
 
@@ -703,6 +713,15 @@ bool oom_trylock(struct mem_cgroup *memcg)
 		BUG_ON(ctx->victim);
 		ctx->owner = current;
 		ctx->oom_start = now;
+		/*
+		 * Update overdraft of each cgroup under us. This
+		 * information will be used in oom_badness.
+		 */
+		ctx->overdraft = mem_cgroup_overdraft(iter);
+		parent = parent_mem_cgroup(iter);
+		if (parent && iter != memcg)
+			ctx->overdraft = max(ctx->overdraft,
+				mem_cgroup_oom_context(parent)->overdraft);
 	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
 	spin_unlock(&oom_context_lock);
-- 
2.1.4




More information about the Devel mailing list