[Devel] [PATCH 14/17] mm: take into account ub oom score on global oom

Vladimir Davydov vdavydov at parallels.com
Fri Aug 14 10:03:38 PDT 2015


OOM score of a beancounter is defined as

  (U-G)/(L-G), if U>G,
  0,           otherwise

  G - UB's OOM guarantee
  L - UB's memory+swap limit
  U - UB's memory+swap usage

All the parameters are taken from the memory cgroup the beancounter is
associated with (beancounter.memory).

When we select an OOM victim, now we consider not only task's OOM score,
but also beancounter's, as follows:

  1. OOM score of the beancounter the victim task belongs to must be
     maximal among all beancounters.
  2. OOM score of the victim task must be maximal among all tasks that
     belong to beancounters with the maximal OOM score.

https://jira.sw.ru/browse/PSBM-37915

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/bc/beancounter.h |  5 +++++
 kernel/bc/beancounter.c  | 29 +++++++++++++++++++++++++++++
 mm/memcontrol.c          | 16 ++++++++++++++++
 mm/oom_kill.c            | 27 ++++++++++++++++++++++++++-
 4 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h
index 9180f2a04f56..d88f9e690a60 100644
--- a/include/bc/beancounter.h
+++ b/include/bc/beancounter.h
@@ -146,6 +146,8 @@ struct user_beancounter {
 
 	void			*private_data2;
 
+	unsigned long		oom_score;
+
 	/* resources statistic and settings */
 	struct ubparm		ub_parms[UB_RESOURCES];
 	/* resources statistic for last interval */
@@ -333,6 +335,9 @@ extern void ub_page_stat(struct user_beancounter *ub,
 			 unsigned long *pages);
 extern unsigned long ub_total_pages(struct user_beancounter *ub, bool swap);
 
+extern void ub_update_oom_score(void);
+extern unsigned long ub_oom_score(struct task_struct *p);
+
 extern const char *ub_rnames[];
 /*
  *	Put a beancounter reference
diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
index 6b5ed78e08a1..35cd80572623 100644
--- a/kernel/bc/beancounter.c
+++ b/kernel/bc/beancounter.c
@@ -38,6 +38,7 @@
 #include <linux/ve.h>
 #include <linux/cgroup.h>
 #include <linux/task_work.h>
+#include <linux/oom.h>
 
 #include <bc/beancounter.h>
 #include <bc/io_acct.h>
@@ -203,6 +204,7 @@ extern void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
 				    unsigned long *pages);
 extern unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg,
 					    bool swap);
+extern unsigned long mem_cgroup_oom_score(struct mem_cgroup *memcg);
 
 /*
  * Update memcg limits according to beancounter configuration.
@@ -257,6 +259,33 @@ unsigned long ub_total_pages(struct user_beancounter *ub, bool swap)
 	return ret;
 }
 
+void ub_update_oom_score(void)	/* recompute ub->oom_score for every ub */
+{
+	struct user_beancounter *ub;
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();	/* held across the whole walk */
+	for_each_beancounter(ub) {
+		css = ub_get_mem_css(ub);	/* paired with css_put() below */
+		ub->oom_score = mem_cgroup_oom_score(	/* cache memcg's score */
+				mem_cgroup_from_cont(css->cgroup));
+		css_put(css);
+	}
+	rcu_read_unlock();
+}
+
+unsigned long ub_oom_score(struct task_struct *p)	/* ub score for p */
+{
+	unsigned long ret;
+
+	p = find_lock_task_mm(p);	/* result is task_unlock()ed below */
+	if (!p)
+		return 0;	/* lookup returned no task */
+	ret = mm_ub(p->mm)->oom_score;	/* cached value, not recomputed */
+	task_unlock(p);
+	return ret;
+}
+
 void init_beancounter_precharge(struct user_beancounter *ub, int resource)
 {
 	/* limit maximum precharge with one half of current resource excess */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b27b4148600b..505952f22ea9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1621,6 +1621,22 @@ unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
 	return min_t(unsigned long long, ULONG_MAX, limit >> PAGE_SHIFT);
 }
 
+unsigned long mem_cgroup_oom_score(struct mem_cgroup *memcg)
+{	/* 0, or 1000 * (U - G) / (L - G) raised to at least 1 */
+	unsigned long long guarantee, limit, usage;	/* G, L, U */
+	unsigned long score;
+
+	guarantee = memcg->oom_guarantee;
+	limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+
+	if (limit >= RESOURCE_MAX || guarantee >= limit || usage <= guarantee)
+		return 0;	/* L >= RESOURCE_MAX, G >= L, or U <= G */
+
+	score = div64_u64(1000 * (usage - guarantee), limit - guarantee);
+	return score > 0 ? score : 1;	/* never 0 once past the checks */
+}
+
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9ce3e021947e..e55193df00c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,7 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
+#include <bc/beancounter.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -319,10 +320,18 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
+	unsigned long chosen_ub_score = 0;
+
+	/*
+	 * We cache per ub oom score values so that all tasks of the same ub
+	 * will compete fairly in spite of fluctuations in resource counters.
+	 */
+	ub_update_oom_score();
 
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
 		unsigned int points;
+		unsigned int ub_score;
 
 		switch (oom_scan_process_thread(p, totalpages, nodemask,
 						force_kill)) {
@@ -339,10 +348,17 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 		case OOM_SCAN_OK:
 			break;
 		};
+
+		ub_score = ub_oom_score(p);
+		if (ub_score < chosen_ub_score)
+			continue;
+
 		points = oom_badness(p, NULL, nodemask, totalpages);
-		if (points > chosen_points) {
+		if (points > chosen_points ||
+		    ub_score > chosen_ub_score) {
 			chosen = p;
 			chosen_points = points;
+			chosen_ub_score = ub_score;
 		}
 	}
 	if (chosen)
@@ -462,6 +478,15 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	if (__ratelimit(&oom_rs))
 		dump_header(p, gfp_mask, order, memcg, nodemask);
 
+#ifdef CONFIG_BEANCOUNTERS
+	if (!memcg) {
+		struct user_beancounter *ub = get_task_ub(p);
+
+		pr_err("%s: Worst ub %s score %lu\n",
+		       message, ub->ub_name, ub->oom_score);
+	}
+#endif
+
 	task_lock(p);
 	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
 		message, task_pid_nr(p), p->comm, points);
-- 
2.1.4




More information about the Devel mailing list