[Devel] [PATCH RH8] oom: Scan only the relevant memcg when in berserker mode

Evgenii Shatokhin eshatokhin at virtuozzo.com
Wed Jul 28 23:04:56 MSK 2021


Even if an OOM happened in a memory cgroup, oom_berserker() iterated
over all tasks in the system and called task_in_mem_cgroup() for each
of them to find the tasks belonging to that memcg.

There is no need to scan all the tasks, though. select_bad_process() and
other functions already rely on mem_cgroup_scan_tasks() to iterate over
the tasks of a given memory cgroup only.

oom_berserker() is now reworked to make use of that as well.
task_in_mem_cgroup() is no longer needed after that change, so it is
removed.

There is one potential disadvantage to this approach, though. Walking
the whole system task list allowed oom_berserker() to pick the youngest
tasks in the memcg that were no better than the OOM killer's previous
target. mem_cgroup_scan_tasks(), however, iterates over the tasks in an
unspecified order. It seems to process the cgroups from parents to
children, and the tasks within each cgroup from the oldest added to the
newest added ones, but this could change in the future. As a result,
oom_berserker() still selects tasks that are no better than the target,
just not necessarily the youngest ones.
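
For reference, in this kernel generation the traversal is implemented
roughly as follows (a simplified sketch of mem_cgroup_scan_tasks() from
mm/memcontrol.c; details may differ between kernel versions):

	int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
				  int (*fn)(struct task_struct *, void *),
				  void *arg)
	{
		struct mem_cgroup *iter;
		int ret = 0;

		/* Pre-order walk: a parent memcg is visited before its children. */
		for_each_mem_cgroup_tree(iter, memcg) {
			struct css_task_iter it;
			struct task_struct *task;

			/* Roughly oldest-attached tasks first, but not guaranteed. */
			css_task_iter_start(&iter->css, 0, &it);
			while (!ret && (task = css_task_iter_next(&it)))
				ret = fn(task, arg);
			css_task_iter_end(&it);
			if (ret) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
		}
		return ret;
	}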

It is unclear whether this is a real problem, though. oom_berserker()
triggers only when the normal OOM killer kills tasks very often. If some
system task eats up lots of memory and gets shot as a result, then
something is already wrong with that task; the normal OOM killer would
likely shoot it anyway, just a bit later.

https://jira.sw.ru/browse/PSBM-131984
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
---
 include/linux/memcontrol.h |   7 --
 mm/memcontrol.c            |  26 ------
 mm/oom_kill.c              | 164 +++++++++++++++++++++----------------
 3 files changed, 95 insertions(+), 102 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5458f18c0ae7..c6244a62f376 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -551,7 +551,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
@@ -1096,12 +1095,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline bool task_in_mem_cgroup(struct task_struct *task,
-				      const struct mem_cgroup *memcg)
-{
-	return true;
-}
-
 static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f9a1c3c4f85e..424bc1e52b3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1462,32 +1462,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 #ifdef CONFIG_CLEANCACHE
 bool mem_cgroup_cleancache_disabled(struct page *page)
 {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a64a9ff7391b..382a85b64c1f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -951,21 +951,97 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
 	return 0;
 }
 
+/*
+ * The data used when iterating over tasks in "rage/berserker" mode.
+ */
+struct rage_kill_data {
+	struct oom_control *oc;
+
+	/* current level of berserker rage */
+	int rage;
+
+	/* number of tasks killed so far */
+	int killed;
+};
+
+/*
+ * Returns 1 if enough tasks have been killed, 0 otherwise.
+ */
+static int
+maybe_rage_kill(struct task_struct *p, void *arg)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct rage_kill_data *rkd = arg;
+	struct oom_control *oc = rkd->oc;
+	unsigned long tsk_points;
+	unsigned long tsk_overdraft;
+
+	if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+		fatal_signal_pending(p) || p->flags & PF_EXITING ||
+		oom_unkillable_task(p))
+		return 0;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+		return 0;
+
+	tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
+	if (tsk_overdraft < oc->overdraft)
+		return 0;
+
+	/*
+	 * oom_badness never returns a negative value, even if
+	 * oom_score_adj would make badness so, instead it
+	 * returns 1. So we do not kill task with badness 1 if
+	 * the victim has badness > 1 so as not to risk killing
+	 * protected tasks.
+	 */
+	if (tsk_points <= 1 && oc->chosen_points > 1)
+		return 0;
+
+	/*
+	 * Consider tasks as equally bad if they occupy equal
+	 * percentage of available memory.
+	 */
+	if (tsk_points * 100 / oc->totalpages <
+		oc->chosen_points * 100 / oc->totalpages)
+		return 0;
+
+	if (__ratelimit(&berserker_rs)) {
+		task_lock(p);
+		pr_err("Rage kill process %d (%s)\n",
+			task_pid_nr(p), p->comm);
+		task_unlock(p);
+	}
+
+	count_vm_event(OOM_KILL);
+	memcg_memory_event((is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup),
+			   MEMCG_OOM_KILL);
+
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+	++rkd->killed;
+	if (rkd->killed >= 1 << rkd->rage)
+		return 1;
+	return 0;
+}
+
 /*
  * Kill more processes if oom happens too often in this context.
  */
 static void oom_berserker(struct oom_control *oc)
 {
-	static DEFINE_RATELIMIT_STATE(berserker_rs,
-				DEFAULT_RATELIMIT_INTERVAL,
-				DEFAULT_RATELIMIT_BURST);
 	struct task_struct *p;
 	struct mem_cgroup *memcg;
 	unsigned long now = jiffies;
-	int rage;
-	int killed = 0;
+	struct rage_kill_data rkd = {
+		.oc = oc,
+		.killed = 0,
+	};
 
-	memcg = oc->memcg ?: root_mem_cgroup;
+	memcg = is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup;
 
 	spin_lock(&memcg->oom_rage_lock);
 	memcg->prev_oom_time = memcg->oom_time;
@@ -983,77 +1059,27 @@ static void oom_berserker(struct oom_control *oc)
 		memcg->oom_rage = OOM_BASE_RAGE;
 	else if (memcg->oom_rage < OOM_MAX_RAGE)
 		memcg->oom_rage++;
-	rage = memcg->oom_rage;
+	rkd.rage = memcg->oom_rage;
 	spin_unlock(&memcg->oom_rage_lock);
 
-	if (rage < 0)
+	if (rkd.rage < 0)
 		return;
 
 	/*
-	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
-	 * as bad as the victim.
+	 * So, we are in rage. Kill (1 << rage) tasks that are as bad as
+	 * the victim.
 	 */
-	read_lock(&tasklist_lock);
-	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
-		unsigned long tsk_points;
-		unsigned long tsk_overdraft;
-
-		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
-			fatal_signal_pending(p) || p->flags & PF_EXITING ||
-			oom_unkillable_task(p))
-			continue;
-
-		/*
-		 * When mem_cgroup_out_of_memory() and
-		 * p is not member of the group.
-		 */
-		if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg))
-			continue;
-
-		/* p may not have freeable memory in nodemask */
-		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
-			continue;
-
-		tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
-		if (tsk_overdraft < oc->overdraft)
-			continue;
-
-		/*
-		 * oom_badness never returns a negative value, even if
-		 * oom_score_adj would make badness so, instead it
-		 * returns 1. So we do not kill task with badness 1 if
-		 * the victim has badness > 1 so as not to risk killing
-		 * protected tasks.
-		 */
-		if (tsk_points <= 1 && oc->chosen_points > 1)
-			continue;
-
-		/*
-		 * Consider tasks as equally bad if they occupy equal
-		 * percentage of available memory.
-		 */
-		if (tsk_points * 100 / oc->totalpages <
-			oc->chosen_points * 100 / oc->totalpages)
-			continue;
-
-		if (__ratelimit(&berserker_rs)) {
-			task_lock(p);
-			pr_err("Rage kill process %d (%s)\n",
-				task_pid_nr(p), p->comm);
-			task_unlock(p);
-		}
-
-		count_vm_event(OOM_KILL);
-		memcg_memory_event(memcg, MEMCG_OOM_KILL);
-
-		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
-
-		if (++killed >= 1 << rage)
-			break;
+	if (is_memcg_oom(oc)) {
+		mem_cgroup_scan_tasks(oc->memcg, maybe_rage_kill, &rkd);
+	} else {
+		read_lock(&tasklist_lock);
+		list_for_each_entry_reverse(p, &init_task.tasks, tasks)
+			if (maybe_rage_kill(p, &rkd))
+				break;
+		read_unlock(&tasklist_lock);
 	}
-	read_unlock(&tasklist_lock);
 
-	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rkd.rage, rkd.killed);
 }
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
-- 
2.29.0