[Devel] [PATCH RH8] oom: Scan only the relevant memcg when in berserker mode
Evgenii Shatokhin
eshatokhin at virtuozzo.com
Wed Jul 28 23:04:56 MSK 2021
Even when an OOM happened in a memory cgroup, oom_berserker() iterated
over all tasks in the system and called task_in_mem_cgroup() for each
one to find the tasks belonging to the relevant memcg.
There is no need to scan all tasks, though: select_bad_process() and
other functions already rely on mem_cgroup_scan_tasks() to iterate over
the tasks from a given memory cgroup only.
oom_berserker() has now been reworked to use mem_cgroup_scan_tasks() as
well. task_in_mem_cgroup() is no longer needed after that and has been
removed.
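For illustration only, here is a minimal sketch of the callback pattern
mem_cgroup_scan_tasks() expects: the iterator invokes the supplied
function for each task in the memcg (and its descendants) and stops as
soon as the callback returns a non-zero value. The callback and counter
below are made up for the example; the real callback added by this patch
is maybe_rage_kill() in the diff.

static int example_count_task(struct task_struct *task, void *arg)
{
        int *count = arg;

        /* Called once for each task in the memcg and its descendants. */
        (*count)++;

        /* Returning a non-zero value would stop the iteration early. */
        return 0;
}

static void example_count_memcg_tasks(struct mem_cgroup *memcg)
{
        int count = 0;

        mem_cgroup_scan_tasks(memcg, example_count_task, &count);
        pr_info("memcg has %d task(s)\n", count);
}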
There is one potential disadvantage with this approach, though. Walking
through the whole list of tasks in the system allowed oom_berserker() to
pick the youngest tasks in the memcg that were no better than the OOM
killer's previous target. mem_cgroup_scan_tasks(), however, iterates over
the tasks in an unspecified order. It seems to process the cgroups from
parent to children, and the tasks in each cgroup from the oldest added to
the newest, but this could change in the future. As a result,
oom_berserker() can still select tasks that are no better than the
target, but not necessarily the youngest ones.
It is not clear whether this is a real problem, though. oom_berserker()
triggers only when the normal OOM killer kills tasks very often. If some
system task eats up lots of memory and gets shot as a result, there is
something wrong with that task already. It would likely be shot by the
normal OOM killer anyway, just a bit later.
https://jira.sw.ru/browse/PSBM-131984
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
---
 include/linux/memcontrol.h |   7 --
 mm/memcontrol.c            |  26 ------
 mm/oom_kill.c              | 164 +++++++++++++++++++++----------------
 3 files changed, 95 insertions(+), 102 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5458f18c0ae7..c6244a62f376 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -551,7 +551,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
@@ -1096,12 +1095,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return true;
}
-static inline bool task_in_mem_cgroup(struct task_struct *task,
- const struct mem_cgroup *memcg)
-{
- return true;
-}
-
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f9a1c3c4f85e..424bc1e52b3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1462,32 +1462,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
*lru_size += nr_pages;
}
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
- struct mem_cgroup *task_memcg;
- struct task_struct *p;
- bool ret;
-
- p = find_lock_task_mm(task);
- if (p) {
- task_memcg = get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
- } else {
- /*
- * All threads may have already detached their mm's, but the oom
- * killer still needs to detect if they have already been oom
- * killed to prevent needlessly killing additional tasks.
- */
- rcu_read_lock();
- task_memcg = mem_cgroup_from_task(task);
- css_get(&task_memcg->css);
- rcu_read_unlock();
- }
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
- css_put(&task_memcg->css);
- return ret;
-}
-
#ifdef CONFIG_CLEANCACHE
bool mem_cgroup_cleancache_disabled(struct page *page)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a64a9ff7391b..382a85b64c1f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -951,21 +951,97 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
return 0;
}
+/*
+ * The data used when iterating over tasks in "rage/berserker" mode.
+ */
+struct rage_kill_data {
+ struct oom_control *oc;
+
+ /* current level of berserker rage */
+ int rage;
+
+ /* number of tasks killed so far */
+ int killed;
+};
+
+/*
+ * Returns 1 if enough tasks have been killed, 0 otherwise.
+ */
+static int
+maybe_rage_kill(struct task_struct *p, void *arg)
+{
+ static DEFINE_RATELIMIT_STATE(berserker_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct rage_kill_data *rkd = arg;
+ struct oom_control *oc = rkd->oc;
+ unsigned long tsk_points;
+ unsigned long tsk_overdraft;
+
+ if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+ fatal_signal_pending(p) || p->flags & PF_EXITING ||
+ oom_unkillable_task(p))
+ return 0;
+
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ return 0;
+
+ tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
+ if (tsk_overdraft < oc->overdraft)
+ return 0;
+
+ /*
+ * oom_badness never returns a negative value, even if
+ * oom_score_adj would make badness so, instead it
+ * returns 1. So we do not kill task with badness 1 if
+ * the victim has badness > 1 so as not to risk killing
+ * protected tasks.
+ */
+ if (tsk_points <= 1 && oc->chosen_points > 1)
+ return 0;
+
+ /*
+ * Consider tasks as equally bad if they occupy equal
+ * percentage of available memory.
+ */
+ if (tsk_points * 100 / oc->totalpages <
+ oc->chosen_points * 100 / oc->totalpages)
+ return 0;
+
+ if (__ratelimit(&berserker_rs)) {
+ task_lock(p);
+ pr_err("Rage kill process %d (%s)\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
+ }
+
+ count_vm_event(OOM_KILL);
+ memcg_memory_event((is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup),
+ MEMCG_OOM_KILL);
+
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+ ++rkd->killed;
+ if (rkd->killed >= 1 << rkd->rage)
+ return 1;
+ return 0;
+}
+
/*
* Kill more processes if oom happens too often in this context.
*/
static void oom_berserker(struct oom_control *oc)
{
- static DEFINE_RATELIMIT_STATE(berserker_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
struct task_struct *p;
struct mem_cgroup *memcg;
unsigned long now = jiffies;
- int rage;
- int killed = 0;
+ struct rage_kill_data rkd = {
+ .oc = oc,
+ .killed = 0,
+ };
- memcg = oc->memcg ?: root_mem_cgroup;
+ memcg = is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup;
spin_lock(&memcg->oom_rage_lock);
memcg->prev_oom_time = memcg->oom_time;
@@ -983,77 +1059,27 @@ static void oom_berserker(struct oom_control *oc)
memcg->oom_rage = OOM_BASE_RAGE;
else if (memcg->oom_rage < OOM_MAX_RAGE)
memcg->oom_rage++;
- rage = memcg->oom_rage;
+ rkd.rage = memcg->oom_rage;
spin_unlock(&memcg->oom_rage_lock);
- if (rage < 0)
+ if (rkd.rage < 0)
return;
/*
- * So, we are in rage. Kill (1 << rage) youngest tasks that are
- * as bad as the victim.
+ * So, we are in rage. Kill (1 << rage) tasks that are as bad as
+ * the victim.
*/
- read_lock(&tasklist_lock);
- list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
- unsigned long tsk_points;
- unsigned long tsk_overdraft;
-
- if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
- fatal_signal_pending(p) || p->flags & PF_EXITING ||
- oom_unkillable_task(p))
- continue;
-
- /*
- * When mem_cgroup_out_of_memory() and
- * p is not member of the group.
- */
- if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg))
- continue;
-
- /* p may not have freeable memory in nodemask */
- if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
- continue;
-
- tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
- if (tsk_overdraft < oc->overdraft)
- continue;
-
- /*
- * oom_badness never returns a negative value, even if
- * oom_score_adj would make badness so, instead it
- * returns 1. So we do not kill task with badness 1 if
- * the victim has badness > 1 so as not to risk killing
- * protected tasks.
- */
- if (tsk_points <= 1 && oc->chosen_points > 1)
- continue;
-
- /*
- * Consider tasks as equally bad if they occupy equal
- * percentage of available memory.
- */
- if (tsk_points * 100 / oc->totalpages <
- oc->chosen_points * 100 / oc->totalpages)
- continue;
-
- if (__ratelimit(&berserker_rs)) {
- task_lock(p);
- pr_err("Rage kill process %d (%s)\n",
- task_pid_nr(p), p->comm);
- task_unlock(p);
- }
-
- count_vm_event(OOM_KILL);
- memcg_memory_event(memcg, MEMCG_OOM_KILL);
-
- do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
-
- if (++killed >= 1 << rage)
- break;
+ if (is_memcg_oom(oc)) {
+ mem_cgroup_scan_tasks(oc->memcg, maybe_rage_kill, &rkd);
+ } else {
+ read_lock(&tasklist_lock);
+ list_for_each_entry_reverse(p, &init_task.tasks, tasks)
+ if (maybe_rage_kill(p, &rkd))
+ break;
+ read_unlock(&tasklist_lock);
}
- read_unlock(&tasklist_lock);
- pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+ pr_err("OOM killer in rage %d: %d tasks killed\n", rkd.rage, rkd.killed);
}
static void oom_kill_process(struct oom_control *oc, const char *message)
--
2.29.0