[Devel] [PATCH RHEL COMMIT] oom: resurrect berserker mode
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Sep 24 15:04:57 MSK 2021
The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 5caeccd890d55834d65664760ba56cf3b66fe971
Author: Vladimir Davydov <vdavydov.dev at gmail.com>
Date: Fri Sep 24 15:04:56 2021 +0300
oom: resurrect berserker mode
Feature: oom: berserker mode
The logic behind the OOM berserker is the same as in PCS6: if processes
are killed by the OOM killer too often (less than sysctl vm.oom_relaxation
apart, 1 sec by default), we increase "rage" (min -10, max 20) and, once
"rage" >= 0, additionally kill up to 1 << "rage" of the youngest processes
that are at least as bad as the victim.
https://jira.sw.ru/browse/PSBM-17930
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
[aryabinin: vz8 rebase]
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
+++
oom: Restore vm.oom_relaxation sysctl
'Berserker mode' is used if the OOM killer has to act too often: if
several tasks are killed within 'oom_relaxation' interval, additional tasks
will be killed.
In VZ7, the 'vm.oom_relaxation' sysctl defined that interval, but it is
missing in VZ8. Restore it, because the default value (HZ jiffies, i.e. 1 sec)
might be too small, for example, for slow or highly loaded machines.
Done in the scope of https://jira.sw.ru/browse/PSBM-131983.
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
+++
oom: Initialize oom_rage_lock spinlock
Lockdep complained about the lock being used without initialization (the
lockdep report itself is not reproduced in this message).
Done in the scope of https://jira.sw.ru/browse/PSBM-131983.
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
(cherry picked from vz8 commit 300a06439b2754e3486a00b68c2753e6c27a8f16)
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
include/linux/memcontrol.h | 12 +++++
include/linux/oom.h | 6 +++
kernel/sysctl.c | 7 +++
mm/memcontrol.c | 27 ++++++++++++
mm/oom_kill.c | 107 +++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 159 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 84bacc521142..b1feb6a36da0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -278,6 +278,11 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
+ int oom_rage;
+ spinlock_t oom_rage_lock;
+ unsigned long prev_oom_time;
+ unsigned long oom_time;
+
/* memory.events and memory.events.local */
struct cgroup_file events_file;
struct cgroup_file events_local_file;
@@ -771,6 +776,7 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
struct lruvec *lock_page_lruvec(struct page *page);
@@ -1267,6 +1273,12 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return true;
}
+/*
+ * Stub variant of task_in_mem_cgroup(): every task is reported as a member.
+ * The parameter types mirror the out-of-line prototype, so a caller builds
+ * the same way against either variant.
+ */
+static inline bool task_in_mem_cgroup(struct task_struct *task,
+				      struct mem_cgroup *memcg)
+{
+	return true;
+}
+
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
return NULL;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 3f3f23a785fc..3b31f4256aab 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -22,6 +22,10 @@ enum oom_constraint {
CONSTRAINT_MEMCG,
};
+
+/*
+ * Bounds of the "rage" counter maintained by oom_berserker(): rage is reset
+ * to OOM_BASE_RAGE, grows by one per OOM up to OOM_MAX_RAGE, and extra tasks
+ * are killed only while rage is >= 0.
+ */
+#define OOM_BASE_RAGE -10
+#define OOM_MAX_RAGE 20
+
/*
* Details of the page allocation that triggered the oom killer that are used to
* determine what should be killed.
@@ -51,6 +55,7 @@ struct oom_control {
unsigned long totalpages;
struct task_struct *chosen;
long chosen_points;
+ unsigned long overdraft;
/* Used to print the constraint info. */
enum oom_constraint constraint;
@@ -144,4 +149,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
+extern int sysctl_oom_relaxation;
#endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5824d5dd2e1d..081e42171745 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2740,6 +2740,13 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "oom_relaxation",
+ .data = &sysctl_oom_relaxation,
+ .maxlen = sizeof(sysctl_oom_relaxation),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
{
.procname = "overcommit_ratio",
.data = &sysctl_overcommit_ratio,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3f56f33cf6df..6d882c660c21 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1294,6 +1294,32 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
*lru_size += nr_pages;
}
+/*
+ * Report whether @task is accounted to @memcg, as decided by
+ * mem_cgroup_is_descendant() on the memcg resolved for the task.
+ *
+ * The memcg is looked up through the mm of a thread found by
+ * find_lock_task_mm(); if no thread has an mm any more, it is taken from
+ * the task itself. get_mem_cgroup_from_mm() is expected to return a
+ * referenced memcg; the fallback path takes its reference explicitly.
+ * Either way the reference is dropped before returning.
+ */
+bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
+{
+	struct task_struct *mm_owner = find_lock_task_mm(task);
+	struct mem_cgroup *owner_memcg;
+	bool is_member;
+
+	if (!mm_owner) {
+		/*
+		 * Every thread may already have detached its mm, yet the oom
+		 * killer still has to recognise such tasks as already oom
+		 * killed, so that it does not kill additional tasks needlessly.
+		 */
+		rcu_read_lock();
+		owner_memcg = mem_cgroup_from_task(task);
+		css_get(&owner_memcg->css);
+		rcu_read_unlock();
+	} else {
+		owner_memcg = get_mem_cgroup_from_mm(mm_owner->mm);
+		/* find_lock_task_mm() returned mm_owner with its task lock held */
+		task_unlock(mm_owner);
+	}
+
+	is_member = mem_cgroup_is_descendant(owner_memcg, memcg);
+	css_put(&owner_memcg->css);
+
+	return is_member;
+}
+
#ifdef CONFIG_CLEANCACHE
bool mem_cgroup_cleancache_disabled(struct page *page)
{
@@ -5326,6 +5352,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
+ spin_lock_init(&memcg->oom_rage_lock);
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0bff802b1887..60c371c655af 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -55,6 +55,7 @@
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
+/*
+ * OOM relaxation interval in jiffies (default: HZ). oom_berserker() resets
+ * the rage of a context whose previous OOM is older than this. Exposed as
+ * the "oom_relaxation" entry of vm_table via proc_dointvec_ms_jiffies.
+ */
+int sysctl_oom_relaxation = HZ;
/*
* Serializes oom killer invocations (out_of_memory()) from all contexts to
@@ -969,6 +970,111 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
return 0;
}
+/*
+ * Kill more processes if oom happens too often in this context.
+ *
+ * The context is oc->memcg, or root_mem_cgroup when oc->memcg is NULL.
+ * Each call stamps the context's oom_time and either resets its rage to
+ * OOM_BASE_RAGE (previous stamp older than sysctl_oom_relaxation jiffies)
+ * or raises it by one, up to OOM_MAX_RAGE. While rage is negative nothing
+ * else happens; otherwise up to (1 << rage) further tasks are sent SIGKILL.
+ *
+ * @oc: control structure of the current OOM; its memcg, totalpages,
+ *      chosen_points and overdraft fields are read here.
+ */
+static void oom_berserker(struct oom_control *oc)
+{
+ static DEFINE_RATELIMIT_STATE(berserker_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct task_struct *p;
+ struct mem_cgroup *memcg;
+ unsigned long now = jiffies;
+ int rage;
+ int killed = 0;
+
+ /* Global (non-memcg) OOMs keep their rage state in the root memcg. */
+ memcg = oc->memcg ?: root_mem_cgroup;
+
+ /* oom_rage_lock covers the timestamps and the rage counter only. */
+ spin_lock(&memcg->oom_rage_lock);
+ memcg->prev_oom_time = memcg->oom_time;
+ memcg->oom_time = now;
+ /*
+ * Increase rage if oom happened recently in this context, reset
+ * rage otherwise.
+ *
+ * previous oom this oom (unfinished)
+ * +++++++++----------------------------++++++++
+ * ^ ^
+ * prev_oom_time <<oom_relaxation>> oom_time
+ */
+ if (time_after(now, memcg->prev_oom_time + sysctl_oom_relaxation))
+ memcg->oom_rage = OOM_BASE_RAGE;
+ else if (memcg->oom_rage < OOM_MAX_RAGE)
+ memcg->oom_rage++;
+ /* Snapshot under the lock; the rest of the function uses the copy. */
+ rage = memcg->oom_rage;
+ spin_unlock(&memcg->oom_rage_lock);
+
+ /* Not enraged yet: no kills beyond the regular victim. */
+ if (rage < 0)
+ return;
+
+ /*
+ * So, we are in rage. Kill (1 << rage) youngest tasks that are
+ * as bad as the victim.
+ */
+ read_lock(&tasklist_lock);
+ list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+ unsigned long tsk_points;
+ unsigned long tsk_overdraft;
+
+ /*
+ * Skip tasks without an mm, tasks already marked TIF_MEMDIE,
+ * tasks that are dying or exiting anyway, and tasks that
+ * oom_unkillable_task() rejects.
+ */
+ if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+ fatal_signal_pending(p) || p->flags & PF_EXITING ||
+ oom_unkillable_task(p))
+ continue;
+
+ /*
+ * For a memcg OOM (mem_cgroup_out_of_memory()), skip p if it
+ * is not a member of the group.
+ */
+ if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg))
+ continue;
+
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ continue;
+
+ /* Only tasks whose overdraft is at least oc->overdraft qualify. */
+ tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
+ if (tsk_overdraft < oc->overdraft)
+ continue;
+
+ /*
+ * oom_badness never returns a negative value, even if
+ * oom_score_adj would make badness so, instead it
+ * returns 1. So we do not kill task with badness 1 if
+ * the victim has badness > 1 so as not to risk killing
+ * protected tasks.
+ */
+ if (tsk_points <= 1 && oc->chosen_points > 1)
+ continue;
+
+ /*
+ * Consider tasks as equally bad if they have equal
+ * normalized scores.
+ *
+ * NOTE(review): divides by oc->totalpages; presumably never
+ * zero on this path - confirm against the callers.
+ */
+ if (tsk_points * 1000 / oc->totalpages <
+ oc->chosen_points * 1000 / oc->totalpages)
+ continue;
+
+ /* Per-task message is rate limited; the kill itself is not. */
+ if (__ratelimit(&berserker_rs)) {
+ task_lock(p);
+ pr_err("Rage kill process %d (%s)\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
+ }
+
+ /* The kill is accounted to the OOM context, not to p's own memcg. */
+ count_vm_event(OOM_KILL);
+ memcg_memory_event(memcg, MEMCG_OOM_KILL);
+
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+ /* Stop once the rage-dependent quota of extra kills is reached. */
+ if (++killed >= 1 << rage)
+ break;
+ }
+ read_unlock(&tasklist_lock);
+
+ /* Summary line, printed on every enraged invocation (not rate limited). */
+ pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
static void oom_kill_process(struct oom_control *oc, const char *message)
{
struct task_struct *victim = oc->chosen;
@@ -1012,6 +1118,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
(void *)message);
mem_cgroup_put(oom_group);
}
+ oom_berserker(oc);
}
/*
More information about the Devel
mailing list