[Devel] [PATCH RHEL7 COMMIT] oom: introduce oom timeout
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jan 28 08:21:28 PST 2016
The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.vz7.10.7
------>
commit 522b3faa45d160bb1dc4903bdf524286d5a543d4
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date: Thu Jan 28 20:21:28 2016 +0400
oom: introduce oom timeout
Rebase to RHEL 7.2 based kernel:
https://jira.sw.ru/browse/PSBM-42320
===
From: Vladimir Davydov <vdavydov at parallels.com>
Patchset description: oom enhancements - part 2
- Patches 1-2 prepare memcg for upcoming changes in oom design.
- Patch 3 reworks oom locking design so that the executioner waits for
victim to exit. This is necessary to increase oom kill rate, which is
essential for berserker mode.
- Patch 4 drops unused OOM_SCAN_ABORT
- Patch 5 introduces oom timeout.
https://jira.sw.ru/browse/PSBM-38581
- Patch 6 makes oom fairer when it comes to selecting a victim among
different containers.
https://jira.sw.ru/browse/PSBM-37915
- Patch 7 prepares oom for introducing berserker mode
- Patch 8 resurrects oom berserker mode, which is supposed to cope with
actively forking processes.
https://jira.sw.ru/browse/PSBM-17930
https://jira.sw.ru/browse/PSBM-26973
Changes in v3:
- rework oom_trylock (patch 3)
- select exiting process instead of aborting oom scan so as not to keep
busy-waiting for an exiting process to exit (patches 3, 4)
- cleanup oom timeout handling + fix stuck process trace dumped
multiple times on timeout (patch 5)
- set max_overdraft to ULONG_MAX on selected processes (patch 6)
- rework oom berserker process selection logic (patches 7, 8)
Changes in v2:
- s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
- propagate victim to the context that initiated oom in oom_unlock
(patch 6)
- always set oom_end on releasing oom context (patch 6)
Vladimir Davydov (8):
memcg: add mem_cgroup_get/put helpers
memcg: add lock for protecting memcg->oom_notify list
oom: rework locking design
oom: introduce oom timeout
oom: drop OOM_SCAN_ABORT
oom: rework logic behind memory.oom_guarantee
oom: pass points and overdraft to oom_kill_process
oom: resurrect berserker mode
Reviewed-by: Kirill Tkhai <ktkhai at odin.com>
=========================================
This patch description:
Currently, we won't select a new oom victim until the previous one has
passed away. This might lead to a deadlock if an allocating task holds a
lock needed by the victim to complete. To cope with this problem, this
patch introduces an oom timeout, after which a new task will be selected
even if the previous victim hasn't died. The timeout is hard-coded and
equals 5 seconds.
https://jira.sw.ru/browse/PSBM-38581
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
include/linux/oom.h | 2 ++
mm/oom_kill.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 54 insertions(+), 8 deletions(-)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index e19385d..f804551 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -34,6 +34,8 @@ enum oom_scan_t {
struct oom_context {
struct task_struct *owner;
struct task_struct *victim;
+ bool marked;
+ unsigned long oom_start;
wait_queue_head_t waitq;
};
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e50621b..fd6defa7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,8 @@ int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(oom_context_lock);
+#define OOM_TIMEOUT (5 * HZ)
+
#ifndef CONFIG_MEMCG
struct oom_context oom_ctx = {
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
@@ -55,6 +57,8 @@ void init_oom_context(struct oom_context *ctx)
{
ctx->owner = NULL;
ctx->victim = NULL;
+ ctx->marked = false;
+ ctx->oom_start = 0;
init_waitqueue_head(&ctx->waitq);
}
@@ -62,6 +66,7 @@ static void __release_oom_context(struct oom_context *ctx)
{
ctx->owner = NULL;
ctx->victim = NULL;
+ ctx->marked = false;
wake_up_all(&ctx->waitq);
}
@@ -291,11 +296,14 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* This task already has access to memory reserves and is being killed.
- * Don't allow any other task to have access to the reserves.
+ * Try to select another one.
+ *
+ * This can only happen if oom_trylock timeout-ed, which most probably
+ * means that the victim had dead-locked.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
if (!force_kill)
- return OOM_SCAN_ABORT;
+ return OOM_SCAN_CONTINUE;
}
if (!task->mm)
return OOM_SCAN_CONTINUE;
@@ -463,8 +471,10 @@ void mark_oom_victim(struct task_struct *tsk)
memcg = try_get_mem_cgroup_from_mm(tsk->mm);
ctx = mem_cgroup_oom_context(memcg);
spin_lock(&oom_context_lock);
- if (!ctx->victim)
+ if (!ctx->victim) {
ctx->victim = tsk;
+ ctx->marked = true;
+ }
spin_unlock(&oom_context_lock);
mem_cgroup_put(memcg);
}
@@ -499,21 +509,26 @@ void exit_oom_victim(void)
static void __wait_oom_context(struct oom_context *ctx)
{
+ unsigned long now = jiffies;
+ unsigned long timeout;
DEFINE_WAIT(wait);
- if (ctx->victim == current) {
+ if (ctx->victim == current ||
+ time_after_eq(now, ctx->oom_start + OOM_TIMEOUT)) {
spin_unlock(&oom_context_lock);
return;
}
prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+ timeout = ctx->oom_start + OOM_TIMEOUT - now;
spin_unlock(&oom_context_lock);
- schedule();
+ schedule_timeout(timeout);
finish_wait(&ctx->waitq, &wait);
}
bool oom_trylock(struct mem_cgroup *memcg)
{
+ unsigned long now = jiffies;
struct mem_cgroup *iter;
struct oom_context *ctx;
@@ -528,10 +543,32 @@ bool oom_trylock(struct mem_cgroup *memcg)
iter = mem_cgroup_iter(memcg, NULL, NULL);
do {
ctx = mem_cgroup_oom_context(iter);
- if (ctx->owner || ctx->victim) {
+ if ((ctx->owner || ctx->victim) &&
+ time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
__wait_oom_context(ctx);
mem_cgroup_iter_break(memcg, iter);
return false;
+ } else if (ctx->owner || ctx->victim) {
+ /*
+ * Timeout. Release the context and dump stack
+ * trace of the stuck process.
+ *
+ * To avoid dumping stack trace of the same task
+ * more than once, we mark the context that
+ * contained the victim when it was killed (see
+ * mark_oom_victim).
+ */
+ struct task_struct *p = ctx->victim;
+
+ if (p && ctx->marked) {
+ task_lock(p);
+ pr_err("OOM kill timeout: %d (%s)\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
+ show_stack(p, NULL);
+ }
+
+ __release_oom_context(ctx);
}
} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
@@ -544,6 +581,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
BUG_ON(ctx->owner);
BUG_ON(ctx->victim);
ctx->owner = current;
+ ctx->oom_start = now;
} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
spin_unlock(&oom_context_lock);
@@ -565,7 +603,11 @@ void oom_unlock(struct mem_cgroup *memcg)
iter = mem_cgroup_iter(memcg, NULL, NULL);
do {
ctx = mem_cgroup_oom_context(iter);
- BUG_ON(ctx->owner != current);
+ if (ctx->owner != current) {
+ /* Lost ownership on timeout */
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
if (ctx->victim) {
victim = ctx->victim;
/*
@@ -598,7 +640,9 @@ void oom_unlock(struct mem_cgroup *memcg)
iter = mem_cgroup_iter(memcg, NULL, NULL);
do {
ctx = mem_cgroup_oom_context(iter);
- BUG_ON(ctx->owner != current);
+ if (ctx->owner != current)
+ /* Lost ownership on timeout */
+ continue;
if (!ctx->victim)
/*
* Victim already exited or nobody was killed in
More information about the Devel
mailing list