[Devel] [PATCH RHEL7 COMMIT] oom: introduce oom timeout

Thu Jan 28 08:21:28 PST 2016

The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.vz7.10.7
------>
commit 522b3faa45d160bb1dc4903bdf524286d5a543d4
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date:   Thu Jan 28 20:21:28 2016 +0400

    oom: introduce oom timeout
    
    Rebase to RHEL 7.2 based kernel:
    https://jira.sw.ru/browse/PSBM-42320
    ===
    From: Vladimir Davydov <vdavydov at parallels.com>
    
    Patchset description: oom enhancements - part 2
    
     - Patches 1-2 prepare memcg for upcoming changes in oom design.
     - Patch 3 reworks oom locking design so that the executioner waits for
       victim to exit. This is necessary to increase oom kill rate, which is
       essential for berserker mode.
     - Patch 4 drops unused OOM_SCAN_ABORT
     - Patch 5 introduces oom timeout.
       https://jira.sw.ru/browse/PSBM-38581
     - Patch 6 makes oom fairer when it comes to selecting a victim among
       different containers.
       https://jira.sw.ru/browse/PSBM-37915
     - Patch 7 prepares oom for introducing berserker mode
     - Patch 8 resurrects oom berserker mode, which is supposed to cope with
       actively forking processes.
       https://jira.sw.ru/browse/PSBM-17930
    
    https://jira.sw.ru/browse/PSBM-26973
    
    Changes in v3:
     - rework oom_trylock (patch 3)
     - select exiting process instead of aborting oom scan so as not to keep
       busy-waiting for an exiting process to exit (patches 3, 4)
     - clean up oom timeout handling + fix stuck process trace being dumped
       multiple times on timeout (patch 5)
     - set max_overdraft to ULONG_MAX on selected processes (patch 6)
     - rework oom berserker process selection logic (patches 7, 8)
    
    Changes in v2:
     - s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
     - propagate victim to the context that initiated oom in oom_unlock
       (patch 6)
     - always set oom_end on releasing oom context (patch 6)
    
    Vladimir Davydov (8):
      memcg: add mem_cgroup_get/put helpers
      memcg: add lock for protecting memcg->oom_notify list
      oom: rework locking design
      oom: introduce oom timeout
      oom: drop OOM_SCAN_ABORT
      oom: rework logic behind memory.oom_guarantee
      oom: pass points and overdraft to oom_kill_process
      oom: resurrect berserker mode
    
    Reviewed-by: Kirill Tkhai <ktkhai at odin.com>
    
    =========================================
    This patch description:
    
    Currently, we won't select a new oom victim until the previous one has
    passed away. This might lead to a deadlock if an allocating task holds a
    lock needed by the victim to complete. To cope with this problem, this
    patch introduces an oom timeout, after which a new task will be selected
    even if the previous victim hasn't died. The timeout is hard-coded and
    equals 5 seconds.
    
    https://jira.sw.ru/browse/PSBM-38581
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/oom.h |  2 ++
 mm/oom_kill.c       | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index e19385d..f804551 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -34,6 +34,8 @@ enum oom_scan_t {
 struct oom_context {
 	struct task_struct *owner;
 	struct task_struct *victim;
+	bool marked;
+	unsigned long oom_start;
 	wait_queue_head_t waitq;
 };
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e50621b..fd6defa7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,8 @@ int sysctl_oom_dump_tasks;
 
 static DEFINE_SPINLOCK(oom_context_lock);
 
+#define OOM_TIMEOUT	(5 * HZ)
+
 #ifndef CONFIG_MEMCG
 struct oom_context oom_ctx = {
 	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
@@ -55,6 +57,8 @@ void init_oom_context(struct oom_context *ctx)
 {
 	ctx->owner = NULL;
 	ctx->victim = NULL;
+	ctx->marked = false;
+	ctx->oom_start = 0;
 	init_waitqueue_head(&ctx->waitq);
 }
 
@@ -62,6 +66,7 @@ static void __release_oom_context(struct oom_context *ctx)
 {
 	ctx->owner = NULL;
 	ctx->victim = NULL;
+	ctx->marked = false;
 	wake_up_all(&ctx->waitq);
 }
 
@@ -291,11 +296,14 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 
 	/*
 	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves.
+	 * Try to select another one.
+	 *
+	 * This can only happen if oom_trylock timed out, which most probably
+	 * means that the victim has deadlocked.
 	 */
 	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
 		if (!force_kill)
-			return OOM_SCAN_ABORT;
+			return OOM_SCAN_CONTINUE;
 	}
 	if (!task->mm)
 		return OOM_SCAN_CONTINUE;
@@ -463,8 +471,10 @@ void mark_oom_victim(struct task_struct *tsk)
 	memcg = try_get_mem_cgroup_from_mm(tsk->mm);
 	ctx = mem_cgroup_oom_context(memcg);
 	spin_lock(&oom_context_lock);
-	if (!ctx->victim)
+	if (!ctx->victim) {
 		ctx->victim = tsk;
+		ctx->marked = true;
+	}
 	spin_unlock(&oom_context_lock);
 	mem_cgroup_put(memcg);
 }
@@ -499,21 +509,26 @@ void exit_oom_victim(void)
 
 static void __wait_oom_context(struct oom_context *ctx)
 {
+	unsigned long now = jiffies;
+	unsigned long timeout;
 	DEFINE_WAIT(wait);
 
-	if (ctx->victim == current) {
+	if (ctx->victim == current ||
+	    time_after_eq(now, ctx->oom_start + OOM_TIMEOUT)) {
 		spin_unlock(&oom_context_lock);
 		return;
 	}
 
 	prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+	timeout = ctx->oom_start + OOM_TIMEOUT - now;
 	spin_unlock(&oom_context_lock);
-	schedule();
+	schedule_timeout(timeout);
 	finish_wait(&ctx->waitq, &wait);
 }
 
 bool oom_trylock(struct mem_cgroup *memcg)
 {
+	unsigned long now = jiffies;
 	struct mem_cgroup *iter;
 	struct oom_context *ctx;
 
@@ -528,10 +543,32 @@ bool oom_trylock(struct mem_cgroup *memcg)
 	iter = mem_cgroup_iter(memcg, NULL, NULL);
 	do {
 		ctx = mem_cgroup_oom_context(iter);
-		if (ctx->owner || ctx->victim) {
+		if ((ctx->owner || ctx->victim) &&
+		    time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
 			__wait_oom_context(ctx);
 			mem_cgroup_iter_break(memcg, iter);
 			return false;
+		} else if (ctx->owner || ctx->victim) {
+			/*
+			 * Timeout. Release the context and dump stack
+			 * trace of the stuck process.
+			 *
+			 * To avoid dumping stack trace of the same task
+			 * more than once, we mark the context that
+			 * contained the victim when it was killed (see
+			 * mark_oom_victim).
+			 */
+			struct task_struct *p = ctx->victim;
+
+			if (p && ctx->marked) {
+				task_lock(p);
+				pr_err("OOM kill timeout: %d (%s)\n",
+				       task_pid_nr(p), p->comm);
+				task_unlock(p);
+				show_stack(p, NULL);
+			}
+
+			__release_oom_context(ctx);
 		}
 	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
@@ -544,6 +581,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
 		BUG_ON(ctx->owner);
 		BUG_ON(ctx->victim);
 		ctx->owner = current;
+		ctx->oom_start = now;
 	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
 	spin_unlock(&oom_context_lock);
@@ -565,7 +603,11 @@ void oom_unlock(struct mem_cgroup *memcg)
 	iter = mem_cgroup_iter(memcg, NULL, NULL);
 	do {
 		ctx = mem_cgroup_oom_context(iter);
-		BUG_ON(ctx->owner != current);
+		if (ctx->owner != current) {
+			/* Lost ownership on timeout */
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
 		if (ctx->victim) {
 			victim = ctx->victim;
 			/*
@@ -598,7 +640,9 @@ void oom_unlock(struct mem_cgroup *memcg)
 	iter = mem_cgroup_iter(memcg, NULL, NULL);
 	do {
 		ctx = mem_cgroup_oom_context(iter);
-		BUG_ON(ctx->owner != current);
+		if (ctx->owner != current)
+			/* Lost ownership on timeout */
+			continue;
 		if (!ctx->victim)
 			/*
 			 * Victim already exited or nobody was killed in