[Devel] [PATCH RHEL7 COMMIT] oom: rework locking design
    Konstantin Khorenko 
    khorenko at virtuozzo.com
       
    Thu Jan 28 08:21:27 PST 2016
    
    
  
The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.vz7.10.7
------>
commit f54e8dbdba09fe02977b9d137afa9be3fd2c5400
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date:   Thu Jan 28 20:21:27 2016 +0400
    oom: rework locking design
    
    Rebase to RHEL 7.2 based kernel:
    https://jira.sw.ru/browse/PSBM-42320
    ===
    From: Vladimir Davydov <vdavydov at parallels.com>
    
    Patchset description: oom enhancements - part 2
    
     - Patches 1-2 prepare memcg for upcoming changes in oom design.
     - Patch 3 reworks oom locking design so that the executioner waits for
       victim to exit. This is necessary to increase oom kill rate, which is
       essential for berserker mode.
     - Patch 4 drops unused OOM_SCAN_ABORT
     - Patch 5 introduces oom timeout.
       https://jira.sw.ru/browse/PSBM-38581
     - Patch 6 makes oom fairer when it comes to selecting a victim among
       different containers.
       https://jira.sw.ru/browse/PSBM-37915
     - Patch 7 prepares oom for introducing berserker mode
     - Patch 8 resurrects oom berserker mode, which is supposed to cope with
       actively forking processes.
       https://jira.sw.ru/browse/PSBM-17930
    
    https://jira.sw.ru/browse/PSBM-26973
    
    Changes in v3:
     - rework oom_trylock (patch 3)
     - select exiting process instead of aborting oom scan so as not to keep
       busy-waiting for an exiting process to exit (patches 3, 4)
     - cleanup oom timeout handling + fix stuck process trace dumped
       multiple times on timeout (patch 5)
     - set max_overdraft to ULONG_MAX on selected processes (patch 6)
     - rework oom berserker process selection logic (patches 7, 8)
    
    Changes in v2:
     - s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
     - propagate victim to the context that initiated oom in oom_unlock
       (patch 6)
     - always set oom_end on releasing oom context (patch 6)
    
    Vladimir Davydov (8):
      memcg: add mem_cgroup_get/put helpers
      memcg: add lock for protecting memcg->oom_notify list
      oom: rework locking design
      oom: introduce oom timeout
      oom: drop OOM_SCAN_ABORT
      oom: rework logic behind memory.oom_guarantee
      oom: pass points and overdraft to oom_kill_process
      oom: resurrect berserker mode
    
    Reviewed-by: Kirill Tkhai <ktkhai at odin.com>
    
    =========================================
    This patch description:
    
    Currently, after oom-killing a process, we keep busy waiting for it
    until it frees some memory and we can fulfil the allocation request that
    initiated oom. This slows down oom kill rate dramatically, because the
    oom victim has to compete for cpu time with other (possibly numerous)
    processes. The latter is unacceptable for the upcoming oom berserker,
    which triggers if oom kills happen too often.
    
    This patch reworks oom locking design as follows. Now only one process
    is allowed to invoke oom killer in a memcg (root included) and all its
    descendants; others have to wait for it to finish. Next, once a victim
    is selected, the executioner will wait for it to die before retrying
    allocation.
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Conflicts:
    	mm/memcontrol.c
    	mm/oom_kill.c
---
 include/linux/memcontrol.h |   9 ++
 include/linux/oom.h        |  13 ++-
 mm/memcontrol.c            | 130 ++++++++--------------
 mm/oom_kill.c              | 263 +++++++++++++++++++++++++++++++++------------
 mm/page_alloc.c            |   6 +-
 5 files changed, 262 insertions(+), 159 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index aa26c4c..badd4ac 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct page_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_context;
 
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
@@ -123,6 +124,7 @@ bool mem_cgroup_cleancache_disabled(struct page *page);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
 extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
 extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
 				     struct task_struct *task);
@@ -387,6 +389,13 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline struct oom_context *
+mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	extern struct oom_context oom_ctx;
+	return &oom_ctx;
+}
+
 static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
 {
 	return false;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 486fc6f..e19385d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -31,6 +31,15 @@ enum oom_scan_t {
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
+struct oom_context {
+	struct task_struct *owner;
+	struct task_struct *victim;
+	wait_queue_head_t waitq;
+};
+
+extern void init_oom_context(struct oom_context *ctx);
+extern void release_oom_context(struct oom_context *ctx);
+
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
@@ -61,8 +70,8 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
+extern bool oom_trylock(struct mem_cgroup *memcg);
+extern void oom_unlock(struct mem_cgroup *memcg);
 
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b18d72..83d7823 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,6 +296,7 @@ struct mem_cgroup {
 	atomic_long_t swap_failcnt;
 	atomic_long_t oom_kill_cnt;
 
+	struct oom_context oom_ctx;
 	unsigned long long oom_guarantee;
 
 	/*
@@ -1719,6 +1720,13 @@ void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg,
 		css_put(&memcg_to_put->css);
 }
 
+struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	return &memcg->oom_ctx;
+}
+
 unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long long limit;
@@ -2319,60 +2327,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 	return total;
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
-
-/*
- * Check OOM-Killer is already running under our hierarchy.
- * If someone is running, return false.
- */
-static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter, *failed = NULL;
-
-	spin_lock(&memcg_oom_lock);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter->oom_lock) {
-			/*
-			 * this subtree of our hierarchy is already locked
-			 * so we cannot give a lock.
-			 */
-			failed = iter;
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		} else
-			iter->oom_lock = true;
-	}
-
-	if (failed) {
-		/*
-		 * OK, we failed to lock the whole subtree so we have
-		 * to clean up what we set up to the failing subtree
-		 */
-		for_each_mem_cgroup_tree(iter, memcg) {
-			if (iter == failed) {
-				mem_cgroup_iter_break(memcg, iter);
-				break;
-			}
-			iter->oom_lock = false;
-		}
-	}
-
-	spin_unlock(&memcg_oom_lock);
-
-	return !failed;
-}
-
-static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->oom_lock = false;
-	spin_unlock(&memcg_oom_lock);
-}
-
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
@@ -2434,6 +2388,23 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
+static void memcg_wait_oom_recover(struct mem_cgroup *memcg)
+{
+	struct oom_wait_info owait;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+
+	memcg_wakeup_oom(memcg);
+}
+
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
 	if (!current->memcg_oom.may_oom)
@@ -2478,8 +2449,6 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 bool mem_cgroup_oom_synchronize(bool handle)
 {
 	struct mem_cgroup *memcg = current->memcg_oom.memcg;
-	struct oom_wait_info owait;
-	bool locked;
 
 	/* OOM is global, do not handle */
 	if (!memcg)
@@ -2488,40 +2457,19 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!handle)
 		goto cleanup;
 
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
-
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
 	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
+	if (oom_trylock(memcg)) {
 		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-					 current->memcg_oom.order);
-	} else {
-		schedule();
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		if (memcg->oom_kill_disable)
+			memcg_wait_oom_recover(memcg);
+		else
+			mem_cgroup_out_of_memory(memcg,
+						 current->memcg_oom.gfp_mask,
+						 current->memcg_oom.order);
+		oom_unlock(memcg);
 	}
+	mem_cgroup_unmark_under_oom(memcg);
 
-	if (locked) {
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	}
 cleanup:
 	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
@@ -6577,6 +6525,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
+	init_oom_context(&memcg->oom_ctx);
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
 	INIT_LIST_HEAD(&memcg->kmemcg_sharers);
@@ -6666,6 +6615,15 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
+
+	/*
+	 * A cgroup can be destroyed while somebody is waiting for its
+	 * oom context, in which case the context will never be unlocked
+	 * from oom_unlock, because the latter only iterates over live
+	 * cgroups. So we need to release the context now, when one can
+	 * no longer iterate over it.
+	 */
+	release_oom_context(&memcg->oom_ctx);
 }
 
 static void mem_cgroup_css_free(struct cgroup *cont)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1b98acd..e50621b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,35 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+static DEFINE_SPINLOCK(oom_context_lock);
+
+#ifndef CONFIG_MEMCG
+struct oom_context oom_ctx = {
+	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
+};
+#endif
+
+void init_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	init_waitqueue_head(&ctx->waitq);
+}
+
+static void __release_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	wake_up_all(&ctx->waitq);
+}
+
+void release_oom_context(struct oom_context *ctx)
+{
+	spin_lock(&oom_context_lock);
+	__release_oom_context(ctx);
+	spin_unlock(&oom_context_lock);
+}
 
 #ifdef CONFIG_NUMA
 /**
@@ -285,7 +313,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		 * to finish before killing some other task unnecessarily.
 		 */
 		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
-			return OOM_SCAN_ABORT;
+			return OOM_SCAN_SELECT;
 	}
 
 	if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task))
@@ -414,6 +442,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
  */
 void mark_oom_victim(struct task_struct *tsk)
 {
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+
 	set_tsk_thread_flag(tsk, TIF_MEMDIE);
 
 	/*
@@ -423,6 +454,19 @@ void mark_oom_victim(struct task_struct *tsk)
 	 * that TIF_MEMDIE tasks should be ignored.
 	 */
 	__thaw_task(tsk);
+
+	/*
+	 * Record the pointer to the victim in the oom context of the
+	 * owner memcg so that others can wait for it to exit. It will
+	 * be cleared in exit_oom_victim.
+	 */
+	memcg = try_get_mem_cgroup_from_mm(tsk->mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	spin_lock(&oom_context_lock);
+	if (!ctx->victim)
+		ctx->victim = tsk;
+	spin_unlock(&oom_context_lock);
+	mem_cgroup_put(memcg);
 }
 
 /**
@@ -430,7 +474,154 @@ void mark_oom_victim(struct task_struct *tsk)
  */
 void exit_oom_victim(void)
 {
+	struct mem_cgroup *iter;
+	struct oom_context *ctx;
+
 	clear_thread_flag(TIF_MEMDIE);
+
+	/*
+	 * Wake up every process waiting for this oom victim to exit.
+	 */
+	spin_lock(&oom_context_lock);
+	iter = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->victim != current)
+			continue;
+		if (!ctx->owner)
+			__release_oom_context(ctx);
+		else
+			/* To be released by owner (see oom_unlock) */
+			ctx->victim = NULL;
+	} while ((iter = mem_cgroup_iter(NULL, iter, NULL)));
+	spin_unlock(&oom_context_lock);
+}
+
+static void __wait_oom_context(struct oom_context *ctx)
+{
+	DEFINE_WAIT(wait);
+
+	if (ctx->victim == current) {
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+
+	prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+	spin_unlock(&oom_context_lock);
+	schedule();
+	finish_wait(&ctx->waitq, &wait);
+}
+
+bool oom_trylock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+	struct oom_context *ctx;
+
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Check if oom context of memcg or any of its descendants is
+	 * active, i.e. if there is a process selecting a victim or a
+	 * victim dying. If there is, wait for it to finish, otherwise
+	 * proceed to oom.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner || ctx->victim) {
+			__wait_oom_context(ctx);
+			mem_cgroup_iter_break(memcg, iter);
+			return false;
+		}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	/*
+	 * Acquire oom context of memcg and all its descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner);
+		BUG_ON(ctx->victim);
+		ctx->owner = current;
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	spin_unlock(&oom_context_lock);
+
+	return true;
+}
+
+void oom_unlock(struct mem_cgroup *memcg)
+{
+	struct task_struct *victim = NULL;
+	struct mem_cgroup *iter, *victim_memcg = NULL;
+	struct oom_context *ctx;
+
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Find oom victim if any.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner != current);
+		if (ctx->victim) {
+			victim = ctx->victim;
+			/*
+			 * Remember the victim memcg so that we can wait
+			 * on it for the victim to exit below.
+			 */
+			victim_memcg = iter;
+			mem_cgroup_get(iter);
+
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	/*
+	 * Propagate victim up to the context that initiated oom.
+	 */
+	for (iter = victim_memcg; iter; iter = parent_mem_cgroup(iter)) {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner != current);
+		if (!ctx->victim)
+			ctx->victim = victim;
+		if (iter == memcg)
+			break;
+	}
+
+	/*
+	 * Release oom context of memcg and all its descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner != current);
+		if (!ctx->victim)
+			/*
+			 * Victim already exited or nobody was killed in
+			 * this cgroup? It's our responsibility to wake
+			 * up blocked processes then.
+			 */
+			__release_oom_context(ctx);
+		else
+			/* To be released by victim (see exit_oom_victim) */
+			ctx->owner = NULL;
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	if (!victim) {
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+
+	/*
+	 * Wait for the victim to exit.
+	 */
+	ctx = mem_cgroup_oom_context(victim_memcg);
+	__wait_oom_context(ctx);
+	mem_cgroup_put(victim_memcg);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -589,56 +780,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-	int ret = 1;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		if (zone_is_oom_locked(zone)) {
-			ret = 0;
-			goto out;
-		}
-	}
-
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		/*
-		 * Lock each zone in the zonelist under zone_scan_lock so a
-		 * parallel invocation of try_set_zonelist_oom() doesn't succeed
-		 * when it shouldn't.
-		 */
-		zone_set_flag(zone, ZONE_OOM_LOCKED);
-	}
-
-out:
-	spin_unlock(&zone_scan_lock);
-	return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		zone_clear_flag(zone, ZONE_OOM_LOCKED);
-	}
-	spin_unlock(&zone_scan_lock);
-}
-
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
@@ -661,7 +802,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	unsigned long freed = 0;
 	unsigned int uninitialized_var(points);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-	int killed = 0;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
@@ -698,7 +838,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
 				 nodemask,
 				 "Out of memory (oom_kill_allocating_task)");
-		goto out;
+		return;
 	}
 
 	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
@@ -710,15 +850,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (PTR_ERR(p) != -1UL) {
 		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
 				 nodemask, "Out of memory");
-		killed = 1;
 	}
-out:
-	/*
-	 * Give the killed threads a good chance of exiting before trying to
-	 * allocate memory again.
-	 */
-	if (killed)
-		schedule_timeout_killable(1);
 }
 
 /*
@@ -728,14 +860,11 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist;
-
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
-	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+	if (oom_trylock(NULL)) {
 		out_of_memory(NULL, 0, 0, NULL, false);
-		clear_zonelist_oom(zonelist, GFP_KERNEL);
+		oom_unlock(NULL);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ddad1c2..9ab61fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2178,10 +2178,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
-		schedule_timeout_uninterruptible(1);
+	if (!oom_trylock(NULL))
 		return NULL;
-	}
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -2216,7 +2214,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_unlock(NULL);
 	return page;
 }
 
    
    
More information about the Devel
mailing list