[Devel] [PATCH rh7 3/6] oom: rework locking design

Vladimir Davydov <vdavydov at parallels.com>
Fri Sep 11 02:54:29 PDT 2015


Currently, after oom-killing a process, we keep busy-waiting for it
until it frees some memory and we can fulfil the allocation request
that initiated the oom. This dramatically slows down the oom kill rate,
because the oom victim has to compete for cpu time with other (possibly
numerous) processes. Such a slowdown is unacceptable for the upcoming
oom berserker, which triggers if oom kills happen too often.

This patch reworks the oom locking design as follows. Now only one
process at a time is allowed to invoke the oom killer in a memcg (root
included) and all its descendants; others have to wait for it to
finish. Next, once a victim is selected, the executioner will wait for
it to die before retrying the allocation.

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/memcontrol.h |   9 +++
 include/linux/oom.h        |  13 ++-
 mm/memcontrol.c            | 122 +++++++++-------------------
 mm/oom_kill.c              | 198 ++++++++++++++++++++++++++++++++-------------
 mm/page_alloc.c            |   6 +-
 5 files changed, 202 insertions(+), 146 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 548a82cde972..591132758911 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct page_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_context;
 
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
@@ -120,6 +121,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
 extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
 extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
 				     struct task_struct *task);
@@ -363,6 +365,13 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline struct oom_context *
+mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	extern struct oom_context oom_ctx;
+	return &oom_ctx;
+}
+
 static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
 {
 	return false;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 486fc6f3cdc1..e19385dd29aa 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -31,6 +31,15 @@ enum oom_scan_t {
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
+struct oom_context {
+	struct task_struct *owner;
+	struct task_struct *victim;
+	wait_queue_head_t waitq;
+};
+
+extern void init_oom_context(struct oom_context *ctx);
+extern void release_oom_context(struct oom_context *ctx);
+
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
@@ -61,8 +70,8 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
+extern bool oom_trylock(struct mem_cgroup *memcg);
+extern void oom_unlock(struct mem_cgroup *memcg);
 
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index df98b3dfe0f1..de3ad32ecb89 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,6 +296,7 @@ struct mem_cgroup {
 	atomic_long_t swap_failcnt;
 	atomic_long_t oom_kill_cnt;
 
+	struct oom_context oom_ctx;
 	unsigned long long oom_guarantee;
 
 	/*
@@ -1655,6 +1656,13 @@ void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg,
 		css_put(&memcg_to_put->css);
 }
 
+struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	return &memcg->oom_ctx;
+}
+
 unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long long limit;
@@ -2255,57 +2263,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 	return total;
 }
 
-/*
- * Check OOM-Killer is already running under our hierarchy.
- * If someone is running, return false.
- * Has to be called with memcg_oom_lock
- */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter, *failed = NULL;
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter->oom_lock) {
-			/*
-			 * this subtree of our hierarchy is already locked
-			 * so we cannot give a lock.
-			 */
-			failed = iter;
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		} else
-			iter->oom_lock = true;
-	}
-
-	if (!failed)
-		return true;
-
-	/*
-	 * OK, we failed to lock the whole subtree so we have to clean up
-	 * what we set up to the failing subtree
-	 */
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter == failed) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		}
-		iter->oom_lock = false;
-	}
-	return false;
-}
-
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->oom_lock = false;
-	return 0;
-}
-
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
@@ -2327,7 +2284,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -2367,57 +2323,42 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
- */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+static void memcg_wait_oom_recover(struct mem_cgroup *memcg)
 {
 	struct oom_wait_info owait;
-	bool locked, need_to_kill;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
-	need_to_kill = true;
-	mem_cgroup_mark_under_oom(memcg);
 
-	/* At first, try to OOM lock hierarchy under memcg.*/
-	spin_lock(&memcg_oom_lock);
-	locked = mem_cgroup_oom_lock(memcg);
-	/*
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
-	 */
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	if (!locked || memcg->oom_kill_disable)
-		need_to_kill = false;
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-	spin_unlock(&memcg_oom_lock);
+	schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
 
-	if (need_to_kill) {
-		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-	} else {
-		schedule();
-		finish_wait(&memcg_oom_waitq, &owait.wait);
-	}
-	spin_lock(&memcg_oom_lock);
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
 	memcg_wakeup_oom(memcg);
-	spin_unlock(&memcg_oom_lock);
+}
 
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+				  int order)
+{
+	mem_cgroup_mark_under_oom(memcg);
+	if (oom_trylock(memcg)) {
+		mem_cgroup_oom_notify(memcg);
+		if (memcg->oom_kill_disable)
+			memcg_wait_oom_recover(memcg);
+		else
+			mem_cgroup_out_of_memory(memcg, mask, order);
+		oom_unlock(memcg);
+	}
 	mem_cgroup_unmark_under_oom(memcg);
 
 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
 		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
 	return true;
 }
 
@@ -6454,6 +6395,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
+	init_oom_context(&memcg->oom_ctx);
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
 	INIT_LIST_HEAD(&memcg->kmemcg_sharers);
@@ -6540,6 +6482,14 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
+
+	/*
+	 * A cgroup can be destroyed while somebody is waiting for its
+	 * oom context, in which case the context will never be unlocked
+	 * from oom_unlock, because the latter only iterates over live
+	 * cgroups. So we need to release the context here.
+	 */
+	release_oom_context(&memcg->oom_ctx);
 }
 
 static void mem_cgroup_css_free(struct cgroup *cont)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7dfbcb6ffb80..c9265092825d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,30 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+static DEFINE_SPINLOCK(oom_context_lock);
+
+#ifndef CONFIG_MEMCG
+struct oom_context oom_ctx = {
+	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
+};
+#endif
+
+void init_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	init_waitqueue_head(&ctx->waitq);
+}
+
+void release_oom_context(struct oom_context *ctx)
+{
+	spin_lock(&oom_context_lock);
+	if (ctx->owner || ctx->victim)
+		wake_up_all(&ctx->waitq);
+	ctx->owner = ctx->victim = NULL;
+	spin_unlock(&oom_context_lock);
+}
 
 #ifdef CONFIG_NUMA
 /**
@@ -414,6 +437,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
  */
 void mark_oom_victim(struct task_struct *tsk)
 {
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+
 	set_tsk_thread_flag(tsk, TIF_MEMDIE);
 
 	/*
@@ -423,6 +449,19 @@ void mark_oom_victim(struct task_struct *tsk)
 	 * that TIF_MEMDIE tasks should be ignored.
 	 */
 	__thaw_task(tsk);
+
+	/*
+	 * Record the pointer to the victim in the oom context of the
+	 * owner memcg. It will be used in exit_oom_victim in order to
+	 * wake up all processes waiting for the oom victim to exit.
+	 */
+	memcg = try_get_mem_cgroup_from_mm(tsk->mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	spin_lock(&oom_context_lock);
+	if (!ctx->victim)
+		ctx->victim = tsk;
+	spin_unlock(&oom_context_lock);
+	mem_cgroup_put(memcg);
 }
 
 /**
@@ -430,7 +469,25 @@ void mark_oom_victim(struct task_struct *tsk)
  */
 void exit_oom_victim(void)
 {
+	struct mem_cgroup *iter;
+	struct oom_context *ctx;
+
 	clear_thread_flag(TIF_MEMDIE);
+
+	/*
+	 * Wake up every process waiting for this oom victim to exit.
+	 */
+	spin_lock(&oom_context_lock);
+	iter = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->victim == current) {
+			ctx->victim = NULL;
+			if (!ctx->owner)
+				wake_up_all(&ctx->waitq);
+		}
+	} while ((iter = mem_cgroup_iter(NULL, iter, NULL)));
+	spin_unlock(&oom_context_lock);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -586,54 +643,99 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+bool oom_trylock(struct mem_cgroup *memcg)
 {
-	struct zoneref *z;
-	struct zone *zone;
-	int ret = 1;
+	struct mem_cgroup *iter;
+	struct oom_context *ctx;
+	DEFINE_WAIT(wait);
 
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		if (zone_is_oom_locked(zone)) {
-			ret = 0;
-			goto out;
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Check if oom context of any of our descendants is active,
+	 * i.e. if there is a process selecting a victim or a victim is
+	 * dying. If there is, wait for it to finish, otherwise proceed
+	 * to oom.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner || ctx->victim) {
+			prepare_to_wait(&ctx->waitq, &wait,
+					TASK_KILLABLE);
+			spin_unlock(&oom_context_lock);
+			schedule();
+			finish_wait(&ctx->waitq, &wait);
+			mem_cgroup_iter_break(memcg, iter);
+			return false;
 		}
-	}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		/*
-		 * Lock each zone in the zonelist under zone_scan_lock so a
-		 * parallel invocation of try_set_zonelist_oom() doesn't succeed
-		 * when it shouldn't.
-		 */
-		zone_set_flag(zone, ZONE_OOM_LOCKED);
-	}
+	/*
+	 * Acquire oom context of all descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner);
+		BUG_ON(ctx->victim);
+		ctx->owner = current;
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
-out:
-	spin_unlock(&zone_scan_lock);
-	return ret;
+	spin_unlock(&oom_context_lock);
+
+	return true;
 }
 
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+void oom_unlock(struct mem_cgroup *memcg)
 {
-	struct zoneref *z;
-	struct zone *zone;
+	struct mem_cgroup *iter, *victim_memcg = NULL;
+	struct oom_context *ctx;
+	bool need_to_wait = false;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&oom_context_lock);
+
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner != current)
+			continue;
+
+		ctx->owner = NULL;
+		if (!ctx->victim) {
+			/*
+			 * Victim already exited or nobody was killed?
+			 * It's our responsibility to wake up blocked
+			 * processes then.
+			 */
+			wake_up_all(&ctx->waitq);
+			continue;
+		}
+
+		/* Already waiting? */
+		if (need_to_wait)
+			continue;
+		/*
+		 * Remember victim memcg so that we can wait for victim
+		 * to exit below.
+		 */
+		victim_memcg = iter;
+		mem_cgroup_get(iter);
 
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		zone_clear_flag(zone, ZONE_OOM_LOCKED);
+		prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+		need_to_wait = true;
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	spin_unlock(&oom_context_lock);
+
+	if (need_to_wait) {
+		schedule();
+		ctx = mem_cgroup_oom_context(victim_memcg);
+		finish_wait(&ctx->waitq, &wait);
+		mem_cgroup_put(victim_memcg);
+		return;
 	}
-	spin_unlock(&zone_scan_lock);
 }
 
 /**
@@ -658,7 +760,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	unsigned long freed = 0;
 	unsigned int uninitialized_var(points);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-	int killed = 0;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
@@ -695,7 +796,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
 				 nodemask,
 				 "Out of memory (oom_kill_allocating_task)");
-		goto out;
+		return;
 	}
 
 	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
@@ -707,15 +808,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (PTR_ERR(p) != -1UL) {
 		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
 				 nodemask, "Out of memory");
-		killed = 1;
 	}
-out:
-	/*
-	 * Give the killed threads a good chance of exiting before trying to
-	 * allocate memory again.
-	 */
-	if (killed)
-		schedule_timeout_killable(1);
 }
 
 /*
@@ -725,11 +818,8 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist = node_zonelist(first_online_node,
-						  GFP_KERNEL);
-
-	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+	if (oom_trylock(NULL)) {
 		out_of_memory(NULL, 0, 0, NULL, false);
-		clear_zonelist_oom(zonelist, GFP_KERNEL);
+		oom_unlock(NULL);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c479b9aa36bd..f70c5f4da2a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2157,10 +2157,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
-		schedule_timeout_uninterruptible(1);
+	if (!oom_trylock(NULL))
 		return NULL;
-	}
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -2195,7 +2193,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_unlock(NULL);
 	return page;
 }
 
-- 
2.1.4