[Devel] [PATCH 11/17] oom: introduce oom kill timeout

Vladimir Davydov vdavydov at parallels.com
Fri Aug 14 10:03:35 PDT 2015


Currently, we won't select a new oom victim until the previous one has
passed away. This might lead to a deadlock if an allocating task holds a
lock needed by the victim to complete. To cope with this problem, this
patch introduces an OOM kill timeout, after which a new victim will be
selected even if the previous one hasn't died. The timeout can be
configured via sysctl vm.oom_timeout and defaults to 5 seconds.

https://jira.sw.ru/browse/PSBM-38581

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/oom.h   |  3 +++
 include/linux/sched.h |  2 ++
 kernel/sysctl.c       |  7 +++++++
 mm/memcontrol.c       |  2 +-
 mm/oom_kill.c         | 26 +++++++++++++++++++++-----
 5 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 201aa1ab1dd2..f16e35bee7d9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -52,6 +52,8 @@ static inline bool oom_task_origin(const struct task_struct *p)
 /* linux/mm/oom_group.c */
 extern int get_task_oom_score_adj(struct task_struct *t);
 
+extern void set_tsk_memdie(struct task_struct *p);
+
 extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 			  const nodemask_t *nodemask, unsigned long totalpages);
 extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -92,4 +94,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
+extern int sysctl_oom_timeout;
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 12dc066e0681..ac5f9f2ba4b3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1582,6 +1582,8 @@ struct task_struct {
 	unsigned int	sequential_io_avg;
 #endif
 
+	unsigned long memdie_start;
+
 	/* reserved for Red Hat */
 #ifdef CONFIG_DETECT_HUNG_TASK
 	RH_KABI_USE(1, unsigned long last_switch_count)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8478a1e762d5..5a3ff6cb15fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1184,6 +1184,13 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "oom_timeout",
+		.data		= &sysctl_oom_timeout,
+		.maxlen		= sizeof(sysctl_oom_timeout),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
 		.maxlen		= sizeof(sysctl_overcommit_ratio),
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 47d859c6cb6f..b27b4148600b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1916,7 +1916,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * quickly exit and free its memory.
 	 */
 	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
-		set_thread_flag(TIF_MEMDIE);
+		set_tsk_memdie(current);
 		return;
 	}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ca765a82fa1a..0d569020390a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,6 +42,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
+int sysctl_oom_timeout = 5 * HZ;
 static DEFINE_SPINLOCK(zone_scan_lock);
 
 #ifdef CONFIG_NUMA
@@ -270,6 +271,10 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
 		if (unlikely(frozen(task)))
 			__thaw_task(task);
+		smp_rmb(); /* matches smp_wmb() in set_tsk_memdie() */
+		if (time_after(jiffies, task->memdie_start +
+					sysctl_oom_timeout))
+			return OOM_SCAN_CONTINUE;
 		if (!force_kill)
 			return OOM_SCAN_ABORT;
 	}
@@ -283,13 +288,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task->flags & PF_EXITING && !force_kill) {
+	if (task->flags & PF_EXITING) {
 		/*
 		 * If this task is not being ptraced on exit, then wait for it
 		 * to finish before killing some other task unnecessarily.
 		 */
 		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
-			return OOM_SCAN_ABORT;
+			return OOM_SCAN_SELECT;
 	}
 	return OOM_SCAN_OK;
 }
@@ -412,6 +417,15 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 		dump_tasks(memcg, nodemask);
 }
 
+void set_tsk_memdie(struct task_struct *p)
+{
+	if (!test_tsk_thread_flag(p, TIF_MEMDIE)) {
+		p->memdie_start = jiffies;
+		smp_wmb(); /* matches smp_rmb() in oom_scan_process_thread() */
+		set_tsk_thread_flag(p, TIF_MEMDIE);
+	}
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
@@ -435,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
 	if (p->flags & PF_EXITING) {
-		set_tsk_thread_flag(p, TIF_MEMDIE);
+		set_tsk_memdie(p);
 		goto out;
 	}
 
@@ -460,6 +474,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 			if (child->mm == p->mm)
 				continue;
+			if (test_tsk_thread_flag(child, TIF_MEMDIE))
+				continue;
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
@@ -518,7 +534,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 		}
 	rcu_read_unlock();
 
-	set_tsk_thread_flag(victim, TIF_MEMDIE);
+	set_tsk_memdie(victim);
 	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
 	mem_cgroup_note_oom_kill(memcg, victim);
 out:
@@ -649,7 +665,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * quickly exit and free its memory.
 	 */
 	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
-		set_thread_flag(TIF_MEMDIE);
+		set_tsk_memdie(current);
 		boost_dying_task(current);
 		return;
 	}
-- 
2.1.4




More information about the Devel mailing list