[Devel] [PATCH rh7 4/6] oom: introduce oom timeout

Fri Sep 11 06:41:57 PDT 2015

On 11.09.2015 12:54, Vladimir Davydov wrote:
> Currently, we won't select a new oom victim until the previous one has
> passed away. This might lead to a deadlock if an allocating task holds a
> lock needed by the victim to complete. To cope with this problem, this
> patch introduced oom timeout, after which a new task will be selected
> even if the previous victim hasn't died. The timeout is hard-coded,
> equals 5 seconds.
> 
> https://jira.sw.ru/browse/PSBM-38581
> 
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
> ---
>  include/linux/oom.h |  1 +
>  mm/oom_kill.c       | 48 +++++++++++++++++++++++++++++++++++++++---------
>  2 files changed, 40 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index e19385dd29aa..caeda102e7ba 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -34,6 +34,7 @@ enum oom_scan_t {
>  struct oom_context {
>  	struct task_struct *owner;
>  	struct task_struct *victim;
> +	unsigned long oom_start;
>  	wait_queue_head_t waitq;
>  };
>  
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index c9265092825d..bf101f9242ff 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -45,6 +45,8 @@ int sysctl_oom_dump_tasks;
>  
>  static DEFINE_SPINLOCK(oom_context_lock);
>  
> +#define OOM_TIMEOUT	(5 * HZ)
> +
>  #ifndef CONFIG_MEMCG
>  struct oom_context oom_ctx = {
>  	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
> @@ -55,6 +57,7 @@ void init_oom_context(struct oom_context *ctx)
>  {
>  	ctx->owner = NULL;
>  	ctx->victim = NULL;
> +	ctx->oom_start = 0;
>  	init_waitqueue_head(&ctx->waitq);
>  }
>  
> @@ -286,11 +289,14 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
>  
>  	/*
>  	 * This task already has access to memory reserves and is being killed.
> -	 * Don't allow any other task to have access to the reserves.
> +	 * Try to select another one.
> +	 *
> +	 * This can only happen if oom_trylock timeout-ed, which most probably
> +	 * means that the victim had dead-locked.
>  	 */
>  	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
>  		if (!force_kill)
> -			return OOM_SCAN_ABORT;
> +			return OOM_SCAN_CONTINUE;
>  	}
>  	if (!task->mm)
>  		return OOM_SCAN_CONTINUE;
> @@ -645,6 +651,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
>  
>  bool oom_trylock(struct mem_cgroup *memcg)
>  {
> +	unsigned long now = jiffies;
>  	struct mem_cgroup *iter;
>  	struct oom_context *ctx;
>  	DEFINE_WAIT(wait);
> @@ -660,14 +667,29 @@ bool oom_trylock(struct mem_cgroup *memcg)
>  	iter = mem_cgroup_iter(memcg, NULL, NULL);
>  	do {
>  		ctx = mem_cgroup_oom_context(iter);
> -		if (ctx->owner || ctx->victim) {
> +		if ((ctx->owner || ctx->victim) &&
> +		    time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
>  			prepare_to_wait(&ctx->waitq, &wait,
>  					TASK_KILLABLE);
>  			spin_unlock(&oom_context_lock);
> -			schedule();
> +			schedule_timeout(ctx->oom_start + OOM_TIMEOUT - now);
>  			finish_wait(&ctx->waitq, &wait);
>  			mem_cgroup_iter_break(memcg, iter);
>  			return false;
> +		} else if (ctx->owner || ctx->victim) {
> +			/*
> +			 * Timeout. Release the context.
> +			 */
> +			struct task_struct *p = ctx->victim ?: ctx->owner;
> +
> +			task_lock(p);
> +			pr_err("OOM kill timeout: %d (%s)\n",
> +			       task_pid_nr(p), p->comm);
> +			task_unlock(p);
> +			show_stack(p, NULL);
> +
> +			ctx->owner = ctx->victim = NULL;
> +			wake_up_all(&ctx->waitq);
>  		}
>  	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
>  
> @@ -680,6 +702,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
>  		BUG_ON(ctx->owner);
>  		BUG_ON(ctx->victim);
>  		ctx->owner = current;
> +		ctx->oom_start = now;
>  	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
>  
>  	spin_unlock(&oom_context_lock);
> @@ -689,9 +712,10 @@ bool oom_trylock(struct mem_cgroup *memcg)
>  
>  void oom_unlock(struct mem_cgroup *memcg)
>  {
> +	unsigned long now = jiffies;
> +	unsigned long timeout = 0;
>  	struct mem_cgroup *iter, *victim_memcg = NULL;
>  	struct oom_context *ctx;
> -	bool need_to_wait = false;
>  	DEFINE_WAIT(wait);
>  
>  	spin_lock(&oom_context_lock);
> @@ -714,8 +738,15 @@ void oom_unlock(struct mem_cgroup *memcg)
>  		}
>  
>  		/* Already waiting? */
> -		if (need_to_wait)
> +		if (timeout > 0)
>  			continue;
> +		/* Expired? */
> +		if (time_after(now, ctx->oom_start + OOM_TIMEOUT))
> +			continue;
> +
> +		timeout = ctx->oom_start + OOM_TIMEOUT - now;
> +		BUG_ON(timeout == 0);

now may be less or more than (oom_start + OOM_TIMEOUT), so why can't
it be equal? Did you mean time_after_eq() above?

> +
>  		/*
>  		 * Remember victim memcg so that we can wait for victim
>  		 * to exit below.
> @@ -724,13 +755,12 @@ void oom_unlock(struct mem_cgroup *memcg)
>  		mem_cgroup_get(iter);
>  
>  		prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
> -		need_to_wait = true;
>  	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
>  
>  	spin_unlock(&oom_context_lock);
>  
> -	if (need_to_wait) {
> -		schedule();
> +	if (timeout > 0) {
> +		schedule_timeout(timeout);
>  		ctx = mem_cgroup_oom_context(victim_memcg);
>  		finish_wait(&ctx->waitq, &wait);
>  		mem_cgroup_put(victim_memcg);
>