[Devel] [PATCH RH8] oom: Scan only the relevant memcg when in berserker mode

Konstantin Khorenko khorenko at virtuozzo.com
Thu Aug 12 14:11:00 MSK 2021


On 28.07.2021 23:04, Evgenii Shatokhin wrote:
> Even if an OOM happened in a memory cgroup, oom_berserker() iterated
> over all tasks in the system and called task_in_mem_cgroup() for each
> one to find the tasks from the necessary memcg.
> 
> It is not needed to scan all the tasks though. select_bad_process() and
> other functions already rely on mem_cgroup_scan_tasks() to iterate over
> the tasks from a given memory cgroup only.
> 
> Now, oom_berserker() was revisited to make use of that too.
> task_in_mem_cgroup() was no longer needed after that and was removed.
> 
> There is one potential disadvantage with this approach though. Walking
> through the whole list of tasks in the system allowed oom_berserker() to
> pick the youngest tasks in the memcg which were no better than the OOM
> killer's previous target. mem_cgroup_scan_tasks(), however, iterates over
> the tasks in an unspecified order. It seems, it processes the cgroups
> from parent to children and the tasks in each cgroup - from the oldest
> added to the newest added ones, but this could change in the future.
> This way, oom_berserker() can select the tasks which are no better than
> the target, but not the youngest ones.

Well, after discussions we decided to postpone this patch until we have
a complaint about slow berserker mode and find out that the reason for it
is exactly the time spent walking through all tasks of the Node.

Why? Because when fighting fork bombs, the age of the task is very important:
if we ignore it, it is very likely that we will kill all other processes in the
Container before we actually kill the fork bomb (simply because fork bomb
processes are likely very small, smaller than other normal processes).

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

> Not sure if this is a real problem though. oom_berserker() triggers only
> when the normal OOM killer kills tasks very often. If some system task
> eats up lots of memory and gets shot as a result, then there is
> something wrong with that task already. It would likely be shot by the
> normal OOM killer anyway, but a bit later.
> 
> https://jira.sw.ru/browse/PSBM-131984
> Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
> ---
>   include/linux/memcontrol.h |   7 --
>   mm/memcontrol.c            |  26 ------
>   mm/oom_kill.c              | 164 +++++++++++++++++++++----------------
>   3 files changed, 95 insertions(+), 102 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5458f18c0ae7..c6244a62f376 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -551,7 +551,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
>   
>   struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>   
> -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
>   struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
>   
>   struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
> @@ -1096,12 +1095,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
>   	return true;
>   }
>   
> -static inline bool task_in_mem_cgroup(struct task_struct *task,
> -				      const struct mem_cgroup *memcg)
> -{
> -	return true;
> -}
> -
>   static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
>   {
>   	return NULL;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f9a1c3c4f85e..424bc1e52b3b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1462,32 +1462,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
>   		*lru_size += nr_pages;
>   }
>   
> -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *task_memcg;
> -	struct task_struct *p;
> -	bool ret;
> -
> -	p = find_lock_task_mm(task);
> -	if (p) {
> -		task_memcg = get_mem_cgroup_from_mm(p->mm);
> -		task_unlock(p);
> -	} else {
> -		/*
> -		 * All threads may have already detached their mm's, but the oom
> -		 * killer still needs to detect if they have already been oom
> -		 * killed to prevent needlessly killing additional tasks.
> -		 */
> -		rcu_read_lock();
> -		task_memcg = mem_cgroup_from_task(task);
> -		css_get(&task_memcg->css);
> -		rcu_read_unlock();
> -	}
> -	ret = mem_cgroup_is_descendant(task_memcg, memcg);
> -	css_put(&task_memcg->css);
> -	return ret;
> -}
> -
>   #ifdef CONFIG_CLEANCACHE
>   bool mem_cgroup_cleancache_disabled(struct page *page)
>   {
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index a64a9ff7391b..382a85b64c1f 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -951,21 +951,97 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
>   	return 0;
>   }
>   
> +/*
> + * The data used when iterating over tasks in "rage/berserker" mode.
> + */
> +struct rage_kill_data {
> +	struct oom_control *oc;
> +
> +	/* current level of berserker rage */
> +	int rage;
> +
> +	/* number of tasks killed so far */
> +	int killed;
> +};
> +
> +/*
> + * Returns 1 if enough tasks have been killed, 0 otherwise.
> + */
> +static int
> +maybe_rage_kill(struct task_struct *p, void *arg)
> +{
> +	static DEFINE_RATELIMIT_STATE(berserker_rs,
> +				      DEFAULT_RATELIMIT_INTERVAL,
> +				      DEFAULT_RATELIMIT_BURST);
> +	struct rage_kill_data *rkd = arg;
> +	struct oom_control *oc = rkd->oc;
> +	unsigned long tsk_points;
> +	unsigned long tsk_overdraft;
> +
> +	if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
> +		fatal_signal_pending(p) || p->flags & PF_EXITING ||
> +		oom_unkillable_task(p))
> +		return 0;
> +
> +	/* p may not have freeable memory in nodemask */
> +	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> +		return 0;
> +
> +	tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
> +	if (tsk_overdraft < oc->overdraft)
> +		return 0;
> +
> +	/*
> +	 * oom_badness never returns a negative value, even if
> +	 * oom_score_adj would make badness so, instead it
> +	 * returns 1. So we do not kill task with badness 1 if
> +	 * the victim has badness > 1 so as not to risk killing
> +	 * protected tasks.
> +	 */
> +	if (tsk_points <= 1 && oc->chosen_points > 1)
> +		return 0;
> +
> +	/*
> +	 * Consider tasks as equally bad if they occupy equal
> +	 * percentage of available memory.
> +	 */
> +	if (tsk_points * 100 / oc->totalpages <
> +		oc->chosen_points * 100 / oc->totalpages)
> +		return 0;
> +
> +	if (__ratelimit(&berserker_rs)) {
> +		task_lock(p);
> +		pr_err("Rage kill process %d (%s)\n",
> +			task_pid_nr(p), p->comm);
> +		task_unlock(p);
> +	}
> +
> +	count_vm_event(OOM_KILL);
> +	memcg_memory_event((is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup),
> +			   MEMCG_OOM_KILL);
> +
> +	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
> +
> +	++rkd->killed;
> +	if (rkd->killed >= 1 << rkd->rage)
> +		return 1;
> +	return 0;
> +}
> +
>   /*
>    * Kill more processes if oom happens too often in this context.
>    */
>   static void oom_berserker(struct oom_control *oc)
>   {
> -	static DEFINE_RATELIMIT_STATE(berserker_rs,
> -				DEFAULT_RATELIMIT_INTERVAL,
> -				DEFAULT_RATELIMIT_BURST);
>   	struct task_struct *p;
>   	struct mem_cgroup *memcg;
>   	unsigned long now = jiffies;
> -	int rage;
> -	int killed = 0;
> +	struct rage_kill_data rkd = {
> +		.oc = oc,
> +		.killed = 0,
> +	};
>   
> -	memcg = oc->memcg ?: root_mem_cgroup;
> +	memcg = is_memcg_oom(oc) ? oc->memcg : root_mem_cgroup;
>   
>   	spin_lock(&memcg->oom_rage_lock);
>   	memcg->prev_oom_time = memcg->oom_time;
> @@ -983,77 +1059,27 @@ static void oom_berserker(struct oom_control *oc)
>   		memcg->oom_rage = OOM_BASE_RAGE;
>   	else if (memcg->oom_rage < OOM_MAX_RAGE)
>   		memcg->oom_rage++;
> -	rage = memcg->oom_rage;
> +	rkd.rage = memcg->oom_rage;
>   	spin_unlock(&memcg->oom_rage_lock);
>   
> -	if (rage < 0)
> +	if (rkd.rage < 0)
>   		return;
>   
>   	/*
> -	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
> -	 * as bad as the victim.
> +	 * So, we are in rage. Kill (1 << rage) tasks that are as bad as
> +	 * the victim.
>   	 */
> -	read_lock(&tasklist_lock);
> -	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
> -		unsigned long tsk_points;
> -		unsigned long tsk_overdraft;
> -
> -		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
> -			fatal_signal_pending(p) || p->flags & PF_EXITING ||
> -			oom_unkillable_task(p))
> -			continue;
> -
> -		/*
> -		 * When mem_cgroup_out_of_memory() and
> -		 * p is not member of the group.
> -		 */
> -		if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg))
> -			continue;
> -
> -		/* p may not have freeable memory in nodemask */
> -		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> -			continue;
> -
> -		tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft);
> -		if (tsk_overdraft < oc->overdraft)
> -			continue;
> -
> -		/*
> -		 * oom_badness never returns a negative value, even if
> -		 * oom_score_adj would make badness so, instead it
> -		 * returns 1. So we do not kill task with badness 1 if
> -		 * the victim has badness > 1 so as not to risk killing
> -		 * protected tasks.
> -		 */
> -		if (tsk_points <= 1 && oc->chosen_points > 1)
> -			continue;
> -
> -		/*
> -		 * Consider tasks as equally bad if they occupy equal
> -		 * percentage of available memory.
> -		 */
> -		if (tsk_points * 100 / oc->totalpages <
> -			oc->chosen_points * 100 / oc->totalpages)
> -			continue;
> -
> -		if (__ratelimit(&berserker_rs)) {
> -			task_lock(p);
> -			pr_err("Rage kill process %d (%s)\n",
> -				task_pid_nr(p), p->comm);
> -			task_unlock(p);
> -		}
> -
> -		count_vm_event(OOM_KILL);
> -		memcg_memory_event(memcg, MEMCG_OOM_KILL);
> -
> -		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
> -
> -		if (++killed >= 1 << rage)
> -			break;
> +	if (is_memcg_oom(oc)) {
> +		mem_cgroup_scan_tasks(oc->memcg, maybe_rage_kill, &rkd);
> +	} else {
> +		read_lock(&tasklist_lock);
> +		list_for_each_entry_reverse(p, &init_task.tasks, tasks)
> +			if (maybe_rage_kill(p, &rkd))
> +				break;
> +		read_unlock(&tasklist_lock);
>   	}
> -	read_unlock(&tasklist_lock);
>   
> -	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
> +	pr_err("OOM killer in rage %d: %d tasks killed\n", rkd.rage, rkd.killed);
>   }
>   
>   static void oom_kill_process(struct oom_control *oc, const char *message)
> 




More information about the Devel mailing list