[Devel] [PATCH RHEL8 COMMIT] ms/memcg: prohibit unconditional exceeding the limit of dying tasks

Konstantin Khorenko khorenko at virtuozzo.com
Thu Sep 23 19:26:48 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-305.3.1.vz8.7.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-305.3.1.vz8.7.13
------>
commit ded8b4372053eb4abb997daf86dc3ac39f31465d
Author: Vasily Averin <vvs at virtuozzo.com>
Date:   Thu Sep 23 19:26:48 2021 +0300

    ms/memcg: prohibit unconditional exceeding the limit of dying tasks
    
    The kernel currently allows dying tasks to exceed the memcg limits.
    The allocation is expected to be the last one, and the occupied memory
    is expected to be freed soon.
    This is not always true: the charge can be part of a huge vmalloc
    allocation, and once such a charge is allowed, it will be repeated
    over and over again. Moreover, the lifetime of the allocated object
    can differ from the lifetime of the dying task.
    Multiple such allocations running concurrently can not only overuse
    the memcg limit, but can also lead to a global out-of-memory condition
    and, in the worst case, cause the host to panic.
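
    To make the failure mode concrete, here is a minimal userspace sketch
    (not kernel code; LIMIT_PAGES, charge() and task_dying are names
    invented for this illustration) of how the old force-charge path lets
    a dying task overshoot the limit by an amount bounded only by the size
    of the allocation, because a huge vmalloc is charged page by page:

        #include <stdio.h>
        #include <stdbool.h>

        #define LIMIT_PAGES   1024          /* memcg limit, in pages      */
        #define VMALLOC_PAGES (256 * 1024)  /* one huge vmalloc, in pages */

        static long usage;                  /* pages currently charged    */

        /* Old behaviour: a dying task is always allowed to charge. */
        static bool charge(long nr_pages, bool task_dying)
        {
            if (usage + nr_pages <= LIMIT_PAGES || task_dying) {
                usage += nr_pages;
                return true;
            }
            return false;                   /* would reclaim or OOM here  */
        }

        int main(void)
        {
            /* A dying task charges a huge vmalloc one page at a time:
             * every page bypasses the limit, so the overshoot grows with
             * the allocation size, not with the limit. */
            for (long i = 0; i < VMALLOC_PAGES; i++)
                charge(1, true);

            printf("limit %d, usage %ld, overshoot %ld pages\n",
                   LIMIT_PAGES, usage, usage - LIMIT_PAGES);
            return 0;
        }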
    
    This patch removes the checks that let dying tasks forcibly exceed
    the memcg limit. It also breaks the endless retry loop for tasks
    bypassed by the OOM killer. In addition, it renames the
    should_force_charge() helper to task_is_dying(), since its use no
    longer leads to a forced charge.
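
    Below is a simplified userspace sketch of the resulting retry logic
    (the real change is in try_charge() in the diff below; over_limit()
    and memcg_oom() are stand-ins invented for the illustration). After
    one pass through the memcg OOM killer, a dying task now gets -ENOMEM
    instead of a forced charge, so the retry loop terminates and the
    overshoot stays bounded:

        #include <stdio.h>
        #include <stdbool.h>
        #include <errno.h>

        #define LIMIT_PAGES 1024
        static long usage;

        static bool over_limit(long nr) { return usage + nr > LIMIT_PAGES; }

        /* Stand-in for mem_cgroup_oom(): assume the chosen victim never
         * actually frees memory, the exact case this patch worries about. */
        static bool memcg_oom(void) { return true; /* OOM_SUCCESS */ }

        /* New behaviour, mirroring the patched try_charge() flow. */
        static int try_charge(long nr_pages, bool task_dying)
        {
            bool passed_oom = false;

        retry:
            if (!over_limit(nr_pages)) {
                usage += nr_pages;
                return 0;
            }
            /* reclaim attempts would go here ... */

            /* Avoid endless loop for tasks bypassed by the oom killer */
            if (passed_oom && task_dying)
                return -ENOMEM;

            if (memcg_oom()) {              /* OOM_SUCCESS */
                passed_oom = true;
                goto retry;
            }
            return -ENOMEM;
        }

        int main(void)
        {
            usage = LIMIT_PAGES;    /* the memcg is already at its limit */
            printf("charge for dying task: %d\n", try_charge(1, true));
            printf("usage stays at %ld pages\n", usage);
            return 0;
        }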
    
    Suggested-by: Michal Hocko <mhocko at suse.com>
    Signed-off-by: Vasily Averin <vvs at virtuozzo.com>
    
    [backported upstream patch version]
    https://lkml.org/lkml/2021/9/14/438
    https://jira.sw.ru/browse/PSBM-132705
    
    Signed-off-by: Vasily Averin <vvs at virtuozzo.com>
---
 mm/memcontrol.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3f6f59ac8746..fbbad73acb81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,7 +250,7 @@ enum res_type {
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 		(current->flags & PF_EXITING);
@@ -1842,7 +1842,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * A few threads which were not waiting at mutex_lock_killable() can
 	 * fail to bail out. Therefore, check again after holding oom_lock.
 	 */
-	ret = should_force_charge() || out_of_memory(&oc);
+	ret = task_is_dying() || out_of_memory(&oc);
 
 unlock:
 	mutex_unlock(&oom_lock);
@@ -2846,6 +2846,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
 	bool kmem_limit = false;
+	bool passed_oom = false;
 	bool may_swap = true;
 	bool drained = false;
 	enum oom_status oom_status;
@@ -2904,15 +2905,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	if (gfp_mask & __GFP_ATOMIC)
 		goto force;
 
-	/*
-	 * Unlike in global OOM situations, memcg is not in a physical
-	 * memory shortage.  Allow dying and OOM-killed tasks to
-	 * bypass the last charges so that they can exit quickly and
-	 * free their memory.
-	 */
-	if (unlikely(should_force_charge()))
-		goto force;
-
 	/*
 	 * Prevent unbounded recursion when reclaim operations need to
 	 * allocate memory. This might exceed the limits temporarily,
@@ -2971,8 +2963,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	if (gfp_mask & __GFP_NOFAIL)
 		goto force;
 
-	if (fatal_signal_pending(current))
-		goto force;
+	/* Avoid endless loop for tasks bypassed by the oom killer */
+	if (passed_oom && task_is_dying())
+		goto nomem;
 
 	/*
 	 * We might have [a lot of] reclaimable kmem which we cannot reclaim in
@@ -2991,14 +2984,10 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	 */
 	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
 		       get_order(nr_pages * PAGE_SIZE), kmem_limit);
-	switch (oom_status) {
-	case OOM_SUCCESS:
+	if (oom_status == OOM_SUCCESS) {
+		passed_oom = true;
 		nr_retries = MAX_RECLAIM_RETRIES;
 		goto retry;
-	case OOM_FAILED:
-		goto force;
-	default:
-		goto nomem;
 	}
 nomem:
 	if (!(gfp_mask & __GFP_NOFAIL))

