[Devel] [PATCH RHEL7 COMMIT] mm: memcontrol: fix race between kmem uncharge and charge reparenting

Vladimir Davydov vdavydov at virtuozzo.com
Mon Jun 20 09:59:38 PDT 2016


The commit is pushed to "branch-rh7-3.10.0-327.18.2.vz7.14.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.18.2.vz7.14.16
------>
commit 35c0d2a992aaa399cccaee2fc9f3ed6879840dd4
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date:   Mon Jun 20 20:59:38 2016 +0400

    mm: memcontrol: fix race between kmem uncharge and charge reparenting
    
    When a cgroup is destroyed, all user memory pages get recharged to the
    parent cgroup. Recharging is done by mem_cgroup_reparent_charges which
    keeps looping until res <= kmem. This is supposed to guarantee that by
    the time the cgroup gets released, no pages are charged to it. However, the
    guarantee might be violated in case mem_cgroup_reparent_charges races
    with kmem charge or uncharge.
    
    Currently, kmem is charged before res and uncharged after. As a result,
    kmem might become greater than res for a short period of time even if
    there are still user memory pages charged to the cgroup. In this case
    mem_cgroup_reparent_charges will give up prematurely, and the cgroup
    might be released even though there are still pages charged to it. Uncharge
    of such a page will trigger kernel panic:
    
      general protection fault: 0000 [#1] SMP
      CPU: 0 PID: 972445 Comm: httpd ve: 0 Tainted: G           OE  ------------   3.10.0-427.10.1.lve1.4.9.el7.x86_64 #1 12.14
      task: ffff88065d53d8d0 ti: ffff880224f34000 task.ti: ffff880224f34000
      RIP: 0010:[<ffffffff811e7733>]  [<ffffffff811e7733>] mem_cgroup_charge_statistics.isra.16+0x13/0x60
      RSP: 0018:ffff880224f37a80  EFLAGS: 00010202
      RAX: ffffffffffffffff RBX: ffff8807b26f0110 RCX: 00000000ffffffff
      RDX: 79726f6765746163 RSI: ffffea000c9c0440 RDI: ffff8806a55662f8
      RBP: ffff880224f37a80 R08: 0000000000000000 R09: 0000000003808000
      R10: 00000000000000b8 R11: ffffea001eaa8980 R12: ffffea000c9c0440
      R13: 0000000000000001 R14: 0000000000000000 R15: ffff8806a5566000
      FS:  0000000000000000(0000) GS:ffff8807d4000000(0000) knlGS:0000000000000000
      CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      CR2: 00007f54289bd74c CR3: 00000006638b1000 CR4: 00000000000006f0
      DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
      DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
      Stack:
       ffff880224f37ac0 ffffffff811e9ddf ffff880600000001 ffffea000c9c0440
       0000000000000001 00000000037d1000 ffff880224f37c78 0000000003800000
       ffff880224f37ad0 ffffffff811ee99a ffff880224f37b08 ffffffff811b9ec9
      Call Trace:
       [<ffffffff811e9ddf>] __mem_cgroup_uncharge_common+0xcf/0x320
       [<ffffffff811ee99a>] mem_cgroup_uncharge_page+0x2a/0x30
       [<ffffffff811b9ec9>] page_remove_rmap+0xb9/0x160
       [<ffffffff81114e73>] ? res_counter_uncharge+0x13/0x20
       [<ffffffff811ab580>] unmap_page_range+0x460/0x870
       [<ffffffff811aba11>] unmap_single_vma+0x81/0xf0
       [<ffffffff811ace79>] unmap_vmas+0x49/0x90
       [<ffffffff811b663c>] exit_mmap+0xac/0x1a0
       [<ffffffff8107853b>] mmput+0x6b/0x140
       [<ffffffff81202547>] flush_old_exec+0x467/0x8d0
       [<ffffffff8125883c>] load_elf_binary+0x33c/0xde0
       [<ffffffff811afe02>] ? get_user_pages+0x52/0x60
       [<ffffffff81258500>] ? load_elf_library+0x220/0x220
       [<ffffffff81201c25>] search_binary_handler+0xd5/0x300
       [<ffffffff812032b7>] do_execve_common.isra.26+0x657/0x720
       [<ffffffff81203619>] SyS_execve+0x29/0x30
       [<ffffffff81649369>] stub_execve+0x69/0xa0
    
    To prevent this from happening, let's always charge kmem after res and
    uncharge before res.
    
    https://bugs.openvz.org/browse/OVZ-6756
    
    Reported-by: Anatoly Stepanov <astepanov at cloudlinux.com>
    Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c3fbb2d2c48..de7c36295515 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3163,10 +3163,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
 	int ret = 0;
 	bool may_oom;
 
-	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
-	if (ret)
-		return ret;
-
 	/*
 	 * Conditions under which we can wait for the oom_killer. Those are
 	 * the same conditions tested by the core page allocator
@@ -3198,8 +3194,33 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
 			res_counter_charge_nofail(&memcg->memsw, size,
 						  &fail_res);
 		ret = 0;
-	} else if (ret)
-		res_counter_uncharge(&memcg->kmem, size);
+	}
+
+	if (ret)
+		return ret;
+
+	/*
+	 * When a cgroup is destroyed, all user memory pages get recharged to
+	 * the parent cgroup. Recharging is done by mem_cgroup_reparent_charges
+	 * which keeps looping until res <= kmem. This is supposed to guarantee
+	 * that by the time the cgroup gets released, no pages are charged to it.
+	 *
+	 * If kmem were charged before res or uncharged after, kmem might
+	 * become greater than res for a short period of time even if there
+	 * were still user memory pages charged to the cgroup. In this case
+	 * mem_cgroup_reparent_charges would give up prematurely, and the
+	 * cgroup could be released though there were still pages charged to
+	 * it. Uncharge of such a page would trigger kernel panic.
+	 *
+	 * To prevent this from happening, kmem must be charged after res and
+	 * uncharged before res.
+	 */
+	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
+	if (ret) {
+		res_counter_uncharge(&memcg->res, size);
+		if (do_swap_account)
+			res_counter_uncharge(&memcg->memsw, size);
+	}
 
 	return ret;
 }
@@ -3208,20 +3229,27 @@ void memcg_charge_kmem_nofail(struct mem_cgroup *memcg, u64 size)
 {
 	struct res_counter *fail_res;
 
-	res_counter_charge_nofail(&memcg->kmem, size, &fail_res);
 	res_counter_charge_nofail(&memcg->res, size, &fail_res);
 	if (do_swap_account)
 		res_counter_charge_nofail(&memcg->memsw, size, &fail_res);
+
+	/* kmem must be charged after res - see memcg_charge_kmem() */
+	res_counter_charge_nofail(&memcg->kmem, size, &fail_res);
 }
 
 void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 {
+	u64 kmem;
+
+	/* kmem must be uncharged before res - see memcg_charge_kmem() */
+	kmem = res_counter_uncharge(&memcg->kmem, size);
+
 	res_counter_uncharge(&memcg->res, size);
 	if (do_swap_account)
 		res_counter_uncharge(&memcg->memsw, size);
 
 	/* Not down to 0 */
-	if (res_counter_uncharge(&memcg->kmem, size))
+	if (kmem)
 		return;
 
 	/*


More information about the Devel mailing list