[Devel] [RFC][BUGFIX] memcg: fix dead lock between cpuset and memcg (Re: [PATCH v5 3/3] cgroups: make procs file writable)
Daisuke Nishimura
nishimura at mxp.nes.nec.co.jp
Mon Dec 27 18:43:52 PST 2010
> It looks to me like when memcg holds the mmap_sem the whole time, it's
> just to avoid the deadlock, not that there's some need for the
> stuff under mmap_sem not to change between can_attach and attach. But if
> there is such a need, then the write-side in mpol_rebind_mm may conflict
> even with my proposed solution.
>
> Regardless, the best way would be to avoid holding the mmap_sem across
> the whole window, possibly by solving the move_charge deadlock some
> other internal way, if at all possible?
>
I made a patch to fix these problems (the deadlock between cpuset and memcg which
commit b1dd693e introduces, and the deadlock which that commit fixed).
I'll test and resend this after new year holidays in Japan.
===
From: Daisuke Nishimura <nishimura at mxp.nes.nec.co.jp>
The commit b1dd693e (memcg: avoid deadlock between move charge and try_charge())
can cause another deadlock on mmap_sem during task migration if cpuset and memcg
are mounted onto the same mount point.
After the commit, cgroup_attach_task() has a call sequence like:
cgroup_attach_task()
ss->can_attach()
cpuset_can_attach()
mem_cgroup_can_attach()
down_read(&mmap_sem) (1)
ss->attach()
cpuset_attach()
mpol_rebind_mm()
down_write(&mmap_sem) (2)
up_write(&mmap_sem)
cpuset_migrate_mm()
do_migrate_pages()
down_read(&mmap_sem)
up_read(&mmap_sem)
mem_cgroup_move_task()
mem_cgroup_clear_mc()
up_read(&mmap_sem)
We can deadlock at (2) because we've already acquired the mmap_sem at (1).
But the commit itself is necessary to fix deadlocks which existed before
the commit, such as:
Ex.1)
move charge | try charge
--------------------------------------+------------------------------
mem_cgroup_can_attach() | down_write(&mmap_sem)
mc.moving_task = current | ..
mem_cgroup_precharge_mc() | __mem_cgroup_try_charge()
mem_cgroup_count_precharge() | prepare_to_wait()
down_read(&mmap_sem) | if (mc.moving_task)
-> cannot acquire the lock | -> true
| schedule()
| -> move charge should wake it up
Ex.2)
move charge | try charge
--------------------------------------+------------------------------
mem_cgroup_can_attach() |
mc.moving_task = current |
mem_cgroup_precharge_mc() |
mem_cgroup_count_precharge() |
down_read(&mmap_sem) |
.. |
up_read(&mmap_sem) |
| down_write(&mmap_sem)
mem_cgroup_move_task() | ..
mem_cgroup_move_charge() | __mem_cgroup_try_charge()
down_read(&mmap_sem) | prepare_to_wait()
-> cannot acquire the lock | if (mc.moving_task)
| -> true
| schedule()
| -> move charge should wake it up
This patch fixes all of these problems by:
1. Reverting the commit.
2. To fix Ex.1, setting mc.moving_task only after mem_cgroup_count_precharge()
has released the mmap_sem.
3. To fix Ex.2, using down_read_trylock() instead of down_read() in
mem_cgroup_move_charge() and, if it fails to acquire the lock, cancelling
all extra charges, waking up all waiters, and retrying the trylock.
Reported-by: Ben Blum <bblum at andrew.cmu.edu>
Signed-off-by: Daisuke Nishimura <nishimura at mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 78 +++++++++++++++++++++++++++++++------------------------
1 files changed, 44 insertions(+), 34 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7a22b41..b108b30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
- struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4639,7 +4638,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
struct vm_area_struct *vma;
- /* We've already held the mmap_sem */
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4651,6 +4650,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
+ up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
@@ -4660,10 +4660,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
@@ -4698,23 +4703,24 @@ static void mem_cgroup_clear_mc(void)
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
- if (mc.mm) {
- up_read(&mc.mm->mmap_sem);
- mmput(mc.mm);
- }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
spin_unlock(&mc.lock);
mc.moving_task = NULL;
- mc.mm = NULL;
mem_cgroup_end_move(from);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4736,38 +4742,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
- /*
- * We do all the move charge works under one mmap_sem to
- * avoid deadlock with down_write(&mmap_sem)
- * -> try_charge() -> if (mc.moving_task) -> sleep.
- */
- down_read(&mm->mmap_sem);
-
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
- VM_BUG_ON(mc.mm);
-
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
spin_unlock(&mc.lock);
- mc.moving_task = current;
- mc.mm = mm;
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
- /* We call up_read() and mmput() in clear_mc(). */
- } else
- mmput(mm);
+ }
+ mmput(mm);
}
return ret;
}
@@ -4855,7 +4846,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
- /* We've already held the mmap_sem */
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+ * Someone who are holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4874,6 +4877,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
*/
break;
}
+ up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4882,11 +4886,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
- if (!mc.mm)
+ struct mm_struct *mm;
+
+ if (!mc.to)
/* no need to move charge */
return;
- mem_cgroup_move_charge(mc.mm);
+ mm = get_task_mm(p);
+ if (mm) {
+ mem_cgroup_move_charge(mm);
+ mmput(mm);
+ }
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
--
1.7.1
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list