[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

Tue Mar 2 22:01:37 PST 2010

On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com> wrote:
> On Wed, 3 Mar 2010 11:12:38 +0900
> Daisuke Nishimura <nishimura at mxp.nes.nec.co.jp> wrote:
> 
> > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > index fe09e51..f85acae 100644
> > > --- a/mm/filemap.c
> > > +++ b/mm/filemap.c
> > > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
> > >  	 * having removed the page entirely.
> > >  	 */
> > >  	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
> > > +		mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
> > >  		dec_zone_page_state(page, NR_FILE_DIRTY);
> > >  		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > >  	}
> > (snip)
> > > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
> > >  void account_page_dirtied(struct page *page, struct address_space *mapping)
> > >  {
> > >  	if (mapping_cap_account_dirty(mapping)) {
> > > +		mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
> > >  		__inc_zone_page_state(page, NR_FILE_DIRTY);
> > >  		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > >  		task_dirty_inc(current);
> > As long as I can see, those two functions(at least) calls mem_cgroup_update_state(),
> > which acquires page cgroup lock, under mapping->tree_lock.
> > But as I fixed before in commit e767e056, page cgroup lock must not acquired under
> > mapping->tree_lock.
> > hmm, we should call those mem_cgroup_update_state() outside mapping->tree_lock,
> > or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid dead-lock.
> > 
> Ah, good catch! But hmmmmmm...
> This account_page_dirtted() seems to be called under IRQ-disabled.
> About  __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag,
> then, mem_cgroup_uncharge_page() can handle it automatically.
> 
> But. there are no guarantee that following never happens. 
> 	lock_page_cgroup()
> 	    <=== interrupt.
> 	    -> mapping->tree_lock()
> Even if mapping->tree_lock is held with IRQ-disabled.
> Then, if we add local_irq_save(), we have to add it to all lock_page_cgroup().
> 
> Then, hm...some kind of new trick ? as..
> (Follwoing patch is not tested!!)
> 
If we can verify that all callers of mem_cgroup_update_stat() have always either aquired
or not aquired tree_lock, this direction will work fine.
But if we can't, we have to add local_irq_save() to lock_page_cgroup() like below.

===
 include/linux/page_cgroup.h |    8 ++++++--
 mm/memcontrol.c             |   43 +++++++++++++++++++++++++------------------
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..51da916 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -83,15 +83,19 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 	return page_zonenum(pc->page);
 }
 
-static inline void lock_page_cgroup(struct page_cgroup *pc)
+static inline void __lock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_lock(PCG_LOCK, &pc->flags);
 }
+#define lock_page_cgroup(pc, flags) \
+  do { local_irq_save(flags); __lock_page_cgroup(pc); } while (0)
 
-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline void __unlock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
+#define unlock_page_cgroup(pc, flags) \
+  do { __unlock_page_cgroup(pc); local_irq_restore(flags); } while (0)
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00ed4b1..40b9be4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1327,12 +1327,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
+	unsigned long flags;
 
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	mem = pc->mem_cgroup;
 	if (!mem)
 		goto done;
@@ -1346,7 +1347,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
 done:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 }
 
 /*
@@ -1680,11 +1681,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	struct page_cgroup *pc;
 	unsigned short id;
 	swp_entry_t ent;
+	unsigned long flags;
 
 	VM_BUG_ON(!PageLocked(page));
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		if (mem && !css_tryget(&mem->css))
@@ -1698,7 +1700,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 			mem = NULL;
 		rcu_read_unlock();
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	return mem;
 }
 
@@ -1711,13 +1713,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 				     struct page_cgroup *pc,
 				     enum charge_type ctype)
 {
+	unsigned long flags;
+
 	/* try_charge() can return NULL to *memcg, taking care of it. */
 	if (!mem)
 		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
+		unlock_page_cgroup(pc, flags);
 		mem_cgroup_cancel_charge(mem);
 		return;
 	}
@@ -1747,7 +1751,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1817,12 +1821,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
 	int ret = -EINVAL;
-	lock_page_cgroup(pc);
+	unsigned long flags;
+	lock_page_cgroup(pc, flags);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		__mem_cgroup_move_account(pc, from, to, uncharge);
 		ret = 0;
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	/*
 	 * check events
 	 */
@@ -1949,17 +1954,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	 */
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
-
+		unsigned long flags;
 
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
-		lock_page_cgroup(pc);
+		lock_page_cgroup(pc, flags);
 		if (PageCgroupUsed(pc)) {
-			unlock_page_cgroup(pc);
+			unlock_page_cgroup(pc, flags);
 			return 0;
 		}
-		unlock_page_cgroup(pc);
+		unlock_page_cgroup(pc, flags);
 	}
 
 	if (unlikely(!mm && !mem))
@@ -2141,6 +2146,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2155,7 +2161,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
 		return NULL;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 
 	mem = pc->mem_cgroup;
 
@@ -2194,7 +2200,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 */
 
 	mz = page_cgroup_zoneinfo(pc);
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 
 	memcg_check_events(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
@@ -2204,7 +2210,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	return mem;
 
 unlock_out:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	return NULL;
 }
 
@@ -2392,17 +2398,18 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	int ret = 0;
+	unsigned long flags;
 
 	if (mem_cgroup_disabled())
 		return 0;
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 
 	if (mem) {
 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);


> ==
> ---
>  include/linux/page_cgroup.h |   14 ++++++++++++++
>  mm/memcontrol.c             |   27 +++++++++++++++++----------
>  2 files changed, 31 insertions(+), 10 deletions(-)
> 
> Index: mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/include/linux/page_cgroup.h
> +++ mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
> @@ -39,6 +39,7 @@ enum {
>  	PCG_CACHE, /* charged as cache */
>  	PCG_USED, /* this object is in use. */
>  	PCG_ACCT_LRU, /* page has been accounted for */
> +	PCG_MIGRATE, /* page cgroup is under memcg account migration */
>  };
>  
>  #define TESTPCGFLAG(uname, lname)			\
> @@ -73,6 +74,8 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
>  TESTPCGFLAG(AcctLRU, ACCT_LRU)
>  TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>  
> +TESTPCGFLAG(Migrate, MIGRATE)
> +
>  static inline int page_cgroup_nid(struct page_cgroup *pc)
>  {
>  	return page_to_nid(pc->page);
> @@ -93,6 +96,17 @@ static inline void unlock_page_cgroup(st
>  	bit_spin_unlock(PCG_LOCK, &pc->flags);
>  }
>  
> +static inline unsigned long page_cgroup_migration_lock(struct page_cgroup *pc)
> +{
> +	local_irq_save(flags);
> +	bit_spin_lock(PCG_MIGRATE, &pc->flags);
> +}
> +static inline void
> +page_cgroup_migration_lock(struct page_cgroup *pc, unsigned long flags)
> +{
> +	bit_spin_lock(PCG_MIGRATE, &pc->flags);
> +	local_irq_restore(flags);
> +}
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct page_cgroup;
>  
> Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
> +++ mmotm-2.6.33-Feb11/mm/memcontrol.c
> @@ -1321,7 +1321,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
>   * Currently used to update mapped file statistics, but the routine can be
>   * generalized to update other statistics as well.
>   */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_file_mapped(struct page *page, int val, int locked)
>  {
>  	struct mem_cgroup *mem;
>  	struct page_cgroup *pc;
> @@ -1329,22 +1329,27 @@ void mem_cgroup_update_file_mapped(struc
>  	pc = lookup_page_cgroup(page);
>  	if (unlikely(!pc))
>  		return;
> -
> -	lock_page_cgroup(pc);
> +	/*
> +	 * if locked==1, mapping->tree_lock is held. We don't have to take
> +	 * care of charge/uncharge. just think about migration.
> +	 */
> +	if (!locked)
> +		lock_page_cgroup(pc);
> +	else
> +		page_cgroup_migration_lock(pc);
>  	mem = pc->mem_cgroup;
> -	if (!mem)
> +	if (!mem || !PageCgroupUsed(pc))
>  		goto done;
> -
> -	if (!PageCgroupUsed(pc))
> -		goto done;
> -
>  	/*
>  	 * Preemption is already disabled. We can use __this_cpu_xxx
>  	 */
>  	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
>  
>  done:
> -	unlock_page_cgroup(pc);
> +	if (!locked)
> +		lock_page_cgroup(pc);
> +	else
> +		page_cgroup_migration_unlock(pc);
>  }
>  
>  /*
> @@ -1785,7 +1790,8 @@ static void __mem_cgroup_move_account(st
>  	VM_BUG_ON(!PageCgroupLocked(pc));
>  	VM_BUG_ON(!PageCgroupUsed(pc));
>  	VM_BUG_ON(pc->mem_cgroup != from);
> -
> +		
> +	page_cgroup_migration_lock(pc);
>  	page = pc->page;
>  	if (page_mapped(page) && !PageAnon(page)) {
>  		/* Update mapped_file data for mem_cgroup */
> @@ -1802,6 +1808,7 @@ static void __mem_cgroup_move_account(st
>  	/* caller should have done css_get */
>  	pc->mem_cgroup = to;
>  	mem_cgroup_charge_statistics(to, pc, true);
> +	page_cgroup_migration_lock(pc);
>  	/*
>  	 * We charges against "to" which may not have any tasks. Then, "to"
>  	 * can be under rmdir(). But in current implementation, caller of
> 
> 
> 
> 
> Thanks,
> -Kame
> 
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers