[Devel] [PATCH rh7 2/2] mm/memcg: reclaim only kmem if kmem limit reached.

Mon Aug 28 12:02:11 MSK 2017

25.08.2017 18:38, Andrey Ryabinin пишет:
> If the kmem limit of a memcg is reached, we go into memory reclaim
> and reclaim everything we can, including page cache and anon memory.
> Reclaiming page cache or anon memory won't help, since we only need
> to lower kmem usage. This patch fixes the problem by avoiding
> non-kmem reclaim when the kmem limit is hit.
> 

Couldn't there be a situation where some object in anon memory or the page cache (indirectly) holds a reference to an object in kmem?

> https://jira.sw.ru/browse/PSBM-69226
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
> ---
>  include/linux/memcontrol.h | 10 ++++++++++
>  include/linux/swap.h       |  2 +-
>  mm/memcontrol.c            | 30 ++++++++++++++++--------------
>  mm/vmscan.c                | 31 ++++++++++++++++++++++++-------
>  4 files changed, 51 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1a52e58ab7de..1d6bc80c4c90 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie {
>  	unsigned int generation;
>  };
>  
> +/*
> + * Reclaim flags for mem_cgroup_hierarchical_reclaim
> + */
> +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
> +#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> +#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
> +#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> +#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
> +#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
> +
>  #ifdef CONFIG_MEMCG
>  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>  			  gfp_t gfp_mask, struct mem_cgroup **memcgp);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index bd162f9bef0d..bd47451ec95a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
>  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
>  						  unsigned long nr_pages,
> -						  gfp_t gfp_mask, bool noswap);
> +						  gfp_t gfp_mask, int flags);
>  extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
>  						gfp_t gfp_mask, bool noswap,
>  						struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 97824e281d7a..f9a5f3819a31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -511,16 +511,6 @@ enum res_type {
>  #define OOM_CONTROL		(0)
>  
>  /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
> -#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
> -
> -/*
>   * The memcg_create_mutex will be held whenever a new cgroup is created.
>   * As a consequence, any change that needs to protect against new child cgroups
>   * appearing has to hold it as well.
> @@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
>  		if (loop)
>  			drain_all_stock_async(memcg);
>  		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> -						      gfp_mask, noswap);
> +						      gfp_mask, flags);
>  		if (test_thread_flag(TIF_MEMDIE) ||
>  		    fatal_signal_pending(current))
>  			return 1;
> @@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
>  			break;
>  		if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
>  			break;
> +
> +		/*
> +		 * Try harder to reclaim dcache. dcache reclaim may
> +		 * temporarily fail due to dcache->dlock being held
> +		 * by someone else. We must try harder to avoid premature
> +		 * slab allocation failures.
> +		 */
> +		if (flags & MEM_CGROUP_RECLAIM_KMEM &&
> +		    page_counter_read(&memcg->dcache))
> +			continue;
>  		/*
>  		 * If nothing was reclaimed after two attempts, there
>  		 * may be no reclaimable pages in this hierarchy.
> @@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
>  	struct mem_cgroup *mem_over_limit;
>  	struct page_counter *counter;
>  	unsigned long nr_reclaimed;
> -	unsigned long flags = 0;
> +	unsigned long flags;
>  
>  	if (mem_cgroup_is_root(memcg))
>  		goto done;
>  retry:
> +	flags = 0;
> +
>  	if (consume_stock(memcg, nr_pages)) {
>  		if (!kmem_charge)
>  			goto done;
> @@ -4138,7 +4140,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>  			return -EINTR;
>  
>  		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> -							GFP_KERNEL, false);
> +							GFP_KERNEL, 0);
>  		if (!progress) {
>  			nr_retries--;
>  			/* maybe some writeback is necessary */
> @@ -4573,7 +4575,7 @@ static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
>  	usage = page_counter_read(&memcg->memory);
>  	if (usage > nr_pages)
>  		try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
> -					     GFP_KERNEL, false);
> +					     GFP_KERNEL, 0);
>  	return 0;
>  }
>  
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 277bd37bd430..a5db5940bb1e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -88,6 +88,9 @@ struct scan_control {
>  	/* Scan (total_size >> priority) pages at once */
>  	int priority;
>  
> +	/* Reclaim only slab */
> +	bool slab_only;
> +
>  	/*
>  	 * The memory cgroup that hit its limit and as a result is the
>  	 * primary target of this reclaim invocation.
> @@ -2346,6 +2349,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	unsigned long nr_reclaimed, nr_scanned;
>  	gfp_t slab_gfp = sc->gfp_mask;
> +	bool slab_only = sc->slab_only;
>  
>  	/* Disable fs-related IO for direct reclaim */
>  	if (!sc->target_mem_cgroup &&
> @@ -2372,14 +2376,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
>  			if (!sc->may_thrash && mem_cgroup_low(root, memcg))
>  				continue;
>  
> -			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
>  			scanned = sc->nr_scanned;
> -			shrink_lruvec(lruvec, sc, &lru_pages);
> -			zone_lru_pages += lru_pages;
>  
> -			if (memcg && is_classzone)
> +			if (!slab_only) {
> +				lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +				shrink_lruvec(lruvec, sc, &lru_pages);
> +				zone_lru_pages += lru_pages;
> +			}
> +
> +			if (memcg && is_classzone) {
>  				shrink_slab(slab_gfp, zone_to_nid(zone),
>  					    memcg, sc->priority, false);
> +				if (reclaim_state) {
> +					sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> +					sc->nr_scanned += reclaim_state->reclaimed_slab;
> +					reclaim_state->reclaimed_slab = 0;
> +				}
> +
> +			}
>  
>  			/*
>  			 * Direct reclaim and kswapd have to scan all memory
> @@ -2902,15 +2916,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
>  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>  					   unsigned long nr_pages,
>  					   gfp_t gfp_mask,
> -					   bool noswap)
> +					   int flags)
>  {
>  	struct zonelist *zonelist;
>  	unsigned long nr_reclaimed;
> +	struct reclaim_state reclaim_state = { 0 };
>  	int nid;
>  	struct scan_control sc = {
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
> -		.may_swap = !noswap,
> +		.may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
> +		.slab_only = flags & MEM_CGROUP_RECLAIM_KMEM,
>  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
>  		.order = 0,
>  		.priority = DEF_PRIORITY,
> @@ -2933,10 +2949,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>  					    sc.may_writepage,
>  					    sc.gfp_mask);
>  
> +	current->reclaim_state = &reclaim_state;
>  	current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
>  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>  	current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
> -
> +	current->reclaim_state = NULL;
>  	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
>  
>  	return nr_reclaimed;
>