[Devel] [PATCH rh7 2/2] mm/memcg: reclaim only kmem if kmem limit reached.
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Aug 31 12:58:51 MSK 2017
Do we want to push this patch to mainstream as well?
--
Best regards,
Konstantin Khorenko,
Virtuozzo Linux Kernel Team
On 08/25/2017 06:38 PM, Andrey Ryabinin wrote:
> If the kmem limit of a memcg is reached, we go into memory reclaim
> and reclaim everything we can, including page cache and anon.
> Reclaiming page cache or anon won't help, since we only need to lower
> kmem usage. This patch fixes the problem by avoiding non-kmem reclaim
> when the kmem limit is hit.
>
> https://jira.sw.ru/browse/PSBM-69226
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
> ---
> include/linux/memcontrol.h | 10 ++++++++++
> include/linux/swap.h | 2 +-
> mm/memcontrol.c | 30 ++++++++++++++++--------------
> mm/vmscan.c | 31 ++++++++++++++++++++++++-------
> 4 files changed, 51 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1a52e58ab7de..1d6bc80c4c90 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie {
> unsigned int generation;
> };
>
> +/*
> + * Reclaim flags for mem_cgroup_hierarchical_reclaim
> + */
> +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
> +#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> +#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
> +#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> +#define MEM_CGROUP_RECLAIM_KMEM_BIT 0x2
> +#define MEM_CGROUP_RECLAIM_KMEM (1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
> +
> #ifdef CONFIG_MEMCG
> int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask, struct mem_cgroup **memcgp);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index bd162f9bef0d..bd47451ec95a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
> extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> unsigned long nr_pages,
> - gfp_t gfp_mask, bool noswap);
> + gfp_t gfp_mask, int flags);
> extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
> gfp_t gfp_mask, bool noswap,
> struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 97824e281d7a..f9a5f3819a31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -511,16 +511,6 @@ enum res_type {
> #define OOM_CONTROL (0)
>
> /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -#define MEM_CGROUP_RECLAIM_KMEM_BIT 0x2
> -#define MEM_CGROUP_RECLAIM_KMEM (1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
> -
> -/*
> * The memcg_create_mutex will be held whenever a new cgroup is created.
> * As a consequence, any change that needs to protect against new child cgroups
> * appearing has to hold it as well.
> @@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
> if (loop)
> drain_all_stock_async(memcg);
> total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> - gfp_mask, noswap);
> + gfp_mask, flags);
> if (test_thread_flag(TIF_MEMDIE) ||
> fatal_signal_pending(current))
> return 1;
> @@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
> break;
> if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
> break;
> +
> + /*
> + * Try harder to reclaim dcache. dcache reclaim may
> +	 * temporarily fail due to dcache->dlock being held
> + * by someone else. We must try harder to avoid premature
> + * slab allocation failures.
> + */
> + if (flags & MEM_CGROUP_RECLAIM_KMEM &&
> + page_counter_read(&memcg->dcache))
> + continue;
> /*
> * If nothing was reclaimed after two attempts, there
> * may be no reclaimable pages in this hierarchy.
> @@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
> struct mem_cgroup *mem_over_limit;
> struct page_counter *counter;
> unsigned long nr_reclaimed;
> - unsigned long flags = 0;
> + unsigned long flags;
>
> if (mem_cgroup_is_root(memcg))
> goto done;
> retry:
> + flags = 0;
> +
> if (consume_stock(memcg, nr_pages)) {
> if (!kmem_charge)
> goto done;
> @@ -4138,7 +4140,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> return -EINTR;
>
> progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> - GFP_KERNEL, false);
> + GFP_KERNEL, 0);
> if (!progress) {
> nr_retries--;
> /* maybe some writeback is necessary */
> @@ -4573,7 +4575,7 @@ static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
> usage = page_counter_read(&memcg->memory);
> if (usage > nr_pages)
> try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
> - GFP_KERNEL, false);
> + GFP_KERNEL, 0);
> return 0;
> }
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 277bd37bd430..a5db5940bb1e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -88,6 +88,9 @@ struct scan_control {
> /* Scan (total_size >> priority) pages at once */
> int priority;
>
> + /* Reclaim only slab */
> + bool slab_only;
> +
> /*
> * The memory cgroup that hit its limit and as a result is the
> * primary target of this reclaim invocation.
> @@ -2346,6 +2349,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
> struct reclaim_state *reclaim_state = current->reclaim_state;
> unsigned long nr_reclaimed, nr_scanned;
> gfp_t slab_gfp = sc->gfp_mask;
> + bool slab_only = sc->slab_only;
>
> /* Disable fs-related IO for direct reclaim */
> if (!sc->target_mem_cgroup &&
> @@ -2372,14 +2376,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
> if (!sc->may_thrash && mem_cgroup_low(root, memcg))
> continue;
>
> - lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> scanned = sc->nr_scanned;
> - shrink_lruvec(lruvec, sc, &lru_pages);
> - zone_lru_pages += lru_pages;
>
> - if (memcg && is_classzone)
> + if (!slab_only) {
> + lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + shrink_lruvec(lruvec, sc, &lru_pages);
> + zone_lru_pages += lru_pages;
> + }
> +
> + if (memcg && is_classzone) {
> shrink_slab(slab_gfp, zone_to_nid(zone),
> memcg, sc->priority, false);
> + if (reclaim_state) {
> + sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> + sc->nr_scanned += reclaim_state->reclaimed_slab;
> + reclaim_state->reclaimed_slab = 0;
> + }
> +
> + }
>
> /*
> * Direct reclaim and kswapd have to scan all memory
> @@ -2902,15 +2916,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> unsigned long nr_pages,
> gfp_t gfp_mask,
> - bool noswap)
> + int flags)
> {
> struct zonelist *zonelist;
> unsigned long nr_reclaimed;
> + struct reclaim_state reclaim_state = { 0 };
> int nid;
> struct scan_control sc = {
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> - .may_swap = !noswap,
> + .may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
> + .slab_only = flags & MEM_CGROUP_RECLAIM_KMEM,
> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> .order = 0,
> .priority = DEF_PRIORITY,
> @@ -2933,10 +2949,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> sc.may_writepage,
> sc.gfp_mask);
>
> + current->reclaim_state = &reclaim_state;
> current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
> nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
> current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
> -
> + current->reclaim_state = NULL;
> trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
>
> return nr_reclaimed;
>
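For reference, here is a minimal user-space sketch of the control flow the
patch introduces (the struct and helper names below are simplified stand-ins,
not the actual kernel code): a kmem-limit hit passes MEM_CGROUP_RECLAIM_KMEM
down to try_to_free_mem_cgroup_pages(), which sets sc.slab_only, so
shrink_zone() skips the LRU lists and calls shrink_slab() only.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the reclaim flags the patch moves to memcontrol.h */
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << 0)
#define MEM_CGROUP_RECLAIM_KMEM   (1 << 2)

/* Hypothetical mirror of the scan_control fields the patch touches */
struct sc_sketch {
	bool may_swap;
	bool slab_only;
};

/*
 * Models how try_to_free_mem_cgroup_pages() now builds its scan_control:
 * a kmem-limit hit sets slab_only, so the LRU lists are skipped and only
 * the slab shrinkers are invoked.
 */
static struct sc_sketch make_sc(int flags)
{
	struct sc_sketch sc = {
		.may_swap  = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
		.slab_only = !!(flags & MEM_CGROUP_RECLAIM_KMEM),
	};
	return sc;
}

int main(void)
{
	struct sc_sketch kmem_hit = make_sc(MEM_CGROUP_RECLAIM_KMEM);
	struct sc_sketch mem_hit  = make_sc(0);

	printf("kmem limit hit:   slab_only=%d may_swap=%d\n",
	       kmem_hit.slab_only, kmem_hit.may_swap);
	printf("memory limit hit: slab_only=%d may_swap=%d\n",
	       mem_hit.slab_only, mem_hit.may_swap);
	return 0;
}

Keeping the decision in scan_control (rather than adding a separate reclaim
path) lets the existing priority loop and shrinker accounting be reused
unchanged.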