[Devel] [PATCH RH8 2/5] mm/memcg: limit page cache in memcg hack

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Fri Jul 30 15:49:32 MSK 2021



On 29.07.2021 18:20, Alexander Mikhalitsyn wrote:
> From: Andrey Ryabinin <aryabinin at virtuozzo.com>
> 
> Add new memcg file - memory.cache.limit_in_bytes.
> Used to limit page cache usage in cgroup.
> 
> https://jira.sw.ru/browse/PSBM-77547
> 
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
> 
> khorenko@: use case:
> imagine a system service whose anon memory you don't want to limit
> (in our case it's a vStorage cgroup which hosts CSes and MDSes; they can
> consume memory in some range, and we don't want to set a limit for the
> maximum possible consumption - it's too high, we don't know the number of
> CSes on the node - the admin can add CSes dynamically - and we don't want
> to dynamically increase/decrease the limit).
> 
> If the cgroup is "unlimited", it produces permanent memory pressure on the node
> because it generates a lot of pagecache, and other cgroups on the node are
> affected (even taking proportional fair reclaim into account).
> 
> => the solution is to limit pagecache only, and that is what is implemented here.
> 
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
> (cherry picked from commit da9151c891819733762a178b4efd7e44766fb8b1)
> 
> Reworked:
> now we have no charge/cancel/commit/uncharge memcg API (we only have charge/uncharge)
> => we have to track pages which were charged as page cache => an additional flag was
> introduced, implemented using the mm/page_ext.c subsystem (see mm/page_vzext.c)
> 
> See ms commits:
> 0d1c2072 ("mm: memcontrol: switch to native NR_FILE_PAGES and NR_SHMEM counters")
> 3fea5a49 ("mm: memcontrol: convert page cache to a new mem_cgroup_charge() API")
> 
> https://jira.sw.ru/browse/PSBM-131957
> 
> Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
> ---
>   include/linux/memcontrol.h   |   3 +
>   include/linux/page_vzflags.h |  31 ++++++
>   mm/filemap.c                 |   2 +-
>   mm/memcontrol.c              | 188 ++++++++++++++++++++++++++++++-----
>   4 files changed, 198 insertions(+), 26 deletions(-)
>   create mode 100644 include/linux/page_vzflags.h
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index fdf93a2de456..2e053bd43050 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -285,6 +285,7 @@ struct mem_cgroup {
>   	struct page_counter kmem;
>   	struct page_counter tcpmem;
>   	RH_KABI_DEPRECATE(unsigned long, high)
> +	struct page_counter cache;
>   #else
>   	union {
>   		struct page_counter swap;	/* v2 only */
> @@ -294,6 +295,7 @@ struct mem_cgroup {
>   	/* Legacy consumer-oriented counters */
>   	struct page_counter kmem;		/* v1 only */
>   	struct page_counter tcpmem;		/* v1 only */
> +	struct page_counter cache;
>   #endif

Moving "cache" after the #endif should not change anything, but would save 
us from duplicating the line.
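Something like this is what I mean - just a sketch, and whether the field
can live outside the #ifdef without upsetting the RH_KABI layout would need
checking:

	/* Legacy consumer-oriented counters */
	struct page_counter kmem;		/* v1 only */
	struct page_counter tcpmem;		/* v1 only */
#endif
	/* page cache limit, shared by both variants of the struct layout */
	struct page_counter cache;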

>   
>   	/* Range enforcement for interrupt charges */
> @@ -510,6 +512,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
>   						struct mem_cgroup *memcg);
>   
>   int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
> +int mem_cgroup_charge_cache(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);

We should probably also define it for the !CONFIG_MEMCG case.
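E.g. a no-op stub next to the other !CONFIG_MEMCG stubs, roughly (untested,
just to illustrate the idea):

static inline int mem_cgroup_charge_cache(struct page *page,
					  struct mm_struct *mm,
					  gfp_t gfp_mask)
{
	return 0;
}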

>   
>   void mem_cgroup_uncharge(struct page *page);
>   void mem_cgroup_uncharge_list(struct list_head *page_list);
> diff --git a/include/linux/page_vzflags.h b/include/linux/page_vzflags.h
> new file mode 100644
> index 000000000000..64a06ba8c9f6
> --- /dev/null
> +++ b/include/linux/page_vzflags.h
> @@ -0,0 +1,31 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __LINUX_PAGE_VZFLAGS_H
> +#define __LINUX_PAGE_VZFLAGS_H
> +
> +#include <linux/page_vzext.h>
> +#include <linux/page-flags.h>
> +
> +enum vzpageflags {
> +	PGVZ_pagecache,
> +};
> +
> +#define TESTVZPAGEFLAG(uname, lname)				\
> +static __always_inline int PageVz##uname(struct page *page)		\
> +	{ return get_page_vzext(page) && test_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
> +
> +#define SETVZPAGEFLAG(uname, lname)				\
> +static __always_inline void SetVzPage##uname(struct page *page)		\
> +	{ if (get_page_vzext(page)) set_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
> +
> +#define CLEARVZPAGEFLAG(uname, lname)				\
> +static __always_inline void ClearVzPage##uname(struct page *page)		\
> +	{ if (get_page_vzext(page)) clear_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
> +
> +#define VZPAGEFLAG(uname, lname)					\
> +	TESTVZPAGEFLAG(uname, lname)				\
> +	SETVZPAGEFLAG(uname, lname)				\
> +	CLEARVZPAGEFLAG(uname, lname)
> +
> +VZPAGEFLAG(PageCache, pagecache)
> +
> +#endif /* __LINUX_PAGE_VZFLAGS_H */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f1a56ac4ec41..3d4e912f0930 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -862,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page,
>   	page->index = offset;
>   
>   	if (!huge) {
> -		error = mem_cgroup_charge(page, current->mm, gfp);
> +		error = mem_cgroup_charge_cache(page, current->mm, gfp);
>   		if (error)
>   			goto error;
>   		charged = true;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d3aa1cf20796..ac0ece68a17a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -42,6 +42,7 @@
>   #include <linux/vm_event_item.h>
>   #include <linux/smp.h>
>   #include <linux/page-flags.h>
> +#include <linux/page_vzflags.h>
>   #include <linux/backing-dev.h>
>   #include <linux/bit_spinlock.h>
>   #include <linux/rcupdate.h>
> @@ -225,6 +226,7 @@ enum res_type {
>   	_OOM_TYPE,
>   	_KMEM,
>   	_TCP,
> +	_CACHE,
>   };
>   
>   #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
> @@ -2799,7 +2801,7 @@ static bool kmem_reclaim_is_low(struct mem_cgroup *memcg)
>   }
>   
>   static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge,
> -		      unsigned int nr_pages)
> +		      unsigned int nr_pages, bool cache_charge)
>   {
>   	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
>   	int nr_retries = MAX_RECLAIM_RETRIES;
> @@ -2817,13 +2819,21 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
>   	may_swap = true;
>   	kmem_limit = false;
>   	if (consume_stock(memcg, nr_pages)) {
> -		if (!kmem_charge)
> -			return 0;
> -		if (page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
> -			return 0;
> -		refill_stock(memcg, nr_pages);
> +		if (kmem_charge && !page_counter_try_charge(
> +				&memcg->kmem, nr_pages, &counter)) {
> +			refill_stock(memcg, nr_pages);
> +			goto charge;
> +		}
> +
> +		if (cache_charge && !page_counter_try_charge(
> +				&memcg->cache, nr_pages, &counter)) {
> +			refill_stock(memcg, nr_pages);
> +			goto charge;
> +		}
> +		return 0;
>   	}
>   
> +charge:
>   	mem_over_limit = NULL;
>   	if (page_counter_try_charge(&memcg->memory, batch, &counter)) {
>   		if (do_memsw_account() && !page_counter_try_charge(
> @@ -2845,6 +2855,19 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
>   		}
>   	}
>   
> +	if (!mem_over_limit && cache_charge) {
> +		if (page_counter_try_charge(&memcg->cache, nr_pages, &counter))
> +			goto done_restock;
> +
> +		may_swap = false;
> +		mem_over_limit = mem_cgroup_from_counter(counter, cache);
> +		page_counter_uncharge(&memcg->memory, batch);
> +		if (do_memsw_account())
> +			page_counter_uncharge(&memcg->memsw, batch);
> +		if (kmem_charge)
> +			page_counter_uncharge(&memcg->kmem, nr_pages);
> +	}
> +
>   	if (!mem_over_limit)
>   		goto done_restock;
>   
> @@ -2970,10 +2993,17 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
>   	page_counter_charge(&memcg->memory, nr_pages);
>   	if (do_memsw_account())
>   		page_counter_charge(&memcg->memsw, nr_pages);
> +	if (kmem_charge)
> +		page_counter_charge(&memcg->kmem, nr_pages);
> +	if (cache_charge)
> +		page_counter_charge(&memcg->cache, nr_pages);
>   
>   	return 0;
>   
>   done_restock:
> +	if (cache_charge)
> +		page_counter_charge(&memcg->cache, batch);
> +

Not sure, but it seems like a hunk from "mm/memcg: reclaim 
memory.cache.limit_in_bytes from background", as if you've merged part of 
the later patch into this earlier one. Please check that it's OK.

>   	if (batch > nr_pages)
>   		refill_stock(memcg, batch - nr_pages);
>   
> @@ -3200,7 +3230,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
>   {
>   	int ret;
>   
> -	ret = try_charge(memcg, gfp, true, nr_pages);
> +	ret = try_charge(memcg, gfp, true, nr_pages, false);
>   	if (ret)
>   		return ret;
>   
> @@ -3411,7 +3441,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
>   {
>   	int ret = 0;
>   
> -	ret = try_charge(memcg, gfp, true, nr_pages);
> +	ret = try_charge(memcg, gfp, true, nr_pages, false);
>   	return ret;
>   }
>   
> @@ -3763,6 +3793,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
>   		break;
>   	case _TCP:
>   		counter = &memcg->tcpmem;
> +		break;
> +	case _CACHE:
> +		counter = &memcg->cache;
>   		break;
>   	default:
>   		BUG();
> @@ -3985,6 +4017,43 @@ static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
>   	return ret;
>   }
>   
> +static int memcg_update_cache_max(struct mem_cgroup *memcg,
> +				  unsigned long limit)
> +{
> +	unsigned long nr_pages;
> +	bool enlarge = false;
> +	int ret;
> +
> +	do {
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
> +		mutex_lock(&memcg_max_mutex);
> +
> +		if (limit > memcg->cache.max)
> +			enlarge = true;
> +
> +		ret = page_counter_set_max(&memcg->cache, limit);
> +		mutex_unlock(&memcg_max_mutex);
> +
> +		if (!ret)
> +			break;
> +
> +		nr_pages = max_t(long, 1, page_counter_read(&memcg->cache) - limit);
> +		if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
> +						GFP_KERNEL, false)) {
> +			ret = -EBUSY;
> +			break;
> +		}
> +	} while (1);
> +
> +	if (!ret && enlarge)
> +		memcg_oom_recover(memcg);
> +
> +	return ret;
> +}
> +
>   /*
>    * The user of this function is...
>    * RES_LIMIT.
> @@ -4019,6 +4088,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
>   			break;
>   		case _TCP:
>   			ret = memcg_update_tcp_max(memcg, nr_pages);
> +			break;
> +		case _CACHE:
> +			ret = memcg_update_cache_max(memcg, nr_pages);
>   			break;
>   		}
>   		break;
> @@ -4048,6 +4119,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
>   		break;
>   	case _TCP:
>   		counter = &memcg->tcpmem;
> +		break;
> +	case _CACHE:
> +		counter = &memcg->cache;
>   		break;
>   	default:
>   		BUG();
> @@ -5713,6 +5786,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
>   	{
>   		.name = "pressure_level",
>   	},
> +	{
> +		.name = "cache.limit_in_bytes",
> +		.private = MEMFILE_PRIVATE(_CACHE, RES_LIMIT),
> +		.write = mem_cgroup_write,
> +		.read_u64 = mem_cgroup_read_u64,
> +	},
>   #ifdef CONFIG_NUMA
>   	{
>   		.name = "numa_stat",
> @@ -6031,17 +6110,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>   		page_counter_init(&memcg->swap, NULL);
>   		page_counter_init(&memcg->kmem, NULL);
>   		page_counter_init(&memcg->tcpmem, NULL);
> +		page_counter_init(&memcg->cache, NULL);
>   	} else if (parent->use_hierarchy) {
>   		memcg->use_hierarchy = true;
>   		page_counter_init(&memcg->memory, &parent->memory);
>   		page_counter_init(&memcg->swap, &parent->swap);
>   		page_counter_init(&memcg->kmem, &parent->kmem);
>   		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
> +		page_counter_init(&memcg->cache, &parent->cache);
>   	} else {
>   		page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
>   		page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
>   		page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
>   		page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
> +		page_counter_init(&memcg->cache, &root_mem_cgroup->cache);
>   		/*
>   		 * Deeper hierachy with use_hierarchy == false doesn't make
>   		 * much sense so let cgroup subsystem know about this
> @@ -6170,6 +6252,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
>   	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
>   	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
>   	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
> +	page_counter_set_max(&memcg->cache, PAGE_COUNTER_MAX);
>   	page_counter_set_min(&memcg->memory, 0);
>   	page_counter_set_low(&memcg->memory, 0);
>   	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
> @@ -6185,7 +6268,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
>   	int ret;
>   
>   	/* Try a single bulk charge without reclaim first, kswapd may wake */
> -	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, false, count);
> +	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, false,
> +			count, false);
>   	if (!ret) {
>   		mc.precharge += count;
>   		return ret;
> @@ -6193,7 +6277,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
>   
>   	/* Try charges one by one with reclaim, but do not retry */
>   	while (count--) {
> -		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, false, 1);
> +		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, false, 1, false);
> +
>   		if (ret)
>   			return ret;
>   		mc.precharge++;
> @@ -7405,18 +7490,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
>   		return MEMCG_PROT_NONE;
>   }
>   
> -/**
> - * mem_cgroup_charge - charge a newly allocated page to a cgroup
> - * @page: page to charge
> - * @mm: mm context of the victim
> - * @gfp_mask: reclaim mode
> - *
> - * Try to charge @page to the memcg that @mm belongs to, reclaiming
> - * pages according to @gfp_mask if necessary.
> - *
> - * Returns 0 on success. Otherwise, an error code is returned.
> - */
> -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +static int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
> +			       gfp_t gfp_mask, bool cache_charge)
>   {
>   	unsigned int nr_pages = thp_nr_pages(page);
>   	struct mem_cgroup *memcg = NULL;
> @@ -7451,13 +7526,25 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
>   	if (!memcg)
>   		memcg = get_mem_cgroup_from_mm(mm);
>   
> -	ret = try_charge(memcg, gfp_mask, false, nr_pages);
> +	ret = try_charge(memcg, gfp_mask, false, nr_pages, cache_charge);
>   	if (ret)
>   		goto out_put;
>   
>   	css_get(&memcg->css);
>   	commit_charge(page, memcg);
>   
> +	/*
> +	 * Here we set an extended flag (see page_vzflags.c)
> +	 * on the page which indicates that the page is
> +	 * charged as a "page cache" page.
> +	 *
> +	 * We always clear this flag on uncharging, so the page
> +	 * shouldn't have it set when it is being charged.
> +	 */
> +	BUG_ON(PageVzPageCache(page));
> +	if (cache_charge)
> +		SetVzPagePageCache(page);

"PagePage" - maybe we can reconsider the naming here to remove the 
duplicated "Page" from the name? Though I do not insist.

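For example, instantiating the macro with a shorter name would already give
PageVzCache()/SetVzPageCache()/ClearVzPageCache() without the doubled
"Page" (just an idea):

VZPAGEFLAG(Cache, pagecache)

and the callers would then read PageVzCache(page), SetVzPageCache(page),
ClearVzPageCache(page).
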
> +
>   	local_irq_disable();
>   	mem_cgroup_charge_statistics(memcg, page, nr_pages);
>   	memcg_check_events(memcg, page);
> @@ -7491,11 +7578,34 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
>   	return ret;
>   }
>   
> +/**
> + * mem_cgroup_charge - charge a newly allocated page to a cgroup
> + * @page: page to charge
> + * @mm: mm context of the victim
> + * @gfp_mask: reclaim mode
> + *
> + * Try to charge @page to the memcg that @mm belongs to, reclaiming
> + * pages according to @gfp_mask if necessary.
> + *
> + * Returns 0 on success. Otherwise, an error code is returned.
> + */
> +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +{
> +
> +	return __mem_cgroup_charge(page, mm, gfp_mask, false);
> +}
> +
> +int mem_cgroup_charge_cache(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +{
> +	return __mem_cgroup_charge(page, mm, gfp_mask, true);
> +}
> +
>   struct uncharge_gather {
>   	struct mem_cgroup *memcg;
>   	unsigned long nr_pages;
>   	unsigned long pgpgout;
>   	unsigned long nr_kmem;
> +	unsigned long nr_pgcache;
>   	struct page *dummy_page;
>   };
>   
> @@ -7514,6 +7624,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
>   			page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
>   		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
>   			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
> +		if (ug->nr_pgcache)
> +			page_counter_uncharge(&ug->memcg->cache, ug->nr_pgcache);
> +
>   		memcg_oom_recover(ug->memcg);
>   	}
>   
> @@ -7557,6 +7670,16 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
>   	ug->nr_pages += nr_pages;
>   
>   	if (!PageKmemcg(page)) {
> +		if (PageVzPageCache(page)) {
> +			ug->nr_pgcache += nr_pages;
> +			/*
> +			 * If we are here, the page *will* be
> +			 * uncharged anyway, so we can safely clear
> +			 * the "charged as page cache" flag here.
> +			 */
> +			ClearVzPagePageCache(page);
> +		}
> +
>   		ug->pgpgout++;
>   	} else {
>   		ug->nr_kmem += nr_pages;
> @@ -7672,6 +7795,21 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
>   	if (do_memsw_account())
>   		page_counter_charge(&memcg->memsw, nr_pages);
>   
> +	/*
> +	 * copy_page_vzflags() is called before mem_cgroup_migrate()
> +	 * in migrate_page_states() (mm/migrate.c).
> +	 *
> +	 * Let's check that the flags are consistent:
> +	 * on the one hand, page cache pages are neither
> +	 * anonymous nor swap backed;
> +	 * on the other hand, they must have the
> +	 * PageVzPageCache(page) ext flag set.
> +	 */
> +	WARN_ON((!PageAnon(newpage) && !PageSwapBacked(newpage)) !=
> +		PageVzPageCache(newpage));
> +	if (PageVzPageCache(newpage))
> +		page_counter_charge(&memcg->cache, nr_pages);
> +
>   	css_get(&memcg->css);
>   	commit_charge(newpage, memcg);
>   
> @@ -7753,10 +7891,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
>   
>   	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
>   
> -	if (try_charge(memcg, gfp_mask, false, nr_pages) == 0)
> +	if (try_charge(memcg, gfp_mask, false, nr_pages, false) == 0)
>   		return true;
>   
> -	try_charge(memcg, gfp_mask|__GFP_NOFAIL, false, nr_pages);
> +	try_charge(memcg, gfp_mask|__GFP_NOFAIL, false, nr_pages, false);
>   	return false;
>   }
>   
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

