[Devel] [PATCH RH8 2/5] mm/memcg: limit page cache in memcg hack
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Fri Jul 30 15:49:32 MSK 2021
On 29.07.2021 18:20, Alexander Mikhalitsyn wrote:
> From: Andrey Ryabinin <aryabinin at virtuozzo.com>
>
> Add new memcg file - memory.cache.limit_in_bytes.
> Used to limit page cache usage in cgroup.
>
> https://jira.sw.ru/browse/PSBM-77547
>
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
>
> khorenko@: usecase:
> imagine a system service whose anon memory you don't want to limit
> (in our case it's a vStorage cgroup which hosts CSes and MDSes; they can
> consume memory in some range and we don't want to set a limit for the max
> possible consumption - that would be too high, and we don't know the number
> of CSes on the node - the admin can add CSes dynamically. And we don't want
> to dynamically increase/decrease the limit).
>
> If the cgroup is "unlimited", it produces permanent memory pressure on the
> node, because it generates a lot of pagecache and other cgroups on the node
> are affected (even taking proportional fair reclaim into account).
>
> => the solution is to limit pagecache only, and that is what this patch implements.
>
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
> (cherry picked from commit da9151c891819733762a178b4efd7e44766fb8b1)
>
> Reworked:
> now we have no charge/cancel/commit/uncharge memcg API (we only have charge/uncharge)
> => we have to track pages which were charged as page cache => an additional flag was
> introduced, implemented using the mm/page_ext.c subsystem (see mm/page_vzext.c)
>
> See ms commits:
> 0d1c2072 ("mm: memcontrol: switch to native NR_FILE_PAGES and NR_SHMEM counters")
> 3fea5a49 ("mm: memcontrol: convert page cache to a new mem_cgroup_charge() API")
>
> https://jira.sw.ru/browse/PSBM-131957
>
> Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
> ---
> include/linux/memcontrol.h | 3 +
> include/linux/page_vzflags.h | 31 ++++++
> mm/filemap.c | 2 +-
> mm/memcontrol.c | 188 ++++++++++++++++++++++++++++++-----
> 4 files changed, 198 insertions(+), 26 deletions(-)
> create mode 100644 include/linux/page_vzflags.h
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index fdf93a2de456..2e053bd43050 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -285,6 +285,7 @@ struct mem_cgroup {
> struct page_counter kmem;
> struct page_counter tcpmem;
> RH_KABI_DEPRECATE(unsigned long, high)
> + struct page_counter cache;
> #else
> union {
> struct page_counter swap; /* v2 only */
> @@ -294,6 +295,7 @@ struct mem_cgroup {
> /* Legacy consumer-oriented counters */
> struct page_counter kmem; /* v1 only */
> struct page_counter tcpmem; /* v1 only */
> + struct page_counter cache;
> #endif
Moving "cache" after endif should not change anything, but would save us
from duplicating lines.
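I.e. something along these lines (just a sketch of the placement, comment
wording is mine):

	/* Legacy consumer-oriented counters */
	struct page_counter kmem;		/* v1 only */
	struct page_counter tcpmem;		/* v1 only */
#endif
	/* shared by both config branches above */
	struct page_counter cache;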
>
> /* Range enforcement for interrupt charges */
> @@ -510,6 +512,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> struct mem_cgroup *memcg);
>
> int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
> +int mem_cgroup_charge_cache(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
We should probably also define it for the !CONFIG_MEMCG case.
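I.e. a no-op static inline stub next to the existing mem_cgroup_charge()
stub, something like:

static inline int mem_cgroup_charge_cache(struct page *page,
					  struct mm_struct *mm,
					  gfp_t gfp_mask)
{
	return 0;
}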
>
> void mem_cgroup_uncharge(struct page *page);
> void mem_cgroup_uncharge_list(struct list_head *page_list);
> diff --git a/include/linux/page_vzflags.h b/include/linux/page_vzflags.h
> new file mode 100644
> index 000000000000..64a06ba8c9f6
> --- /dev/null
> +++ b/include/linux/page_vzflags.h
> @@ -0,0 +1,31 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __LINUX_PAGE_VZFLAGS_H
> +#define __LINUX_PAGE_VZFLAGS_H
> +
> +#include <linux/page_vzext.h>
> +#include <linux/page-flags.h>
> +
> +enum vzpageflags {
> + PGVZ_pagecache,
> +};
> +
> +#define TESTVZPAGEFLAG(uname, lname) \
> +static __always_inline int PageVz##uname(struct page *page) \
> + { return get_page_vzext(page) && test_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
> +
> +#define SETVZPAGEFLAG(uname, lname) \
> +static __always_inline void SetVzPage##uname(struct page *page) \
> + { if (get_page_vzext(page)) set_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
> +
> +#define CLEARVZPAGEFLAG(uname, lname) \
> +static __always_inline void ClearVzPage##uname(struct page *page) \
> + { if (get_page_vzext(page)) clear_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
> +
> +#define VZPAGEFLAG(uname, lname) \
> + TESTVZPAGEFLAG(uname, lname) \
> + SETVZPAGEFLAG(uname, lname) \
> + CLEARVZPAGEFLAG(uname, lname)
> +
> +VZPAGEFLAG(PageCache, pagecache)
> +
> +#endif /* __LINUX_PAGE_VZFLAGS_H */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f1a56ac4ec41..3d4e912f0930 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -862,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page,
> page->index = offset;
>
> if (!huge) {
> - error = mem_cgroup_charge(page, current->mm, gfp);
> + error = mem_cgroup_charge_cache(page, current->mm, gfp);
> if (error)
> goto error;
> charged = true;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d3aa1cf20796..ac0ece68a17a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -42,6 +42,7 @@
> #include <linux/vm_event_item.h>
> #include <linux/smp.h>
> #include <linux/page-flags.h>
> +#include <linux/page_vzflags.h>
> #include <linux/backing-dev.h>
> #include <linux/bit_spinlock.h>
> #include <linux/rcupdate.h>
> @@ -225,6 +226,7 @@ enum res_type {
> _OOM_TYPE,
> _KMEM,
> _TCP,
> + _CACHE,
> };
>
> #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
> @@ -2799,7 +2801,7 @@ static bool kmem_reclaim_is_low(struct mem_cgroup *memcg)
> }
>
> static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge,
> - unsigned int nr_pages)
> + unsigned int nr_pages, bool cache_charge)
> {
> unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> int nr_retries = MAX_RECLAIM_RETRIES;
> @@ -2817,13 +2819,21 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
> may_swap = true;
> kmem_limit = false;
> if (consume_stock(memcg, nr_pages)) {
> - if (!kmem_charge)
> - return 0;
> - if (page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
> - return 0;
> - refill_stock(memcg, nr_pages);
> + if (kmem_charge && !page_counter_try_charge(
> + &memcg->kmem, nr_pages, &counter)) {
> + refill_stock(memcg, nr_pages);
> + goto charge;
> + }
> +
> + if (cache_charge && !page_counter_try_charge(
> + &memcg->cache, nr_pages, &counter)) {
> + refill_stock(memcg, nr_pages);
> + goto charge;
> + }
> + return 0;
> }
>
> +charge:
> mem_over_limit = NULL;
> if (page_counter_try_charge(&memcg->memory, batch, &counter)) {
> if (do_memsw_account() && !page_counter_try_charge(
> @@ -2845,6 +2855,19 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
> }
> }
>
> + if (!mem_over_limit && cache_charge) {
> + if (page_counter_try_charge(&memcg->cache, nr_pages, &counter))
> + goto done_restock;
> +
> + may_swap = false;
> + mem_over_limit = mem_cgroup_from_counter(counter, cache);
> + page_counter_uncharge(&memcg->memory, batch);
> + if (do_memsw_account())
> + page_counter_uncharge(&memcg->memsw, batch);
> + if (kmem_charge)
> + page_counter_uncharge(&memcg->kmem, nr_pages);
> + }
> +
> if (!mem_over_limit)
> goto done_restock;
>
> @@ -2970,10 +2993,17 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
> page_counter_charge(&memcg->memory, nr_pages);
> if (do_memsw_account())
> page_counter_charge(&memcg->memsw, nr_pages);
> + if (kmem_charge)
> + page_counter_charge(&memcg->kmem, nr_pages);
> + if (cache_charge)
> + page_counter_charge(&memcg->cache, nr_pages);
>
> return 0;
>
> done_restock:
> + if (cache_charge)
> + page_counter_charge(&memcg->cache, batch);
> +
Not sure, but it seems like a hunk from "mm/memcg: reclaim
memory.cache.limit_in_bytes from background", as if you've merged part of
the later patch into this earlier one. Please check that it's intended.
> if (batch > nr_pages)
> refill_stock(memcg, batch - nr_pages);
>
> @@ -3200,7 +3230,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
> {
> int ret;
>
> - ret = try_charge(memcg, gfp, true, nr_pages);
> + ret = try_charge(memcg, gfp, true, nr_pages, false);
> if (ret)
> return ret;
>
> @@ -3411,7 +3441,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
> {
> int ret = 0;
>
> - ret = try_charge(memcg, gfp, true, nr_pages);
> + ret = try_charge(memcg, gfp, true, nr_pages, false);
> return ret;
> }
>
> @@ -3763,6 +3793,8 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
> break;
> case _TCP:
> counter = &memcg->tcpmem;
> + case _CACHE:
> + counter = &memcg->cache;
> break;
> default:
> BUG();
> @@ -3985,6 +4017,43 @@ static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
> return ret;
> }
>
> +static int memcg_update_cache_max(struct mem_cgroup *memcg,
> + unsigned long limit)
> +{
> + unsigned long nr_pages;
> + bool enlarge = false;
> + int ret;
> +
> + do {
> + if (signal_pending(current)) {
> + ret = -EINTR;
> + break;
> + }
> + mutex_lock(&memcg_max_mutex);
> +
> + if (limit > memcg->cache.max)
> + enlarge = true;
> +
> + ret = page_counter_set_max(&memcg->cache, limit);
> + mutex_unlock(&memcg_max_mutex);
> +
> + if (!ret)
> + break;
> +
> + nr_pages = max_t(long, 1, page_counter_read(&memcg->cache) - limit);
> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
> + GFP_KERNEL, false)) {
> + ret = -EBUSY;
> + break;
> + }
> + } while (1);
> +
> + if (!ret && enlarge)
> + memcg_oom_recover(memcg);
> +
> + return ret;
> +}
> +
> /*
> * The user of this function is...
> * RES_LIMIT.
> @@ -4019,6 +4088,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
> break;
> case _TCP:
> ret = memcg_update_tcp_max(memcg, nr_pages);
> + case _CACHE:
> + ret = memcg_update_cache_max(memcg, nr_pages);
> break;
> }
> break;
> @@ -4048,6 +4119,8 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
> break;
> case _TCP:
> counter = &memcg->tcpmem;
> + case _CACHE:
> + counter = &memcg->cache;
> break;
> default:
> BUG();
> @@ -5713,6 +5786,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
> {
> .name = "pressure_level",
> },
> + {
> + .name = "cache.limit_in_bytes",
> + .private = MEMFILE_PRIVATE(_CACHE, RES_LIMIT),
> + .write = mem_cgroup_write,
> + .read_u64 = mem_cgroup_read_u64,
> + },
> #ifdef CONFIG_NUMA
> {
> .name = "numa_stat",
> @@ -6031,17 +6110,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> page_counter_init(&memcg->swap, NULL);
> page_counter_init(&memcg->kmem, NULL);
> page_counter_init(&memcg->tcpmem, NULL);
> + page_counter_init(&memcg->cache, NULL);
> } else if (parent->use_hierarchy) {
> memcg->use_hierarchy = true;
> page_counter_init(&memcg->memory, &parent->memory);
> page_counter_init(&memcg->swap, &parent->swap);
> page_counter_init(&memcg->kmem, &parent->kmem);
> page_counter_init(&memcg->tcpmem, &parent->tcpmem);
> + page_counter_init(&memcg->cache, &parent->cache);
> } else {
> page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
> page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
> page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
> page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
> + page_counter_init(&memcg->cache, &root_mem_cgroup->cache);
> /*
> * Deeper hierachy with use_hierarchy == false doesn't make
> * much sense so let cgroup subsystem know about this
> @@ -6170,6 +6252,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
> page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
> page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
> page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
> + page_counter_set_max(&memcg->cache, PAGE_COUNTER_MAX);
> page_counter_set_min(&memcg->memory, 0);
> page_counter_set_low(&memcg->memory, 0);
> page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
> @@ -6185,7 +6268,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
> int ret;
>
> /* Try a single bulk charge without reclaim first, kswapd may wake */
> - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, false, count);
> + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, false,
> + count, false);
> if (!ret) {
> mc.precharge += count;
> return ret;
> @@ -6193,7 +6277,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
>
> /* Try charges one by one with reclaim, but do not retry */
> while (count--) {
> - ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, false, 1);
> + ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, false, 1, false);
> +
> if (ret)
> return ret;
> mc.precharge++;
> @@ -7405,18 +7490,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> return MEMCG_PROT_NONE;
> }
>
> -/**
> - * mem_cgroup_charge - charge a newly allocated page to a cgroup
> - * @page: page to charge
> - * @mm: mm context of the victim
> - * @gfp_mask: reclaim mode
> - *
> - * Try to charge @page to the memcg that @mm belongs to, reclaiming
> - * pages according to @gfp_mask if necessary.
> - *
> - * Returns 0 on success. Otherwise, an error code is returned.
> - */
> -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +static int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
> + gfp_t gfp_mask, bool cache_charge)
> {
> unsigned int nr_pages = thp_nr_pages(page);
> struct mem_cgroup *memcg = NULL;
> @@ -7451,13 +7526,25 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> if (!memcg)
> memcg = get_mem_cgroup_from_mm(mm);
>
> - ret = try_charge(memcg, gfp_mask, false, nr_pages);
> + ret = try_charge(memcg, gfp_mask, false, nr_pages, cache_charge);
> if (ret)
> goto out_put;
>
> css_get(&memcg->css);
> commit_charge(page, memcg);
>
> + /*
> + * Here we set extended flag (see page_vzflags.c)
> + * on page which indicates that page is charged as
> + * a "page cache" page.
> + *
> + * We always clean up this flag on uncharging, which means
> + * that while charging a page we shouldn't have this flag set.
> + */
> + BUG_ON(PageVzPageCache(page));
> + if (cache_charge)
> + SetVzPagePageCache(page);
"PagePage" - maybe we can reconsider naming here to remove dublicated
"Page" from name? Though I do not insist.
> +
> local_irq_disable();
> mem_cgroup_charge_statistics(memcg, page, nr_pages);
> memcg_check_events(memcg, page);
> @@ -7491,11 +7578,34 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> return ret;
> }
>
> +/**
> + * mem_cgroup_charge - charge a newly allocated page to a cgroup
> + * @page: page to charge
> + * @mm: mm context of the victim
> + * @gfp_mask: reclaim mode
> + *
> + * Try to charge @page to the memcg that @mm belongs to, reclaiming
> + * pages according to @gfp_mask if necessary.
> + *
> + * Returns 0 on success. Otherwise, an error code is returned.
> + */
> +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +{
> +
> + return __mem_cgroup_charge(page, mm, gfp_mask, false);
> +}
> +
> +int mem_cgroup_charge_cache(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +{
> + return __mem_cgroup_charge(page, mm, gfp_mask, true);
> +}
> +
> struct uncharge_gather {
> struct mem_cgroup *memcg;
> unsigned long nr_pages;
> unsigned long pgpgout;
> unsigned long nr_kmem;
> + unsigned long nr_pgcache;
> struct page *dummy_page;
> };
>
> @@ -7514,6 +7624,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
> if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
> page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
> + if (ug->nr_pgcache)
> + page_counter_uncharge(&ug->memcg->cache, ug->nr_pgcache);
> +
> memcg_oom_recover(ug->memcg);
> }
>
> @@ -7557,6 +7670,16 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> ug->nr_pages += nr_pages;
>
> if (!PageKmemcg(page)) {
> + if (PageVzPageCache(page)) {
> + ug->nr_pgcache += nr_pages;
> + /*
> + * If we are here, it means that page *will* be
> + * uncharged anyway. We can safely clean
> + * "page is charged as a page cache" flag here.
> + */
> + ClearVzPagePageCache(page);
> + }
> +
> ug->pgpgout++;
> } else {
> ug->nr_kmem += nr_pages;
> @@ -7672,6 +7795,21 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
> if (do_memsw_account())
> page_counter_charge(&memcg->memsw, nr_pages);
>
> + /*
> + * copy_page_vzflags() called before mem_cgroup_migrate()
> + * in migrate_page_states (mm/migrate.c)
> + *
> + * Let's check that everything is fine with the flags:
> + * from one point of view, a page cache page is always
> + * neither anonymous nor swap backed;
> + * from another point of view, we must have the
> + * PageVzPageCache(page) ext flag set.
> + */
> + WARN_ON((!PageAnon(newpage) && !PageSwapBacked(newpage)) !=
> + PageVzPageCache(newpage));
> + if (PageVzPageCache(newpage))
> + page_counter_charge(&memcg->cache, nr_pages);
> +
> css_get(&memcg->css);
> commit_charge(newpage, memcg);
>
> @@ -7753,10 +7891,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
>
> mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
>
> - if (try_charge(memcg, gfp_mask, false, nr_pages) == 0)
> + if (try_charge(memcg, gfp_mask, false, nr_pages, false) == 0)
> return true;
>
> - try_charge(memcg, gfp_mask|__GFP_NOFAIL, false, nr_pages);
> + try_charge(memcg, gfp_mask|__GFP_NOFAIL, false, nr_pages, false);
> return false;
> }
>
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.