[Devel] [PATCH RHEL8 COMMIT] mm/memcg: limit page cache in memcg hack
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Jul 30 20:23:10 MSK 2021
The commit is pushed to "branch-rh8-4.18.0-305.3.1.vz8.7.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-305.3.1.vz8.7.1
------>
commit 6e6b8cae9ba160d0d52caeaa6e8e75c1d54cca8b
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Thu Jul 29 18:20:46 2021 +0300
mm/memcg: limit page cache in memcg hack
Add new memcg file - memory.cache.limit_in_bytes.
Used to limit page cache usage in cgroup.
https://jira.sw.ru/browse/PSBM-77547
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
khorenko@: usecase:
imagine a system service which anon memory you don't want to limit
(in our case it's a vStorage cgroup which hosts CSes and MDSes, they can
consume memory in some range and we don't want to set a limit for max possible
consumption - too high, and we don't know the number of CSes on the node -
admin can add CSes dynamically. And we don't want to dynamically
increase/decrease the limit).
If the cgroup is "unlimited" it produces permanent memory pressure on the node
because it generates a lot of pagecache and other cgroups on the node are
affected (even taking into account the fact of proportional fair reclaim).
=> solution is to limit pagecache only, so this is implemented.
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
(cherry picked from commit da9151c891819733762a178b4efd7e44766fb8b1)
Reworked:
now we have no charge/cancel/commit/uncharge memcg API (we only have charge/uncharge)
=> we have to track pages which was charged as page cache => additional flag was introduced
which implemented using mm/page_ext.c subsystem (see mm/page_vzext.c)
See ms commits:
0d1c2072 ("mm: memcontrol: switch to native NR_FILE_PAGES and NR_SHMEM counters")
3fea5a49 ("mm: memcontrol: convert page cache to a new mem_cgroup_charge() API")
https://jira.sw.ru/browse/PSBM-131957
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
khorenko@:
v2:
1. hunk
===
done_restock:
+ if (cache_charge)
+ page_counter_charge(&memcg->cache, batch);
+
===
is moved to later commit ("mm/memcg: Use per-cpu stock charges for
->cache counter")
2. "cache" field in struct mem_cgroup has been moved out of ifdef
3. copyright added to include/linux/page_vzext.h
---
include/linux/memcontrol.h | 2 +
include/linux/page_vzflags.h | 37 +++++++++
mm/filemap.c | 2 +-
mm/memcontrol.c | 185 +++++++++++++++++++++++++++++++++++++------
4 files changed, 200 insertions(+), 26 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5458f18c0ae7..de2ebbd09143 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -285,6 +285,7 @@ struct mem_cgroup {
struct page_counter kmem; /* v1 only */
struct page_counter tcpmem; /* v1 only */
#endif
+ struct page_counter cache;
/* Range enforcement for interrupt charges */
struct work_struct high_work;
@@ -500,6 +501,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int mem_cgroup_charge_cache(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
diff --git a/include/linux/page_vzflags.h b/include/linux/page_vzflags.h
new file mode 100644
index 000000000000..d98e4ac619a7
--- /dev/null
+++ b/include/linux/page_vzflags.h
@@ -0,0 +1,37 @@
+/*
+ * include/linux/page_vzflags.h
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#ifndef __LINUX_PAGE_VZFLAGS_H
+#define __LINUX_PAGE_VZFLAGS_H
+
+#include <linux/page_vzext.h>
+#include <linux/page-flags.h>
+
+enum vzpageflags {
+ PGVZ_pagecache,
+};
+
+#define TESTVZPAGEFLAG(uname, lname) \
+static __always_inline int PageVz##uname(struct page *page) \
+ { return get_page_vzext(page) && test_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
+
+#define SETVZPAGEFLAG(uname, lname) \
+static __always_inline void SetVzPage##uname(struct page *page) \
+ { if (get_page_vzext(page)) set_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
+
+#define CLEARVZPAGEFLAG(uname, lname) \
+static __always_inline void ClearVzPage##uname(struct page *page) \
+ { if (get_page_vzext(page)) clear_bit(PGVZ_##lname, &get_page_vzext(page)->vzflags); }
+
+#define VZPAGEFLAG(uname, lname) \
+ TESTVZPAGEFLAG(uname, lname) \
+ SETVZPAGEFLAG(uname, lname) \
+ CLEARVZPAGEFLAG(uname, lname)
+
+VZPAGEFLAG(PageCache, pagecache)
+
+#endif /* __LINUX_PAGE_VZFLAGS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index f1a56ac4ec41..3d4e912f0930 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -862,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp);
+ error = mem_cgroup_charge_cache(page, current->mm, gfp);
if (error)
goto error;
charged = true;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4638d9fe1ebf..2e79ede5801b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -42,6 +42,7 @@
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
+#include <linux/page_vzflags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
@@ -225,6 +226,7 @@ enum res_type {
_OOM_TYPE,
_KMEM,
_TCP,
+ _CACHE,
};
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
@@ -2799,7 +2801,7 @@ static bool kmem_reclaim_is_low(struct mem_cgroup *memcg)
}
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge,
- unsigned int nr_pages)
+ unsigned int nr_pages, bool cache_charge)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2817,13 +2819,21 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
may_swap = true;
kmem_limit = false;
if (consume_stock(memcg, nr_pages)) {
- if (!kmem_charge)
- return 0;
- if (page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
- return 0;
- refill_stock(memcg, nr_pages);
+ if (kmem_charge && !page_counter_try_charge(
+ &memcg->kmem, nr_pages, &counter)) {
+ refill_stock(memcg, nr_pages);
+ goto charge;
+ }
+
+ if (cache_charge && !page_counter_try_charge(
+ &memcg->cache, nr_pages, &counter)) {
+ refill_stock(memcg, nr_pages);
+ goto charge;
+ }
+ return 0;
}
+charge:
mem_over_limit = NULL;
if (page_counter_try_charge(&memcg->memory, batch, &counter)) {
if (do_memsw_account() && !page_counter_try_charge(
@@ -2845,6 +2855,19 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
}
}
+ if (!mem_over_limit && cache_charge) {
+ if (page_counter_try_charge(&memcg->cache, nr_pages, &counter))
+ goto done_restock;
+
+ may_swap = false;
+ mem_over_limit = mem_cgroup_from_counter(counter, cache);
+ page_counter_uncharge(&memcg->memory, batch);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, batch);
+ if (kmem_charge)
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ }
+
if (!mem_over_limit)
goto done_restock;
@@ -2970,6 +2993,10 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
+ if (kmem_charge)
+ page_counter_charge(&memcg->kmem, nr_pages);
+ if (cache_charge)
+ page_counter_charge(&memcg->cache, nr_pages);
return 0;
@@ -3200,7 +3227,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
{
int ret;
- ret = try_charge(memcg, gfp, true, nr_pages);
+ ret = try_charge(memcg, gfp, true, nr_pages, false);
if (ret)
return ret;
@@ -3411,7 +3438,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
{
int ret = 0;
- ret = try_charge(memcg, gfp, true, nr_pages);
+ ret = try_charge(memcg, gfp, true, nr_pages, false);
return ret;
}
@@ -3763,6 +3790,8 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
break;
case _TCP:
counter = &memcg->tcpmem;
+ case _CACHE:
+ counter = &memcg->cache;
break;
default:
BUG();
@@ -3985,6 +4014,43 @@ static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
return ret;
}
+static int memcg_update_cache_max(struct mem_cgroup *memcg,
+ unsigned long limit)
+{
+ unsigned long nr_pages;
+ bool enlarge = false;
+ int ret;
+
+ do {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ mutex_lock(&memcg_max_mutex);
+
+ if (limit > memcg->cache.max)
+ enlarge = true;
+
+ ret = page_counter_set_max(&memcg->cache, limit);
+ mutex_unlock(&memcg_max_mutex);
+
+ if (!ret)
+ break;
+
+ nr_pages = max_t(long, 1, page_counter_read(&memcg->cache) - limit);
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ GFP_KERNEL, false)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (1);
+
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
+
+ return ret;
+}
+
/*
* The user of this function is...
* RES_LIMIT.
@@ -4019,6 +4085,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
+ case _CACHE:
+ ret = memcg_update_cache_max(memcg, nr_pages);
break;
}
break;
@@ -4048,6 +4116,8 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
break;
case _TCP:
counter = &memcg->tcpmem;
+ case _CACHE:
+ counter = &memcg->cache;
break;
default:
BUG();
@@ -5689,6 +5759,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "pressure_level",
},
+ {
+ .name = "cache.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_CACHE, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
@@ -6007,17 +6083,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
+ page_counter_init(&memcg->cache, NULL);
} else if (parent->use_hierarchy) {
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+ page_counter_init(&memcg->cache, &parent->cache);
} else {
page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
+ page_counter_init(&memcg->cache, &root_mem_cgroup->cache);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -6146,6 +6225,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->cache, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
@@ -6161,7 +6241,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
int ret;
/* Try a single bulk charge without reclaim first, kswapd may wake */
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, false, count);
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, false,
+ count, false);
if (!ret) {
mc.precharge += count;
return ret;
@@ -6169,7 +6250,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
/* Try charges one by one with reclaim, but do not retry */
while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, false, 1);
+ ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, false, 1, false);
+
if (ret)
return ret;
mc.precharge++;
@@ -7381,18 +7463,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
return MEMCG_PROT_NONE;
}
-/**
- * mem_cgroup_charge - charge a newly allocated page to a cgroup
- * @page: page to charge
- * @mm: mm context of the victim
- * @gfp_mask: reclaim mode
- *
- * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary.
- *
- * Returns 0 on success. Otherwise, an error code is returned.
- */
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+static int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, bool cache_charge)
{
unsigned int nr_pages = thp_nr_pages(page);
struct mem_cgroup *memcg = NULL;
@@ -7427,13 +7499,25 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
- ret = try_charge(memcg, gfp_mask, false, nr_pages);
+ ret = try_charge(memcg, gfp_mask, false, nr_pages, cache_charge);
if (ret)
goto out_put;
css_get(&memcg->css);
commit_charge(page, memcg);
+ /*
+ * Here we set extended flag (see page_vzflags.c)
+ * on page which indicates that page is charged as
+ * a "page cache" page.
+ *
+ * We always cleanup this flag on uncharging, it means
+ * that during charging page we shoudn't have this flag set.
+ */
+ BUG_ON(PageVzPageCache(page));
+ if (cache_charge)
+ SetVzPagePageCache(page);
+
local_irq_disable();
mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
@@ -7467,11 +7551,34 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
return ret;
}
+/**
+ * mem_cgroup_charge - charge a newly allocated page to a cgroup
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp_mask: reclaim mode
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+{
+
+ return __mem_cgroup_charge(page, mm, gfp_mask, false);
+}
+
+int mem_cgroup_charge_cache(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+{
+ return __mem_cgroup_charge(page, mm, gfp_mask, true);
+}
+
struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_pages;
unsigned long pgpgout;
unsigned long nr_kmem;
+ unsigned long nr_pgcache;
struct page *dummy_page;
};
@@ -7490,6 +7597,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ if (ug->nr_pgcache)
+ page_counter_uncharge(&ug->memcg->cache, ug->nr_pgcache);
+
memcg_oom_recover(ug->memcg);
}
@@ -7533,6 +7643,16 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
ug->nr_pages += nr_pages;
if (!PageKmemcg(page)) {
+ if (PageVzPageCache(page)) {
+ ug->nr_pgcache += nr_pages;
+ /*
+ * If we are here, it means that page *will* be
+ * uncharged anyway. We can safely clean
+ * "page is charged as a page cache" flag here.
+ */
+ ClearVzPagePageCache(page);
+ }
+
ug->pgpgout++;
} else {
ug->nr_kmem += nr_pages;
@@ -7648,6 +7768,21 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
+ /*
+ * copy_page_vzflags() called before mem_cgroup_migrate()
+ * in migrate_page_states (mm/migrate.c)
+ *
+ * Let's check that all fine with flags:
+ * from one point of view page cache pages is always
+ * not anonimous and not swap backed;
+ * from another point of view we must have
+ * PageVzPageCache(page) ext flag set.
+ */
+ WARN_ON((!PageAnon(newpage) && !PageSwapBacked(newpage)) !=
+ PageVzPageCache(newpage));
+ if (PageVzPageCache(newpage))
+ page_counter_charge(&memcg->cache, nr_pages);
+
css_get(&memcg->css);
commit_charge(newpage, memcg);
@@ -7729,10 +7864,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
- if (try_charge(memcg, gfp_mask, false, nr_pages) == 0)
+ if (try_charge(memcg, gfp_mask, false, nr_pages, false) == 0)
return true;
- try_charge(memcg, gfp_mask|__GFP_NOFAIL, false, nr_pages);
+ try_charge(memcg, gfp_mask|__GFP_NOFAIL, false, nr_pages, false);
return false;
}
More information about the Devel
mailing list