[Devel] [PATCH RHEL9 COMMIT] mm/memcontrol: Add page cache limit to cgroup-v2

Konstantin Khorenko khorenko at virtuozzo.com
Wed Mar 27 14:55:00 MSK 2024


The commit is pushed to "branch-rh9-5.14.0-362.18.1.vz9.40.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-362.18.1.vz9.40.3
------>
commit b6447a93cbb07e269a5152d1d845b1ee0558eae5
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Fri Mar 22 12:58:10 2024 +0800

    mm/memcontrol: Add page cache limit to cgroup-v2
    
    The interface is slightly reworked to be more v2 like:
    
     - rename memory.cache.limit/usage_in_bytes -> memory.cache.max/current
     - show "max" when uninitialized and allow writing "max" to it
     - memcg_max_mutex with page_counter_set_max replaced with simple xchg
     - we set limit first before looping and then try to enforce it if
       needed, no more enforce before setting logic
     - retry reclaim a couple of times if it fails to enforce the limit and
       then just give up (memory_max_write triggers oom in this case, but we
       probably do not want to trigger oom due to cache limit)
    
    https://virtuozzo.atlassian.net/browse/PSBM-154207
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    khorenko@ notes:
     - function memcg_update_cache_max() (same as cache_max_write() but for
       cgroup v1) is able to return error codes and this is the difference between
       cgroups v1 and v2 Pasha is talking about: cgroup v2 does not return error
       codes, ever. But in case the memory reclaim fails (for memory, not cache) OOM
       is called. For cache OOM is an overkill, so let's just pretend we have
       performed the reclaim successfully even if we could not reach the desired
       limit.
     - number of reclaim iterations in cache_max_write(): MAX_RECLAIM_RETRIES == 16
       attempts, but this number is for reclaims which returned 0 (zero) pages
       reclaimed. Otherwise the number of loop cycles is unlimited.
       So if someone keeps filling the cache with new data and each reclaim
       attempt reclaims only a few pages, it could take a very long time.
       But the process should be interruptible via a signal.
    
    Feature: mm: Memory cgroup page cache limit
---
 mm/memcontrol.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 506ce3ee4e6a..1722a49855ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -8821,3 +8821,78 @@ static int __init mem_cgroup_swap_init(void)
 subsys_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_SWAP */
+
+static int cache_max_show(struct seq_file *m, void *v)	/* seq_show handler for "cache.max" */
+{
+	return seq_puts_memcg_tunable(m,	/* per the commit message, an unset limit is shown as "max" */
+		READ_ONCE(mem_cgroup_from_seq(m)->cache.max));	/* single READ_ONCE snapshot of the limit */
+}
+
+static ssize_t cache_max_write(struct kernfs_open_file *of,	/* write handler for "cache.max" */
+			       char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;	/* budget of reclaim attempts that free nothing */
+	unsigned long max;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &max);	/* "max" is the keyword accepted besides a number */
+	if (err)
+		return err;	/* a parse failure is the only error this handler returns */
+
+	xchg(&memcg->cache.max, max);	/* set the new limit first, then try to enforce it */
+
+	for (;;) {
+		unsigned long nr_cache = page_counter_read(&memcg->cache);	/* re-read usage on every pass */
+
+		if (nr_cache <= max)	/* usage is within the limit: done */
+			break;
+
+		if (signal_pending(current))	/* let a signal interrupt a long reclaim */
+			break;
+
+		if (!nr_reclaims)	/* retry budget exhausted: give up */
+			break;
+
+		if (!try_to_free_mem_cgroup_pages(memcg, nr_cache - max,	/* ask to reclaim the excess */
+		    GFP_KERNEL, false))
+			nr_reclaims--;	/* only attempts returning 0 consume the budget */
+	}
+
+	memcg_wb_domain_size_changed(memcg);
+	return nbytes;	/* reports success even if usage is still above the limit */
+}
+
+static u64 cache_current_read(struct cgroup_subsys_state *css,	/* read_u64 handler for "cache.current" */
+			       struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return (u64)page_counter_read(&memcg->cache) * PAGE_SIZE;	/* widen to u64 before scaling by PAGE_SIZE */
+}
+
+static struct cftype cache_files[] = {	/* control files registered by mem_cgroup_cache_init() */
+	{
+		.name = "cache.max",	/* per the commit message, exposed as memory.cache.max */
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cache_max_show,
+		.write = cache_max_write,
+	},
+	{
+		.name = "cache.current",	/* per the commit message, exposed as memory.cache.current */
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cache_current_read,	/* no .write handler is set for this file */
+	},
+	{ }	/* terminate */
+};
+
+static int __init mem_cgroup_cache_init(void)	/* adds cache_files via cgroup_add_dfl_cftypes() */
+{
+	if (mem_cgroup_disabled())	/* skip registration when the memory controller is disabled */
+		return 0;
+
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, cache_files));
+	return 0;	/* a registration failure only triggers the WARN_ON above */
+}
+subsys_initcall(mem_cgroup_cache_init);	/* hooked in as a subsys initcall */


More information about the Devel mailing list