[Devel] [PATCH vz10 v2] mm/memcontrol: fix cache counter imbalance in mem_cgroup_replace_folio()
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Fri Jul 25 06:24:52 MSK 2025
On 7/25/25 01:42, Konstantin Khorenko wrote:
> The function was checking the cache flag on the new folio instead of the old one,
> causing a cache counter imbalance when replacing cache pages.
> The new folio is freshly allocated and never has the cache flag set, so
> we need to copy the flag from the old folio and charge the cache counter
> accordingly.
>
> This fixes potential cache accounting issues during page cache replacement
> operations like page migration between NUMA nodes.
>
> Another possible cause of the imbalance is the commit_charge() function
> which unconditionally drops MEMCG_DATA_PGCACHE flag on
> folio->memcg_data, so enhance commit_charge() to preserve the flag.
>
> This way of adding the new argument was chosen in order not to miss the
> appearance of new commit_charge() calls on rebases.
>
> Fixes: 6a2ca4d515c5 ("mm: Memory cgroup page cache limit")
> https://virtuozzo.atlassian.net/browse/VSTOR-111756
>
> Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
>
> Feature: mm: Memory cgroup page cache limit
> ---
> v2:
> * fixed 2 other functions where commit_charge() could lose the cache flag
> * fixed mem_cgroup_move_account() which also rewrites the memdata not
> preserving cache flag
> * moved sanity checks under if condition
>
>
> mm/memcontrol-v1.c | 5 +++-
> mm/memcontrol.c | 57 ++++++++++++++++++++++++----------------------
> 2 files changed, 34 insertions(+), 28 deletions(-)
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 756bc518cee3..fd0a563a352b 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -852,7 +852,10 @@ static int mem_cgroup_move_account(struct folio *folio,
>
> /* Warning should never happen, so don't worry about refcount non-0 */
> WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
> - folio->memcg_data = (unsigned long)to;
> + /* Preserve MEMCG_DATA_PGCACHE flag if it was set */
> + WRITE_ONCE(folio->memcg_data,
> + (unsigned long)to |
> + (READ_ONCE(folio->memcg_data) & MEMCG_DATA_PGCACHE));
It looks to me that to support the mem_cgroup_move_account() case we also
need to:
1) add a "moved_cache" field to move_charge_struct (similar to
moved_swap) and handle the uncharge from "from" in __mem_cgroup_clear_mc().
Otherwise we leak the folio charge in from->cache.
2) add a "precharge_cache" field to move_charge_struct, precharge the
cache in mem_cgroup_do_precharge(), and update precharge_cache
accordingly after each cache-related mem_cgroup_move_account(). Otherwise
we never charge to->cache, and on uncharge_folio() we get negative
cache usage in memcg "to".
>
> __folio_memcg_unlock(from);
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d0fe610306a0..831873efc975 100644
The part below looks good.
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2524,7 +2524,8 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
> page_counter_uncharge(&memcg->memsw, nr_pages);
> }
>
> -static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
> +static void commit_charge(struct folio *folio, struct mem_cgroup *memcg,
> + unsigned long cache_flag)
> {
> VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
> /*
> @@ -2536,7 +2537,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
> * - exclusive reference
> * - mem_cgroup_trylock_pages()
> */
> - folio->memcg_data = (unsigned long)memcg;
> + WRITE_ONCE(folio->memcg_data, (unsigned long)memcg | cache_flag);
> }
>
> /**
> @@ -2547,7 +2548,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
> void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
> {
> css_get(&memcg->css);
> - commit_charge(folio, memcg);
> + commit_charge(folio, memcg, 0);
> memcg1_commit_charge(folio, memcg);
> }
>
> @@ -5551,6 +5552,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
> {
> struct mem_cgroup *memcg;
> long nr_pages = folio_nr_pages(new);
> + unsigned long cache_flag;
>
> VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
> VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
> @@ -5575,32 +5577,33 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
> if (do_memsw_account())
> page_counter_charge(&memcg->memsw, nr_pages);
> }
> -
> - /*
> - * finist explained the idea behind adding a WARN_ON() here:
> - * - we do not want to check flags correctness on each flag change
> - * because of performance
> - * - we do want to have a warning in case we somehow messed-up and
> - * have got a folio with wrong bits set
> - * - we do not insist to catch every first buggy page with wrong
> - * bits
> - *
> - * But we still want to get a warning about the problem sooner or
> - * later if the problem with flags exists.
> - *
> - * To achieve this check if a folio that is marked as cache does
> - * not have any other incompatible flags set.
> - */
> - WARN_ON(folio_memcg_cache(new) &&
> - (folio_test_slab(new) || folio_test_anon(new) ||
> - folio_test_swapbacked(new) || folio_test_swapcache(new) ||
> - folio_test_mappedtodisk(new) || folio_test_ksm(new)));
> -
> - if (folio_memcg_cache(new))
> + cache_flag = READ_ONCE(old->memcg_data) & MEMCG_DATA_PGCACHE;
> + if (cache_flag) {
> page_counter_charge(&memcg->cache, nr_pages);
>
> + /*
> + * finist explained the idea behind adding a WARN_ON() here:
> + * - we do not want to check flags correctness on each flag change
> + * because of performance
> + * - we do want to have a warning in case we somehow messed-up and
> + * have got a folio with wrong bits set
> + * - we do not insist to catch every first buggy page with wrong
> + * bits
> + *
> + * But we still want to get a warning about the problem sooner or
> + * later if the problem with flags exists.
> + *
> + * To achieve this check if a folio that is marked as cache does
> + * not have any other incompatible flags set.
> + */
> + WARN_ON(folio_test_slab(new) || folio_test_swapcache(new) ||
> + folio_test_anon(new) || folio_test_swapbacked(new) ||
> + folio_test_ksm(new) || folio_test_mappedtodisk(new));
> + }
> +
> css_get(&memcg->css);
> - commit_charge(new, memcg);
> + commit_charge(new, memcg, cache_flag);
> +
> memcg1_commit_charge(new, memcg);
> }
>
> @@ -5639,7 +5642,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
> return;
>
> /* Transfer the charge and the css ref */
> - commit_charge(new, memcg);
> + commit_charge(new, memcg, READ_ONCE(old->memcg_data) & MEMCG_DATA_PGCACHE);
>
> /* Warning should never happen, so don't worry about refcount non-0 */
> WARN_ON_ONCE(folio_unqueue_deferred_split(old));
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.