[Devel] [PATCH VZ9] fs/fuse kio: Refactor pcs_mr to avoid large contiguous memory allocation

Konstantin Khorenko khorenko at virtuozzo.com
Mon Aug 4 12:00:53 MSK 2025


I've already committed the patch, but it looks like I was too fast. Please verify my findings, see below.


On 01.08.2025 10:54, Liu Kui wrote:
> Fix page allocation failure in kcalloc by converting the large 1-dimensional
> array into a 2-dimensional array of order-0 (4KB) pages. This eliminates the
> need for non-order-0 allocations, improving allocation reliability under memory
> pressure. The chunked layout remains transparent to callers via the
> pcs_umem_page() lookup helper, maintaining functional equivalence.
> 
> Related to #VSTOR-112413
> https://virtuozzo.atlassian.net/browse/VSTOR-112413
> 
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
>   fs/fuse/kio/pcs/pcs_mr.c | 55 +++++++++++++++++++++++++---------------
>   fs/fuse/kio/pcs/pcs_mr.h | 21 ++++++++++-----
>   2 files changed, 50 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/fuse/kio/pcs/pcs_mr.c b/fs/fuse/kio/pcs/pcs_mr.c
> index cbd3b440dd1b..8e1dbc1121ce 100644
> --- a/fs/fuse/kio/pcs/pcs_mr.c
> +++ b/fs/fuse/kio/pcs/pcs_mr.c
> @@ -13,11 +13,18 @@
>   void pcs_umem_release(struct pcs_umem *umem)
>   {
>   	struct mm_struct *mm_s = umem->mm;
> +	int i, npages = umem->npages;
>   
> -	unpin_user_pages(umem->pages, umem->npages);
> +	for (i = 0; npages; i++) {
> +		int to_free = min_t(int, PCS_PAGES_PER_CHUNK, npages);
> +
> +		unpin_user_pages(umem->page_chunk[i].pages, to_free);
> +		kfree(umem->page_chunk[i].pages);
> +		npages -= to_free;
> +	}
>   	atomic64_sub(umem->npages, &mm_s->pinned_vm);
>   	mmdrop(mm_s);
> -	kfree(umem->pages);
> +	kfree(umem->page_chunk);
>   	kfree(umem);
>   }
>   
> @@ -27,14 +34,13 @@ void pcs_umem_release(struct pcs_umem *umem)
>   struct pcs_umem *pcs_umem_get(u64 start, u64 len)
>   {
>   	struct pcs_umem *umem = NULL;
> -	struct page **pages;
> -	int npages;
> -	u64 fp_va;
>   	struct mm_struct *mm_s;
> -	int got, ret;
> +	u64 fp_va;
> +	int npages, nchunks, i, ret;
>   
>   	fp_va = start & PAGE_MASK;
>   	npages = PAGE_ALIGN(start + len - fp_va) >> PAGE_SHIFT;
> +	nchunks = (npages >> PCS_PAGE_CHUNK_SHIFT) + 1;

This always adds 1, so when npages is exactly divisible by PCS_PAGES_PER_CHUNK, one
extra chunk is allocated.

nchunks = DIV_ROUND_UP(npages, PCS_PAGES_PER_CHUNK);
?
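
E.g. with PCS_PAGES_PER_CHUNK == 512:

	npages == 1024:  (1024 >> 9) + 1 == 3   <- one chunk over-allocated
	                 DIV_ROUND_UP(1024, 512) == 2
	npages == 1000:  (1000 >> 9) + 1 == 2   <- happens to match
	                 DIV_ROUND_UP(1000, 512) == 2

(DIV_ROUND_UP() is the standard kernel helper from include/linux/math.h.)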

>   
>   	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
>   	if (!umem)
> @@ -48,25 +54,34 @@ struct pcs_umem *pcs_umem_get(u64 start, u64 len)
>   	mmap_read_lock(mm_s);
>   
>   	umem->fp_addr = fp_va;
> -	umem->pages = kcalloc(npages, sizeof(struct page *), GFP_KERNEL);
> -	if (!umem->pages) {
> +	umem->page_chunk = kcalloc(nchunks, sizeof(struct pcs_page_chunk *), GFP_KERNEL);
                                                                        ^^^^^
umem->page_chunk = kcalloc(nchunks, sizeof(struct pcs_page_chunk), GFP_KERNEL);
?
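
To be fair, the two sizes coincide today: struct pcs_page_chunk holds a single
pointer, so sizeof(struct pcs_page_chunk) == sizeof(struct pcs_page_chunk *)
and nothing is actually under-allocated yet. The bug becomes real the moment
the struct grows, e.g. on a 64-bit build (hypothetical field, just to illustrate):

	struct pcs_page_chunk {
		struct page **pages;
		int npinned;	/* hypothetical new field */
	};

	/* sizeof(struct pcs_page_chunk) is now 16, but the kcalloc() above
	 * still allocates nchunks * 8 bytes, so the init loop writes past
	 * the end of the allocation. */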

> +	if (!umem->page_chunk) {
>   		ret = -ENOMEM;
>   		goto out_err;
>   	}
>   
> -	got = 0;
> -	while (npages) {
> -		pages = &umem->pages[got];
> -		ret = pin_user_pages(fp_va, npages, FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
> -		if (ret < 0)
> -			goto out_err;
> +	for (i = 0; npages; i++) {
> +		int n = min_t(int, npages, PCS_PAGES_PER_CHUNK);
> +		struct page **pages = kcalloc(n, sizeof(struct page *), GFP_KERNEL);
>   
> -		WARN_ON(ret == 0);
> -		umem->npages += ret;
> -		atomic64_add(ret, &mm_s->pinned_vm);
> -		fp_va += ret * PAGE_SIZE;
> -		npages -= ret;
> -		got += ret;
> +		if (!pages) {
> +			ret = -ENOMEM;
> +			goto out_err;
> +		}
> +		umem->page_chunk[i].pages = pages;
> +
> +		while (n) {
> +			ret = pin_user_pages(fp_va, n, FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
> +			if (ret < 0)
> +				goto out_err;
> +
> +			atomic64_add(ret, &mm_s->pinned_vm);
> +			umem->npages += ret;
> +			fp_va += ret * PAGE_SIZE;
> +			pages += ret;
> +			n -= ret;
> +			npages -= ret;
> +		}
>   	}
>   	mmap_read_unlock(mm_s);
>   
> diff --git a/fs/fuse/kio/pcs/pcs_mr.h b/fs/fuse/kio/pcs/pcs_mr.h
> index dae9931d9967..64f237f57dec 100644
> --- a/fs/fuse/kio/pcs/pcs_mr.h
> +++ b/fs/fuse/kio/pcs/pcs_mr.h
> @@ -11,6 +11,8 @@
>   struct pcs_umem;
>   
>   #define PCS_MAX_MR		0x10000
> +#define PCS_PAGE_CHUNK_SHIFT 9
> +#define PCS_PAGES_PER_CHUNK (1 << PCS_PAGE_CHUNK_SHIFT)
>   
>   struct pcs_mr_set {
>   	struct xarray	mr_xa;	/* array of registered MRs*/
> @@ -18,10 +20,15 @@ struct pcs_mr_set {
>   	atomic_t	mr_num;		/* number of registered MRs*/
>   };
>   
> +struct pcs_page_chunk
> +{
> +	struct page **pages; /* array of pinned pages */
> +};
> +
>   struct pcs_umem {
> -	u64	fp_addr;	/* First page base address */
> -	int	npages;		/* number of pinned pages */
> -	struct page	**pages;	/* array of pinned pages */
> +	struct pcs_page_chunk *page_chunk;
> +	int npages; /* number of pinned pages */
> +	u64 fp_addr; /* First page base address */
>   	struct mm_struct *mm;	/* mm the memory belongs to */
>   };
>   
> @@ -40,10 +47,12 @@ struct pcs_mr {
>    */
>   static inline struct page *pcs_umem_page(struct pcs_umem *umem, u64 addr)
>   {
> -	unsigned int idx = (addr - umem->fp_addr) >> PAGE_SHIFT;
> +	unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT;
> +	unsigned int chunk_idx = page_idx >> PCS_PAGE_CHUNK_SHIFT;
> +	unsigned int page_in_chunk = page_idx & (PCS_PAGES_PER_CHUNK - 1);
>   
> -	if (likely(idx < umem->npages))
> -		return umem->pages[idx];
> +	if (likely(page_idx < umem->npages))
> +		return umem->page_chunk[chunk_idx].pages[page_in_chunk];
>   	return NULL;
>   }
>   
> -- 
> 2.39.5 (Apple Git-154)
> 
