[Devel] [PATCH RHEL9 COMMIT] oracle/mm: use padata for copying page ranges in vma_dup()

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jan 23 23:35:48 MSK 2025


The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.5
------>
commit c596f82fac95faf0af5f74d4ccd7c597d179f94e
Author: Anthony Yznaga <anthony.yznaga at oracle.com>
Date:   Wed Dec 7 09:50:29 2022 -0800

    oracle/mm: use padata for copying page ranges in vma_dup()
    
    When a VMA marked for preservation via MADV_DOEXEC is copied to a new mm
    during exec, its pagetable entries are copied using copy_page_range().
    The time to complete the copy increases linearly with size and becomes
    excessive when preserving memory for very large VMs. Use padata to speed
    up the copying by parallelizing the work.
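    
    As a minimal sketch of the flow being optimized (assuming the MADV_DOEXEC
    flag and semantics from this out-of-tree series; the constant is not in
    upstream <sys/mman.h>, and the helper below is purely illustrative):
    
        #include <string.h>
        #include <sys/mman.h>
        #include <unistd.h>
        
        /*
         * Hypothetical helper: map and touch a large anonymous range,
         * preserve it, then exec. Error handling omitted for brevity.
         */
        static void preserve_and_exec(size_t len, char *const argv[],
                                      char *const envp[])
        {
                char *p;
        
                /* Map and fault in a large anonymous range. */
                p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                memset(p, 1, len);
        
                /* Series-specific: mark the VMA to be kept across exec. */
                madvise(p, len, MADV_DOEXEC);
        
                /*
                 * During exec the kernel copies the preserved VMA's page
                 * table entries into the new mm; that copy is what this
                 * patch parallelizes with padata.
                 */
                execve(argv[0], argv, envp);
        }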
    
    Performance results for this patch series:
    
        System:           X6-2
        CPU:              2 nodes * 10 cores/node * 2 threads/core = 40 CPUs
                          Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
        Memory:           251G split evenly between nodes
    
        Test:             Time to exec measured in ms/GB.
    
                          Exec after mmap'ing and touching a 200GB range
                          of anon memory and then preserving it with
                          MADV_DOEXEC:
    
                           kernel     speedup    avg ms/GB
                         --------    --------    ---------
                         baseline                     17.4
                           padata        7.2x          2.4
    
                          Exec after mmap'ing and touching a 200GB range
                          of shared memory backed by shmem.
    
                           kernel     speedup    avg ms/GB
                         --------    --------    ---------
                         baseline                     21.7
                           padata          7x          3.1
    
    Orabug: 35054621
    Signed-off-by: Anthony Yznaga <anthony.yznaga at oracle.com>
    Reviewed-by: Daniel Jordan <daniel.m.jordan at oracle.com>
    
    https://virtuozzo.atlassian.net/browse/VSTOR-96305
    
    (cherry picked from Oracle commit 800339ff06da9ffcc0e26fb13a43513792a6aa5e)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    Feature: oracle/mm: MADV_DOEXEC madvise() flag
---
 include/linux/mm.h |  4 ++++
 mm/memory.c        | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/mmap.c          |  5 ++++-
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2ce7bded6da..9b1ceb0db308 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1958,6 +1958,10 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+#ifdef CONFIG_PADATA
+int copy_page_range_mt(struct vm_area_struct *dst_vma,
+			struct vm_area_struct *src_vma);
+#endif
 int follow_pte(struct mm_struct *mm, unsigned long address,
 	       pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 872de67ca2d7..b431a43b68fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1325,6 +1325,66 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 	return ret;
 }
 
+#ifdef CONFIG_PADATA
+
+struct copy_page_range_args {
+	struct vm_area_struct *dst_vma;
+	struct vm_area_struct *src_vma;
+};
+
+static int copy_page_range_chunk(unsigned long addr,
+				 unsigned long end, void *arg)
+{
+	struct copy_page_range_args *args = arg;
+	struct vm_area_struct *dst_vma = args->dst_vma;
+	struct vm_area_struct *src_vma = args->src_vma;
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = src_vma->vm_mm;
+	pgd_t *src_pgd, *dst_pgd;
+	unsigned long next;
+	int ret = 0;
+
+	dst_pgd = pgd_offset(dst_mm, addr);
+	src_pgd = pgd_offset(src_mm, addr);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(src_pgd))
+			continue;
+		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+					    addr, next))) {
+			ret = -ENOMEM;
+			break;
+		}
+	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+	return ret;
+}
+
+/*
+ * A stripped down version of copy_page_range() used to copy a VMA as part
+ * of preserving it across exec. Multithreading via padata is used to speed
+ * up the copying of very large VMAs.
+ */
+int copy_page_range_mt(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+{
+	struct copy_page_range_args args = { dst_vma, src_vma };
+	struct padata_mt_job job = {
+		.thread_fn   = copy_page_range_chunk,
+		.fn_arg      = &args,
+		.start       = src_vma->vm_start,
+		.size        = src_vma->vm_end - src_vma->vm_start,
+		.align       = PMD_SIZE,
+		.min_chunk   = max(1ul << 27, PMD_SIZE),
+		.max_threads = 16,
+	};
+
+	BUG_ON(!(src_vma->vm_flags & VM_EXEC_KEEP));
+
+	return padata_do_multithreaded(&job);
+}
+#endif /* CONFIG_PADATA */
+
 /* Whether we should zap all COWed (private) pages too */
 static inline bool should_zap_cows(struct zap_details *details)
 {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9bb2382d9101..04b769eb27a4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3318,8 +3318,11 @@ int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm)
 	 */
 	old_vma->vm_flags &= ~VM_ACCOUNT;
 
+#ifdef CONFIG_PADATA
+	ret = copy_page_range_mt(vma, old_vma);
+#else
 	ret = copy_page_range(vma, old_vma);
-
+#endif
 	vma->vm_flags &= ~VM_EXEC_KEEP;
 
 	return ret;
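
For reference, a rough sketch of how the padata job parameters above divide the
work, assuming x86-64 with 4 KiB pages (so PMD_SIZE = 2 MiB):

    min_chunk = max(1UL << 27, PMD_SIZE) = max(128 MiB, 2 MiB) = 128 MiB

so no worker thread is handed less than 128 MiB of virtual address range, chunk
boundaries fall on PMD_SIZE (apart from the VMA's own start and end) so that a
single PMD-level entry, e.g. a transparent huge page mapping, stays within one
worker's range, and at most 16 threads copy a single VMA. Each worker runs
copy_page_range_chunk() on its slice; returning an error code through
padata_do_multithreaded() assumes the error-propagating padata interface used
by this series (upstream padata thread functions historically return void).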

