[Devel] [PATCH RHEL9 COMMIT] oracle/mm: introduce VM_EXEC_KEEP

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jan 23 23:35:46 MSK 2025


The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.5
------>
commit 21ca20fe80817f1575ce1805b2074625a73a1e27
Author: Steve Sistare <steven.sistare at oracle.com>
Date:   Tue Oct 27 16:46:45 2020 -0700

    oracle/mm: introduce VM_EXEC_KEEP
    
    A vma with the VM_EXEC_KEEP flag is preserved across exec.  The flag
    is supported for anonymous vmas only and is cleared automatically after
    exec and for forked child processes.  For safety, overlap with fixed
    address VMAs created in the new mm during exec (e.g. the stack and elf
    load segments) is not permitted and will cause the exec to fail.  The
    possibility of a collision can be made extremely small by only applying
    the flag to VMAs representing ranges mapped with system-selected
    addresses in the mmap region.
    
    Orabug: 32387875
    Signed-off-by: Steve Sistare <steven.sistare at oracle.com>
    Signed-off-by: Anthony Yznaga <anthony.yznaga at oracle.com>
    Reviewed-by: Liam R. Howlett <Liam.Howlett at Oracle.com>
    
    https://virtuozzo.atlassian.net/browse/VSTOR-96305
    
    (cherry picked from Oracle commit 908e35f55cfe67a1ef40712b81dc357468c3b731)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    Feature: oracle/mm: MADV_DOEXEC madvise() flag
---
 fs/exec.c          | 20 +++++++++++++++++
 include/linux/mm.h |  9 ++++++++
 kernel/fork.c      |  2 +-
 mm/mmap.c          | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index ac71b84baf72..c03f9d3aa347 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -990,6 +990,20 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 EXPORT_SYMBOL(read_code);
 #endif
 
+static int vma_dup_some(struct mm_struct *old_mm, struct mm_struct *new_mm)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	for (vma = old_mm->mmap; vma; vma = vma->vm_next)
+		if (vma->vm_flags & VM_EXEC_KEEP) {
+			ret = vma_dup(vma, new_mm);
+			if (ret)
+				return ret;
+		}
+	return 0;
+}
+
 /*
  * Maps the mm_struct mm into the current task struct.
  * On success, this function returns with exec_update_lock
@@ -1023,6 +1037,12 @@ static int exec_mmap(struct mm_struct *mm)
 			up_write(&tsk->signal->exec_update_lock);
 			return ret;
 		}
+		ret = vma_dup_some(old_mm, mm);
+		if (ret) {
+			mmap_read_unlock(old_mm);
+			up_write(&tsk->signal->exec_update_lock);
+			return ret;
+		}
 	}
 
 	mm->vps_dumpable = VD_PTRACE_COREDUMP;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b68c8e1868bf..0aa3e411fedf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -313,11 +313,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_18	50	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_18	BIT(VM_HIGH_ARCH_BIT_18)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -359,6 +361,12 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_MTE_ALLOWED	VM_NONE
 #endif
 
+#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
+# define VM_EXEC_KEEP		VM_HIGH_ARCH_18 /* preserve VMA across exec */
+#else
+# define VM_EXEC_KEEP		VM_NONE /* no high flag bits; presumably disables the feature */
+#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
+
 #ifndef VM_GROWSUP
 # define VM_GROWSUP	VM_NONE
 #endif
@@ -2759,6 +2767,7 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
+extern int vma_dup(struct vm_area_struct *vma, struct mm_struct *mm);
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
diff --git a/kernel/fork.c b/kernel/fork.c
index ea9f7fdb3331..1bfdcccc43aa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -664,7 +664,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 			tmp->anon_vma = NULL;
 		} else if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT | VM_EXEC_KEEP);
 		/*
 		 * Copy/update hugetlb private vma information.
 		 */
diff --git a/mm/mmap.c b/mm/mmap.c
index 48991a476643..f87d284bd17b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3266,6 +3266,69 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return NULL;
 }
 
+/*
+ * Duplicate the anonymous vma @old_vma into @mm so it is preserved across
+ * exec.  Returns a negative errno, else whatever copy_page_range() returns.
+ */
+int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	unsigned long npages;
+	int ret = -ENOMEM;
+
+	if (WARN_ON(!vma_is_anonymous(old_vma)))
+		return -EINVAL;	/* only anonymous vmas are supported */
+
+	if (find_vma_intersection(mm, old_vma->vm_start, old_vma->vm_end))
+		return -EEXIST;	/* range overlaps a vma already in @mm */
+
+	npages = vma_pages(old_vma);
+	vm_stat_account(mm, old_vma->vm_flags, npages);	/* undone at fail_nomem */
+
+	vma = vm_area_dup(old_vma);
+	if (!vma)
+		goto fail_nomem;
+
+	ret = vma_dup_policy(old_vma, vma);
+	if (ret)
+		goto fail_nomem_policy;
+
+	vma->vm_mm = mm;
+	ret = anon_vma_fork(vma, old_vma);
+	if (ret)
+		goto fail_nomem_anon_vma_fork;
+
+	/*
+	 * Clear functionality that should not carry over to the new
+	 * process: any memory locking, userfaultfd, and preservation over
+	 * exec flags.
+	 */
+	vma->vm_flags &= ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP|VM_EXEC_KEEP);
+	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+	__insert_vm_struct(mm, vma);
+
+	/*
+	 * Now that the dup vma is inserted into the mm, clear VM_ACCOUNT
+	 * from old_vma.  Since vma_dup() is only called during exec to
+	 * duplicate a vma from the outgoing mm into the mm of the new
+	 * process, this effectively transfers the accounting from the old
+	 * vma to the new one.
+	 */
+	old_vma->vm_flags &= ~VM_ACCOUNT;
+
+	ret = copy_page_range(vma, old_vma);	/* on error, vma stays inserted in mm */
+	return ret;
+
+fail_nomem_anon_vma_fork:
+	mpol_put(vma_policy(vma));
+fail_nomem_policy:
+	vm_area_free(vma);
+fail_nomem:
+	vm_stat_account(mm, old_vma->vm_flags, -npages);
+	return -ENOMEM;
+}
+
 /*
  * Return true if the calling process may expand its vm space by the passed
  * number of pages


More information about the Devel mailing list