[Devel] [PATCH RHEL8 COMMIT] userfaultfd: wp: apply _PAGE_UFFD_WP bit

Konstantin Khorenko khorenko at virtuozzo.com
Mon Apr 20 10:34:38 MSK 2020


The commit is pushed to "branch-rh8-4.18.0-80.1.2.vz8.3.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-80.1.2.vz8.3.6
------>
commit 0c5014a4bd27d0ef1eb70aa07c93e76d270170b9
Author: Peter Xu <peterx at redhat.com>
Date:   Mon Apr 20 10:34:37 2020 +0300

    userfaultfd: wp: apply _PAGE_UFFD_WP bit
    
    Firstly, introduce two new flags MM_CP_UFFD_WP[_RESOLVE] for
    change_protection() when used with uffd-wp and make sure the two new flags
    are exclusively used.  Then,
    
      - For MM_CP_UFFD_WP: apply the _PAGE_UFFD_WP bit and remove _PAGE_RW
        when a range of memory is write protected by uffd
    
      - For MM_CP_UFFD_WP_RESOLVE: remove the _PAGE_UFFD_WP bit and recover
        _PAGE_RW when write protection is resolved from userspace
    
    And use this new interface in mwriteprotect_range() to replace the old
    MM_CP_DIRTY_ACCT.
    
    Do this change for both PTEs and huge PMDs.  Then we can start to identify
    which PTE/PMD is write protected by general (e.g., COW or soft dirty
    tracking), and which is for userfaultfd-wp.
    
    Since we should keep the _PAGE_UFFD_WP when doing pte_modify(), add it
    into _PAGE_CHG_MASK as well.  Meanwhile, since we have this new bit, we
    can be even more strict when detecting uffd-wp page faults in either
    do_wp_page() or wp_huge_pmd().
    
    After we're with _PAGE_UFFD_WP, a special case is when a page is both
    protected by the general COW logic and also userfault-wp.  Here the
    userfault-wp will have higher priority and will be handled first.  Only
    after the uffd-wp bit is cleared on the PTE/PMD will we continue to handle
    the general COW.  These are the steps on what will happen with such a
    page:
    
      1. CPU accesses write protected shared page (so both protected by
         general COW and uffd-wp), blocked by uffd-wp first because in
         do_wp_page we'll handle uffd-wp first, so it has higher priority
         than general COW.
    
      2. Uffd service thread receives the request, do UFFDIO_WRITEPROTECT
         to remove the uffd-wp bit upon the PTE/PMD.  However here we
         still keep the write bit cleared.  Notify the blocked CPU.
    
      3. The blocked CPU resumes the page fault process with a fault
         retry, during retry it'll notice it was not with the uffd-wp bit
         this time but it is still write protected by general COW, then
         it'll go though the COW path in the fault handler, copy the page,
         apply write bit where necessary, and retry again.
    
      4. The CPU will be able to access this page with write bit set.
    
    Suggested-by: Andrea Arcangeli <aarcange at redhat.com>
    Signed-off-by: Peter Xu <peterx at redhat.com>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Cc: Brian Geffon <bgeffon at google.com>
    Cc: Pavel Emelyanov <xemul at openvz.org>
    Cc: Mike Kravetz <mike.kravetz at oracle.com>
    Cc: David Hildenbrand <david at redhat.com>
    Cc: Martin Cracauer <cracauer at cons.org>
    Cc: Mel Gorman <mgorman at suse.de>
    Cc: Bobby Powers <bobbypowers at gmail.com>
    Cc: Mike Rapoport <rppt at linux.vnet.ibm.com>
    Cc: "Kirill A . Shutemov" <kirill at shutemov.name>
    Cc: Maya Gokhale <gokhale2 at llnl.gov>
    Cc: Johannes Weiner <hannes at cmpxchg.org>
    Cc: Marty McFadden <mcfadden8 at llnl.gov>
    Cc: Denis Plotnikov <dplotnikov at virtuozzo.com>
    Cc: Hugh Dickins <hughd at google.com>
    Cc: "Dr . David Alan Gilbert" <dgilbert at redhat.com>
    Cc: Jerome Glisse <jglisse at redhat.com>
    Cc: Rik van Riel <riel at redhat.com>
    Cc: Shaohua Li <shli at fb.com>
    Link: http://lkml.kernel.org/r/20200220163112.11409-8-peterx@redhat.com
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-102938
    (cherry picked from commit 292924b260247483a58916f6d3550d8c92f32f55)
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 include/linux/mm.h |  5 +++++
 mm/huge_memory.c   | 18 +++++++++++++++++-
 mm/memory.c        |  4 ++--
 mm/mprotect.c      | 17 +++++++++++++++++
 mm/userfaultfd.c   |  8 ++++++--
 5 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8973ce5aa46b..be1cc2467b37 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1673,6 +1673,11 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
 #define  MM_CP_DIRTY_ACCT                  (1UL << 0)
 /* Whether this protection change is for NUMA hints */
 #define  MM_CP_PROT_NUMA                   (1UL << 1)
+/* Whether this change is for write protecting */
+#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
+#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
+#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
+					    MM_CP_UFFD_WP_RESOLVE)
 
 extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 			      unsigned long end, pgprot_t newprot,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7b652b3a70e6..be580b411a6a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1843,6 +1843,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	bool preserve_write;
 	int ret;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
@@ -1909,6 +1911,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	entry = pmd_modify(entry, newprot);
 	if (preserve_write)
 		entry = pmd_mk_savedwrite(entry);
+	if (uffd_wp) {
+		entry = pmd_wrprotect(entry);
+		entry = pmd_mkuffd_wp(entry);
+	} else if (uffd_wp_resolve) {
+		/*
+		 * Leave the write bit to be handled by PF interrupt
+		 * handler, then things like COW could be properly
+		 * handled.
+		 */
+		entry = pmd_clear_uffd_wp(entry);
+	}
 	ret = HPAGE_PMD_NR;
 	set_pmd_at(mm, addr, pmd, entry);
 	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2058,7 +2071,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *page;
 	pgtable_t pgtable;
 	pmd_t old_pmd, _pmd;
-	bool young, write, soft_dirty, pmd_migration = false;
+	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
 	unsigned long addr;
 	int i;
 
@@ -2140,6 +2153,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		write = pmd_write(old_pmd);
 		young = pmd_young(old_pmd);
 		soft_dirty = pmd_soft_dirty(old_pmd);
+		uffd_wp = pmd_uffd_wp(old_pmd);
 	}
 	VM_BUG_ON_PAGE(!page_count(page), page);
 	page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2173,6 +2187,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 				entry = pte_mkold(entry);
 			if (soft_dirty)
 				entry = pte_mksoft_dirty(entry);
+			if (uffd_wp)
+				entry = pte_mkuffd_wp(entry);
 		}
 		pte = pte_offset_map(&_pmd, addr);
 		BUG_ON(!pte_none(*pte));
diff --git a/mm/memory.c b/mm/memory.c
index b261dec43bf4..dc267ae5b293 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2740,7 +2740,7 @@ static int do_wp_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 
-	if (userfaultfd_wp(vma)) {
+	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		return handle_userfault(vmf, VM_UFFD_WP);
 	}
@@ -3898,7 +3898,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf)
 static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	if (vma_is_anonymous(vmf->vma)) {
-		if (userfaultfd_wp(vmf->vma))
+		if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
 			return handle_userfault(vmf, VM_UFFD_WP);
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 69bcfb33314e..2df60e64f139 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -46,6 +46,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	int target_node = NUMA_NO_NODE;
 	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	/*
 	 * Can be called with only the mmap_sem for reading by
@@ -117,6 +119,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			if (preserve_write)
 				ptent = pte_mk_savedwrite(ptent);
 
+			if (uffd_wp) {
+				ptent = pte_wrprotect(ptent);
+				ptent = pte_mkuffd_wp(ptent);
+			} else if (uffd_wp_resolve) {
+				/*
+				 * Leave the write bit to be handled
+				 * by PF interrupt handler, then
+				 * things like COW could be properly
+				 * handled.
+				 */
+				ptent = pte_clear_uffd_wp(ptent);
+			}
+
 			/* Avoid taking write faults for known dirty pages */
 			if (dirty_accountable && pte_dirty(ptent) &&
 					(pte_soft_dirty(ptent) ||
@@ -300,6 +315,8 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 {
 	unsigned long pages;
 
+	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot);
 	else
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index be9fc77575de..2db2b80b3c72 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -103,8 +103,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 		goto out_release;
 
 	_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
-	if ((dst_vma->vm_flags & VM_WRITE) && !wp_copy)
-		_dst_pte = pte_mkwrite(_dst_pte);
+	if (dst_vma->vm_flags & VM_WRITE) {
+		if (wp_copy)
+			_dst_pte = pte_mkuffd_wp(_dst_pte);
+		else
+			_dst_pte = pte_mkwrite(_dst_pte);
+	}
 
 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
 	if (dst_vma->vm_file) {


More information about the Devel mailing list