[Devel] [PATCH RHEL7 COMMIT] kvm: nVMX: support EPT accessed/dirty bits

Konstantin Khorenko khorenko at virtuozzo.com
Wed May 16 12:50:45 MSK 2018


The commit is pushed to "branch-rh7-3.10.0-693.21.1.vz7.50.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.21.1.vz7.47.6
------>
commit 07b049443681e810bf600c10402f6f66ee086aaf
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Wed May 16 12:50:45 2018 +0300

    kvm: nVMX: support EPT accessed/dirty bits
    
    Now use bit 6 of EPTP to optionally enable A/D bits for EPTP.  Another
    thing to change is that, when EPT accessed and dirty bits are not in use,
    VMX treats accesses to guest paging structures as data reads.  When they
    are in use (bit 6 of EPTP is set), they are treated as writes and the
    corresponding EPT dirty bit is set.  The MMU didn't know this detail,
    so this patch adds it.
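    
    A minimal standalone sketch (not part of the patch) of the walker-side
    decision, assuming the PFERR_WRITE_MASK/PFERR_USER_MASK values from
    arch/x86/include/asm/kvm_host.h; the kernel change itself is the
    nested_access computation added to walk_addr_generic() below:
    
        #include <stdbool.h>
        #include <stdio.h>
        
        #define PFERR_WRITE_MASK (1U << 1)   /* write access */
        #define PFERR_USER_MASK  (1U << 2)   /* user-mode access */
        
        /*
         * With EPT A/D enabled in L1's EPTP, accesses to guest paging
         * structures set dirty bits, so they must be checked against the
         * nested translation as writes; otherwise they are plain reads.
         */
        static unsigned int nested_access_for_walk(bool have_ad)
        {
                return (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
        }
        
        int main(void)
        {
                printf("A/D off: %#x\n", nested_access_for_walk(false));
                printf("A/D on:  %#x\n", nested_access_for_walk(true));
                return 0;
        }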
    
    We also have to fix up the exit qualification.  It may be wrong because
    KVM sets bit 6 but the guest might not.
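    
    A standalone model (not kernel code) of that fix-up, with bit values
    mirroring arch/x86/include/asm/vmx.h (EPT_VIOLATION_ACC_WRITE is bit 1,
    EPT_VIOLATION_GVA_TRANSLATED is bit 8, VMX_EPT_AD_ENABLE_BIT is bit 6
    of the EPT pointer); the real change is in handle_ept_violation() below:
    
        #include <stdint.h>
        #include <stdio.h>
        
        #define EPT_VIOLATION_ACC_WRITE       (1ULL << 1)
        #define EPT_VIOLATION_GVA_TRANSLATED  (1ULL << 8)
        #define VMX_EPT_AD_ENABLE_BIT         (1ULL << 6)
        
        /*
         * L0 runs with EPT A/D enabled, so a violation taken while walking
         * the guest page tables is reported as a write.  If L1 did not set
         * bit 6 in its EPTP, that walk must look like a read to L1, so the
         * write bit is dropped before the exit is reflected.
         */
        static uint64_t fixup_exit_qualification(uint64_t qual, uint64_t l1_eptp)
        {
                if (!(qual & EPT_VIOLATION_GVA_TRANSLATED) &&
                    !(l1_eptp & VMX_EPT_AD_ENABLE_BIT))
                        qual &= ~EPT_VIOLATION_ACC_WRITE;
                return qual;
        }
        
        int main(void)
        {
                uint64_t qual = EPT_VIOLATION_ACC_WRITE;
        
                printf("%#llx\n", (unsigned long long)
                       fixup_exit_qualification(qual, 0));
                printf("%#llx\n", (unsigned long long)
                       fixup_exit_qualification(qual, VMX_EPT_AD_ENABLE_BIT));
                return 0;
        }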
    
    L1 emulates EPT A/D bits using write permissions, so in principle it may
    be possible for EPT A/D bits to be used by L1 even though not available
    in hardware.  The problem is that guest page-table walks will be treated
    as reads rather than writes, so they would not cause an EPT violation.
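    
    For that reason the patch only honours bit 6 of the EPTP when hardware
    A/D support is enabled; a minimal sketch of the entry-time check (the
    real version is nested_ept_init_mmu_context() below, which fails the
    nested VM entry):
    
        #include <stdbool.h>
        #include <stdint.h>
        
        #define VMX_EPT_AD_ENABLE_BIT  (1ULL << 6)
        
        /* Returns 0 on success, 1 if the nested VM entry must fail. */
        int check_nested_eptp_ad(uint64_t l1_eptp, bool enable_ept_ad_bits)
        {
                /*
                 * L1 asked for EPT A/D bits, but the host is not using
                 * them: refuse the entry rather than mishandling guest
                 * page-table walks as reads.
                 */
                if ((l1_eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
                        return 1;
                return 0;
        }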
    
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    [Fixed typo in walk_addr_generic() comment and changed bit clear +
     conditional-set pattern in handle_ept_violation() to conditional-clear]
    Signed-off-by: Radim Krčmář <rkrcmar at redhat.com>
    
    (cherry picked from commit ae1e2d1082ae6969ff8c626ef80804d950bf256b)
    Signed-off-by: Jan Dakinevich <jan.dakinevich at virtuozzo.com>
    
    =====================
    Patchset description:
    
    EPT fixes and enhancements
    
    Backport of EPT fixes from upstream for
    https://jira.sw.ru/browse/PSBM-84046
    
    Bandan Das (3):
      kvm: mmu: don't set the present bit unconditionally
      kvm: mmu: track read permission explicitly for shadow EPT page tables
      kvm: vmx: advertise support for ept execute only
    
    Junaid Shahid (2):
      kvm: x86: mmu: Use symbolic constants for EPT Violation Exit
        Qualifications
      kvm: x86: mmu: Rename EPT_VIOLATION_READ/WRITE/INSTR constants
    
    KarimAllah Ahmed (2):
      kvm: Map PFN-type memory regions as writable (if possible)
      KVM: x86: Update the exit_qualification access bits while walking an
        address
    
    Paolo Bonzini (5):
      KVM: nVMX: we support 1GB EPT pages
      kvm: x86: MMU support for EPT accessed/dirty bits
      kvm: nVMX: support EPT accessed/dirty bits
      KVM: MMU: return page fault error code from permission_fault
      KVM: nVMX: fix EPT permissions as reported in exit qualification
---
 arch/x86/include/asm/kvm_host.h |  5 +++--
 arch/x86/include/asm/vmx.h      |  2 ++
 arch/x86/kvm/mmu.c              |  4 +++-
 arch/x86/kvm/mmu.h              |  3 ++-
 arch/x86/kvm/paging_tmpl.h      | 33 ++++++++++++++++-----------------
 arch/x86/kvm/vmx.c              | 32 ++++++++++++++++++++++++++++----
 6 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0d77f961eb83..3bb58048e24a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -303,9 +303,10 @@ struct kvm_mmu {
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   u64 *spte, const void *pte);
 	hpa_t root_hpa;
-	int root_level;
-	int shadow_root_level;
 	union kvm_mmu_page_role base_role;
+	u8 root_level;
+	u8 shadow_root_level;
+	u8 ept_ad;
 	bool direct_map;
 
 	/*
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fc43716c01c9..b3256903652d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -476,12 +476,14 @@ struct vmx_msr_entry {
 #define EPT_VIOLATION_READABLE_BIT	3
 #define EPT_VIOLATION_WRITABLE_BIT	4
 #define EPT_VIOLATION_EXECUTABLE_BIT	5
+#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
 #define EPT_VIOLATION_ACC_READ		(1 << EPT_VIOLATION_ACC_READ_BIT)
 #define EPT_VIOLATION_ACC_WRITE		(1 << EPT_VIOLATION_ACC_WRITE_BIT)
 #define EPT_VIOLATION_ACC_INSTR		(1 << EPT_VIOLATION_ACC_INSTR_BIT)
 #define EPT_VIOLATION_READABLE		(1 << EPT_VIOLATION_READABLE_BIT)
 #define EPT_VIOLATION_WRITABLE		(1 << EPT_VIOLATION_WRITABLE_BIT)
 #define EPT_VIOLATION_EXECUTABLE	(1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
 /*
  * VM-instruction error numbers
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 00cafeec60ca..75c2575c5677 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4104,7 +4104,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+			     bool accessed_dirty)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
@@ -4113,6 +4114,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 
 	context->nx = true;
+	context->ept_ad = accessed_dirty;
 	context->page_fault = ept_page_fault;
 	context->gva_to_gpa = ept_gva_to_gpa;
 	context->sync_page = ept_sync_page;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b3fb4f3202e8..8330a96fa1c9 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -73,7 +73,8 @@ enum {
 
 int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+			     bool accessed_dirty);
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 				u64 fault_address, char *insn, int insn_len,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 93cc0bd7caee..3b7b51589692 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,13 +23,6 @@
  * so the code in this file is compiled twice, once per pte size.
  */
 
-/*
- * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
- * uses for EPT without A/D paging type.
- */
-extern u64 __pure __using_nonexistent_pte_bit(void)
-	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
-
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
@@ -39,8 +32,6 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
-	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
-	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
 	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
@@ -61,8 +52,6 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
-	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
-	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
 	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
@@ -76,17 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
-	#define PT_GUEST_ACCESSED_MASK 0
-	#define PT_GUEST_DIRTY_MASK 0
-	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
-	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
-	#define PT_HAVE_ACCESSED_DIRTY(mmu) false
+	#define PT_GUEST_DIRTY_SHIFT 9
+	#define PT_GUEST_ACCESSED_SHIFT 8
+	#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
 	#define CMPXCHG cmpxchg64
 	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
 
+#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
 #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
 #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
 
@@ -279,6 +269,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
 	unsigned index, pt_access, pte_access, accessed_dirty;
+	unsigned nested_access;
 	gpa_t pte_gpa;
 	bool have_ad;
 	int offset;
@@ -308,6 +299,14 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
 
 	accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0;
+
+	/*
+	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+	 * by the MOV to CR instruction are treated as reads and do not cause the
+	 * processor to set the dirty flag in any EPT paging-structure entry.
+	 */
+	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
 	pt_access = pte_access = ACC_ALL;
 	++walker->level;
 
@@ -328,7 +327,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
 		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
-					      PFERR_USER_MASK|PFERR_WRITE_MASK,
+					      nested_access,
 					      &walker->fault);
 
 		/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9df8bcb2e44b..ddd112becfe8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2662,6 +2662,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
 			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
 			VMX_EPT_1GB_PAGE_BIT;
+	       if (enable_ept_ad_bits)
+		       vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 
@@ -5936,6 +5938,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
+	if (is_guest_mode(vcpu)
+	    && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
+		/*
+		 * Fix up exit_qualification according to whether guest
+		 * page table accesses are reads or writes.
+		 */
+		u64 eptp = nested_ept_get_cr3(vcpu);
+		if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
+			exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
+	}
+
 	/*
 	 * EPT violation happened while executing iret from NMI,
 	 * "blocked by NMI" bit has to be set before next VM entry.
@@ -9145,17 +9158,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 	return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
+	u64 eptp;
+
 	WARN_ON(mmu_is_nested(vcpu));
+	eptp = nested_ept_get_cr3(vcpu);
+	if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
+		return 1;
+
+	kvm_mmu_unload(vcpu);
 	kvm_init_shadow_ept_mmu(vcpu,
 			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
-			VMX_EPT_EXECUTE_ONLY_BIT);
+			VMX_EPT_EXECUTE_ONLY_BIT,
+			eptp & VMX_EPT_AD_ENABLE_BIT);
 	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+	return 0;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -9920,8 +9942,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 
 	if (nested_cpu_has_ept(vmcs12)) {
-		kvm_mmu_unload(vcpu);
-		nested_ept_init_mmu_context(vcpu);
+		if (nested_ept_init_mmu_context(vcpu)) {
+			*entry_failure_code = ENTRY_FAIL_DEFAULT;
+			return 1;
+		}
 	}
 
 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)

