[Devel] [PATCH RHEL7 COMMIT] ms/mmu-notifier: add clear_young callback

Konstantin Khorenko khorenko at virtuozzo.com
Thu Nov 5 05:36:24 PST 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.9.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.9.6
------>
commit 46b5e25daf766724b9554b9b0d4e6b327fe3c117
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date:   Thu Nov 5 17:36:24 2015 +0400

    ms/mmu-notifier: add clear_young callback
    
    https://jira.sw.ru/browse/PSBM-32460
    
    From: Vladimir Davydov <vdavydov at parallels.com>
    
    In the scope of the idle memory tracking feature, which is introduced by
    the following patch, we need to clear the referenced/accessed bit not only
    in primary, but also in secondary ptes.  The latter is required in order
    to estimate wss of KVM VMs.  At the same time we want to avoid flushing
    tlb, because it is quite expensive and it won't really affect the final
    result.
    
    Currently, there is no function for clearing pte young bit that would meet
    our requirements, so this patch introduces one.  To achieve that we have
    to add a new mmu-notifier callback, clear_young, since there is no method
    for testing-and-clearing a secondary pte w/o flushing tlb.  The new method
    is not mandatory and currently only implemented by KVM.
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Reviewed-by: Andres Lagar-Cavilla <andreslc at google.com>
    Acked-by: Paolo Bonzini <pbonzini at redhat.com>
    Cc: Minchan Kim <minchan at kernel.org>
    Cc: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com>
    Cc: Johannes Weiner <hannes at cmpxchg.org>
    Cc: Michal Hocko <mhocko at suse.cz>
    Cc: Greg Thelen <gthelen at google.com>
    Cc: Michel Lespinasse <walken at google.com>
    Cc: David Rientjes <rientjes at google.com>
    Cc: Pavel Emelyanov <xemul at parallels.com>
    Cc: Cyrill Gorcunov <gorcunov at openvz.org>
    Cc: Jonathan Corbet <corbet at lwn.net>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    (cherry picked from commit 1d7715c676a1566c2e4c3e77d16b1f9bb4909025)
    Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
    
    Conflicts:
    	include/linux/mmu_notifier.h
    	mm/mmu_notifier.c
    	virt/kvm/kvm_main.c
---
 include/linux/mmu_notifier.h | 41 +++++++++++++++++++++++++++++++++++++++++
 mm/mmu_notifier.c            | 16 ++++++++++++++++
 virt/kvm/kvm_main.c          | 30 ++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index deca874..2306e84 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -63,6 +63,15 @@ struct mmu_notifier_ops {
 				 unsigned long address);
 
 	/*
+	 * clear_young is a lightweight version of clear_flush_young. Like the
+	 * latter, it is supposed to test-and-clear the young/accessed bitflag
+	 * in the secondary pte, but it may omit flushing the secondary tlb.
+	 */
+	int (*clear_young)(struct mmu_notifier *mn,
+			   struct mm_struct *mm,
+			   unsigned long address);
+
+	/*
 	 * test_young is called to check the young/accessed bitflag in
 	 * the secondary pte. This is used to know if the page is
 	 * frequently used without actually clearing the flag or tearing
@@ -174,6 +183,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+				      unsigned long address);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -199,6 +210,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_young(mm, address);
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -268,6 +287,26 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	__young;							\
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address);	\
+	__young;							\
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address);	\
+	__young;							\
+})
+
 /*
  * set_pte_at_notify() sets the pte _after_ running the notifier.
  * This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -336,6 +375,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 6725ff1..b21681c 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -103,6 +103,22 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+			       unsigned long address)
+{
+	struct mmu_notifier *mn;
+	int young = 0, id;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->clear_young)
+			young |= mn->ops->clear_young(mn, mm, address);
+	}
+	srcu_read_unlock(&srcu, id);
+
+	return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
 			      unsigned long address)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c3a2bdc..f7b06a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -389,6 +389,35 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * Even though we do not flush TLB, this will still adversely
+	 * affect performance on pre-Haswell Intel EPT, where there is
+	 * no EPT Access Bit to clear so that we have to tear down EPT
+	 * tables instead. If we find this unacceptable, we can always
+	 * add a parameter to kvm_age_hva so that it effectively doesn't
+	 * do anything on clear_young.
+	 *
+	 * Also note that currently we never issue secondary TLB flushes
+	 * from clear_young, leaving this job up to the regular system
+	 * cadence. If we find this inaccurate, we might come up with a
+	 * more sophisticated heuristic later.
+	 */
+	young = kvm_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
@@ -421,6 +450,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,



More information about the Devel mailing list