[Devel] [PATCH RH7 draft] mm/huge_memory.c: disable transparent hugepages in CTs

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Thu May 17 13:07:55 MSK 2018


This is a draft patch just to save a working prototype; for now we just
want to disable thp from userspace for density scenarios.
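
For reference, disabling thp from userspace on a stock kernel (assuming
the standard sysfs knobs are available) is just:

	echo never > /sys/kernel/mm/transparent_hugepage/enabled
	echo never > /sys/kernel/mm/transparent_hugepage/defrag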

We might at some point want to disable transparent huge pages for CTs
but leave them enabled for the host (VMs). That is because: 1) thp
decreases the density of CTs in the dvd-store test, 2) thp increases
performance for VMs in some tests.

https://jira.sw.ru/browse/PSBM-83199

In /sys/kernel/mm/transparent_hugepage/ve_stats one can find additional
stats about how many pages were stopped from being made thp by this
patch.
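
With the patch applied the file can simply be read; the numbers below
are purely illustrative:

	# cat /sys/kernel/mm/transparent_hugepage/ve_stats
	ve_scans 120 all_scans 3500 fault_fallback_ve 17

Here ve_scans counts khugepaged scans that were skipped because the
memory belongs to a CT, all_scans counts all khugepaged pmd scans, and
fault_fallback_ve counts huge page faults inside CTs that fell back to
normal pages.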

To test that no in-CT pages were made thp I used this script:

for i in /proc/*/smaps; do
	PID=$(echo "$i" | awk -F'/' '{print $3}')
	HPS=$(grep AnonHugePages "$i" | grep -v "\<0\>" | awk 'BEGIN{sum=0}{sum+=$2}END{print sum}')

	if [ "$HPS" -gt 0 ]; then
		CTUID=$(grep TaskUB "/proc/$PID/status" | awk '{print $2}')
		if [ "$CTUID" != "0" ]; then
			echo "$PID:$CTUID:$HPS"
		fi
	fi
done

On the dvd-store test it shows no thp in CTs.

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 mm/huge_memory.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6341e0c79bbd..5599596f9ee4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,6 +59,10 @@ static struct task_struct *khugepaged_thread __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+static unsigned long khugepaged_ve_scans;
+static unsigned long khugepaged_all_scans;
+static unsigned long khugepaged_fault_fallback_ve;
+
 /*
  * default collapse hugepages if there is at least 1/4th ptes mapped
  * to avoid memory footprint growth due to fragmentation
@@ -480,6 +484,17 @@ static struct kobj_attribute pages_to_scan_attr =
 	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
 	       pages_to_scan_store);
 
+static ssize_t ve_stats_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return sprintf(buf, "ve_scans %lu all_scans %lu fault_fallback_ve %lu\n",
+		       khugepaged_ve_scans, khugepaged_all_scans,
+		       khugepaged_fault_fallback_ve);
+}
+static struct kobj_attribute ve_stats_attr =
+	__ATTR_RO(ve_stats);
+
 static ssize_t pages_collapsed_show(struct kobject *kobj,
 				    struct kobj_attribute *attr,
 				    char *buf)
@@ -556,6 +571,7 @@ static struct attribute *khugepaged_attr[] = {
 	&full_scans_attr.attr,
 	&scan_sleep_millisecs_attr.attr,
 	&alloc_sleep_millisecs_attr.attr,
+	&ve_stats_attr.attr,
 	NULL,
 };
 
@@ -798,6 +814,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	return true;
 }
 
+bool is_vma_ve0(struct vm_area_struct *vma);
+
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       unsigned int flags)
@@ -812,6 +830,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (unlikely(khugepaged_enter(vma)))
 		return VM_FAULT_OOM;
+
+	if (!is_vma_ve0(vma)) {
+		khugepaged_fault_fallback_ve++;
+		return VM_FAULT_FALLBACK;
+	}
+
 	if (!(flags & FAULT_FLAG_WRITE) &&
 			transparent_hugepage_use_zero_page()) {
 		spinlock_t *ptl;
@@ -1223,6 +1247,11 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	get_page(page);
 	spin_unlock(ptl);
 alloc:
+	if (!is_vma_ve0(vma)) {
+		khugepaged_fault_fallback_ve++;
+		return VM_FAULT_FALLBACK;
+	}
+
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
 		gfp_t gfp;
@@ -2657,6 +2686,47 @@ static void collapse_huge_page(struct mm_struct *mm,
 	goto out_up_write;
 }
 
+bool is_vma_ve0(struct vm_area_struct *vma)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = ve_is_super(rcu_dereference(vma->vm_mm->owner)->task_ve);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static bool is_page_ve0(struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct anon_vma_chain *avc;
+	struct anon_vma *anon_vma;
+	bool ret = true;
+
+	anon_vma = page_get_anon_vma(page);
+	if (!anon_vma) {
+		printk_once("%s: DEBUG no anon_vma %p\n", __func__, page);
+		return true;
+	}
+
+	anon_vma_lock_read(anon_vma);
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+		struct vm_area_struct *vma = avc->vma;
+
+		if (!is_vma_ve0(vma)) {
+			ret = false;
+			break;
+		}
+	}
+	anon_vma_unlock_read(anon_vma);
+
+	if (!ret)
+		khugepaged_ve_scans++;
+
+	return ret;
+}
+
 static int khugepaged_scan_pmd(struct mm_struct *mm,
 			       struct vm_area_struct *vma,
 			       unsigned long address,
@@ -2676,6 +2746,17 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	if (!pmd)
 		goto out;
 
+	khugepaged_all_scans++;
+	/* Restart counter on overflow */
+	if (khugepaged_all_scans == 0) {
+		khugepaged_all_scans = 1;
+		khugepaged_ve_scans = 0;
+	}
+	if (!is_vma_ve0(vma)) {
+		khugepaged_ve_scans++;
+		goto out;
+	}
+
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -2693,6 +2774,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
+
+		if (!is_page_ve0(page))
+			goto out_unmap;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to khugepaged_node_load[].
-- 
2.17.0
