[Devel] [PATCH RH7] mm: make PR_SET_THP_DISABLE immediately active

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Fri Sep 3 00:11:00 MSK 2021


From: Michal Hocko <mhocko at suse.com>

PR_SET_THP_DISABLE has a rather subtle semantic.  It doesn't affect any
existing mapping because it only updated mm->def_flags which is a
template for new mappings.

The mappings created after prctl(PR_SET_THP_DISABLE) have VM_NOHUGEPAGE
flag set.  This can be quite surprising for all those applications which
do not do prctl(); fork() & exec() and want to control their own THP
behavior.

Another usecase when the immediate semantic of the prctl might be useful
is a combination of pre- and post-copy migration of containers with
CRIU.  In this case CRIU populates a part of a memory region with data
that was saved during the pre-copy stage.  Afterwards, the region is
registered with userfaultfd and CRIU expects to get page faults for the
parts of the region that were not yet populated.  However, khugepaged
collapses the pages and the expected page faults do not occur.

In the more general case, prctl(PR_SET_THP_DISABLE) could be used as a
temporary mechanism for enabling/disabling THP process wide.

Implementation wise, a new MMF_DISABLE_THP flag is added.  This flag is
tested when the decision whether to use huge pages is made, either during
a page fault or at the time of THP collapse.

It should be noted that the new implementation makes PR_SET_THP_DISABLE
a master override to any per-VMA setting, which was not the case
previously.

Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE")
Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Michal Hocko <mhocko at suse.com>
Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka at suse.cz>
Cc: Andrea Arcangeli <aarcange at redhat.com>
Cc: Arnd Bergmann <arnd at arndb.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov at linux.intel.com>
Cc: Pavel Emelyanov <xemul at virtuozzo.com>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>

https://jira.sw.ru/browse/PSBM-124504
(cherry-picked from ms commit 1860033237d4be09c5d7382585f0c7229367a534)
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
 include/linux/huge_mm.h    | 5 ++++-
 include/linux/khugepaged.h | 3 ++-
 include/linux/sched.h      | 5 ++++-
 kernel/sys.c               | 6 +++---
 mm/huge_memory.c           | 3 ++-
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 12d5f1a..fbe76dd 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_HUGE_MM_H
 #define _LINUX_HUGE_MM_H
 
+#include <linux/sched.h>
+
 #ifndef __GENKSYMS__
 #include <linux/fs.h> /* only for vma_is_dax() */
 #endif
@@ -90,7 +92,8 @@ extern unsigned long transparent_hugepage_flags;
 
 static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
-	if (vma->vm_flags & VM_NOHUGEPAGE)
+	if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
 
 	if (is_vma_temporary_stack(vma))
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eeb3079..f16adb0 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -42,7 +42,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
 		if ((khugepaged_always() ||
 		     (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
-		    !(vm_flags & VM_NOHUGEPAGE))
+		    !(vm_flags & VM_NOHUGEPAGE) &&
+		    !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 			if (__khugepaged_enter(vma->vm_mm))
 				return -ENOMEM;
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 73e80a0..f49ea92 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -482,8 +482,11 @@ extern int get_dumpable(struct mm_struct *mm);
 
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
+#define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+				 MMF_DISABLE_THP_MASK)
 
 struct sighand_struct {
 	atomic_t		count;
diff --git a/kernel/sys.c b/kernel/sys.c
index 8687707..1ec78ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2676,16 +2676,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_THP_DISABLE:
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
-		error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+		error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
 		break;
 	case PR_SET_THP_DISABLE:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
 		down_write(&me->mm->mmap_sem);
 		if (arg2)
-			me->mm->def_flags |= VM_NOHUGEPAGE;
+			set_bit(MMF_DISABLE_THP, &me->mm->flags);
 		else
-			me->mm->def_flags &= ~VM_NOHUGEPAGE;
+			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
 		up_write(&me->mm->mmap_sem);
 		break;
 	case PR_MPX_ENABLE_MANAGEMENT:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c79581b..bcdfe7b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2893,7 +2893,8 @@ static struct page
 static bool hugepage_vma_check(struct vm_area_struct *vma)
 {
 	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-	    (vma->vm_flags & VM_NOHUGEPAGE))
+	    (vma->vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
 
 	if (!vma->anon_vma || vma->vm_ops)
-- 
1.8.3.1



More information about the Devel mailing list