[Devel] [PATCH RHEL7 COMMIT] ms/mm: introduce kv[mz]alloc helpers

Konstantin Khorenko khorenko at virtuozzo.com
Tue Nov 7 12:59:17 MSK 2017


The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.22
------>
commit e297497b1e36380d5022ed98411c4f3ab3741bee
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date:   Tue Nov 7 12:59:17 2017 +0300

    ms/mm: introduce kv[mz]alloc helpers
    
    This is only a small part of upstream commit
    a7c3e901a46ff54c016d040847eda598a9e3e653: only the part that
    introduces the kv[mz]alloc helpers is backported here. An
    illustration of the pattern these helpers replace follows the
    quoted description below.
    
    Description of the original patch:
    
    commit a7c3e901a46ff54c016d040847eda598a9e3e653
    Author: Michal Hocko <mhocko at suse.com>
    Date:   Mon May 8 15:57:09 2017 -0700
    
        mm: introduce kv[mz]alloc helpers
    
        Patch series "kvmalloc", v5.
    
        There are many open-coded kmalloc-with-vmalloc-fallback instances in
        the tree.  Most of them are not careful enough or simply do not care
        about the underlying semantics of the kmalloc/page allocator, which
        means that a) some vmalloc fallbacks are basically unreachable because
        the kmalloc part will keep retrying until it succeeds, and b) the page
        allocator can invoke really disruptive steps like the OOM killer to
        move forward, which does not sound appropriate when we consider that
        the vmalloc fallback is available.
    
        As can be seen, implementing kvmalloc requires quite intimate
        knowledge of the page allocator and the memory reclaim internals,
        which strongly suggests that a helper should be implemented in the
        memory subsystem proper.
    
        Most callers I could find have been converted to use the helper
        instead; this is patch 6.  There are some more relying on __GFP_REPEAT
        in the networking stack which I have converted as well, and Eric
        Dumazet was not opposed [2] to converting them.
    
        [1] http://lkml.kernel.org/r/20170130094940.13546-1-mhocko@kernel.org
        [2] http://lkml.kernel.org/r/1485273626.16328.301.camel@edumazet-glaptop3.roam.corp.google.com
    
        This patch (of 9):
    
        Using kmalloc with a vmalloc fallback for larger allocations is a
        common pattern in the kernel code.  Yet we do not have any common
        helper for it, so users have invented their own helpers, some of them
        quite creatively.  Let's just add kv[mz]alloc and make sure it is
        implemented properly.  This implementation makes sure not to generate
        large memory pressure for > PAGE_SIZE requests (__GFP_NORETRY) and
        also not to warn about allocation failures.  This also rules out the
        OOM killer, as vmalloc is a more appropriate fallback than a
        disruptive user-visible action.
    
        This patch also changes some existing users and removes helpers which
        are specific to them.  In some cases this is not possible (e.g.
        ext4_kvmalloc, libcfs_kvzalloc) because those seem to be broken and
        require a GFP_NO{FS,IO} context, which is not vmalloc compatible in
        general (note that the page table allocation is GFP_KERNEL).  Those
        need to be fixed separately.
    
        While we are at it, document in __vmalloc{_node} which gfp masks are
        unsupported, because there seems to be a lot of confusion out there.
        kvmalloc_node will warn about flags incompatible with GFP_KERNEL
        (i.e. not a superset of it) to catch new abusers.  Existing ones would
        have to die slowly.
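
    As promised above, an illustration (not part of the patch): a typical
    open-coded fallback of the kind this series replaces, next to its
    kvmalloc equivalent.  The function names are hypothetical:

        /* Open-coded fallback: for requests of a few pages the kmalloc
         * part keeps retrying (possibly invoking the OOM killer), so
         * the vmalloc branch is rarely, if ever, reached. */
        static void *open_coded_alloc(size_t size)
        {
                void *p = kmalloc(size, GFP_KERNEL);

                return p ? p : vmalloc(size);
        }

        /* With the new helper the same caller becomes one line; the
         * __GFP_NORETRY|__GFP_NOWARN handling for > PAGE_SIZE requests
         * is done centrally in mm/util.c. */
        static void *helper_alloc(size_t size)
        {
                return kvmalloc(size, GFP_KERNEL);
        }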
    
    https://jira.sw.ru/browse/PSBM-76752
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 include/linux/mm.h                   | 14 +++++++++
 include/linux/vmalloc.h              |  1 +
 mm/nommu.c                           |  5 +++
 mm/util.c                            | 45 ++++++++++++++++++++++++++
 mm/vmalloc.c                         |  2 +-
 security/apparmor/apparmorfs.c       |  2 +-
 security/apparmor/include/apparmor.h |  2 --
 security/apparmor/lib.c              | 61 ------------------------------------
 security/apparmor/match.c            |  2 +-
 9 files changed, 68 insertions(+), 66 deletions(-)
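
For review convenience, a minimal usage sketch of the backported API; the
caller and its names are hypothetical and not part of the diff below:

	/* Needs <linux/mm.h> for kvzalloc()/kvfree(). */
	static int example_load_blob(size_t size)
	{
		void *buf;

		/* Zeroed allocation; transparently falls back to vmalloc
		 * when a physically contiguous kmalloc would be too
		 * disruptive.  Flags must be a superset of GFP_KERNEL. */
		buf = kvzalloc(size, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* ... use buf ... */

		kvfree(buf);	/* correct for kmalloc and vmalloc memory */
		return 0;
	}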

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c806f43..897d7cf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -401,6 +401,20 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+	return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
 extern void kvfree(const void *addr);
 
 static inline void compound_lock(struct page *page)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6ea82cf..59c80dd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -81,6 +81,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller);
+extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
 
 extern void vfree(const void *addr);
 
diff --git a/mm/nommu.c b/mm/nommu.c
index beecd95..a16aee9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -282,6 +282,11 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+{
+	return __vmalloc(size, flags, PAGE_KERNEL);
+}
+
 void *vmalloc_user(unsigned long size)
 {
 	void *ret;
diff --git a/mm/util.c b/mm/util.c
index 79e918c..85addcd8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -355,6 +355,51 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_mmap);
 
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported
+ *
+ * Consult with mm people before using any gfp flags outside of GFP_KERNEL.
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+	gfp_t kmalloc_flags = flags;
+	void *ret;
+
+	/*
+	 * vmalloc uses GFP_KERNEL for some internal allocations (e.g. page tables)
+	 * so the given set of flags has to be compatible.
+	 */
+	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+
+	/*
+	 * Make sure that larger requests are not too disruptive - no OOM
+	 * killer and no allocation failure warnings as we have a fallback
+	 */
+	if (size > PAGE_SIZE)
+		kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN;
+
+	ret = kmalloc_node(size, kmalloc_flags, node);
+
+	/*
+	 * It doesn't really make sense to fallback to vmalloc for sub page
+	 * requests
+	 */
+	if (ret || size <= PAGE_SIZE)
+		return ret;
+
+	return __vmalloc_node_flags(size, node, flags | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
 void kvfree(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5684c15..b027cb4 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1747,7 +1747,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-static inline void *__vmalloc_node_flags(unsigned long size,
+void *__vmalloc_node_flags(unsigned long size,
 					int node, gfp_t flags)
 {
 	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 16c15ec..e34574d 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -58,7 +58,7 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf,
 		return ERR_PTR(-EACCES);
 
 	/* freed by caller to simple_write_to_buffer */
-	data = kvmalloc(alloc_size);
+	data = kvmalloc(alloc_size, GFP_KERNEL);
 	if (data == NULL)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h
index 40aedd9..dbf6eb7 100644
--- a/security/apparmor/include/apparmor.h
+++ b/security/apparmor/include/apparmor.h
@@ -64,8 +64,6 @@ extern int apparmor_initialized __initdata;
 /* fn's in lib */
 char *aa_split_fqname(char *args, char **ns_name);
 void aa_info_message(const char *str);
-void *kvmalloc(size_t size);
-void kvfree(void *buffer);
 
 
 /**
diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c
index 7430298..348704e 100644
--- a/security/apparmor/lib.c
+++ b/security/apparmor/lib.c
@@ -74,64 +74,3 @@ void aa_info_message(const char *str)
 	printk(KERN_INFO "AppArmor: %s\n", str);
 }
 
-/**
- * kvmalloc - do allocation preferring kmalloc but falling back to vmalloc
- * @size: size of allocation
- *
- * Return: allocated buffer or NULL if failed
- *
- * It is possible that policy being loaded from the user is larger than
- * what can be allocated by kmalloc, in those cases fall back to vmalloc.
- */
-void *kvmalloc(size_t size)
-{
-	void *buffer = NULL;
-
-	if (size == 0)
-		return NULL;
-
-	/* do not attempt kmalloc if we need more than 16 pages at once */
-	if (size <= (16*PAGE_SIZE))
-		buffer = kmalloc(size, GFP_NOIO | __GFP_NOWARN);
-	if (!buffer) {
-		/* see kvfree for why size must be at least work_struct size
-		 * when allocated via vmalloc
-		 */
-		if (size < sizeof(struct work_struct))
-			size = sizeof(struct work_struct);
-		buffer = vmalloc(size);
-	}
-	return buffer;
-}
-
-/**
- * do_vfree - workqueue routine for freeing vmalloced memory
- * @work: data to be freed
- *
- * The work_struct is overlaid to the data being freed, as at the point
- * the work is scheduled the data is no longer valid, be its freeing
- * needs to be delayed until safe.
- */
-static void do_vfree(struct work_struct *work)
-{
-	vfree(work);
-}
-
-/**
- * kvfree - free an allocation do by kvmalloc
- * @buffer: buffer to free (MAYBE_NULL)
- *
- * Free a buffer allocated by kvmalloc
- */
-void kvfree(void *buffer)
-{
-	if (is_vmalloc_addr(buffer)) {
-		/* Data is no longer valid so just use the allocated space
-		 * as the work_struct
-		 */
-		struct work_struct *work = (struct work_struct *) buffer;
-		INIT_WORK(work, do_vfree);
-		schedule_work(work);
-	} else
-		kfree(buffer);
-}
diff --git a/security/apparmor/match.c b/security/apparmor/match.c
index 90971a8..c007425 100644
--- a/security/apparmor/match.c
+++ b/security/apparmor/match.c
@@ -57,7 +57,7 @@ static struct table_header *unpack_table(char *blob, size_t bsize)
 	if (bsize < tsize)
 		goto out;
 
-	table = kvmalloc(tsize);
+	table = kvmalloc(tsize, GFP_KERNEL);
 	if (table) {
 		*table = th;
 		if (th.td_flags == YYTD_DATA8)

