[Devel] [PATCH RHEL7 COMMIT] ms/memcg: port memory.high

Konstantin Khorenko khorenko at virtuozzo.com
Fri May 29 00:55:46 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.7
------>
commit 4038cd0e029ddee1c3308216bdc5da6c4485656b
Author: Vladimir Davydov <vdavydov at parallels.com>
Date:   Fri May 29 11:55:46 2015 +0400

    ms/memcg: port memory.high
    
    This patch backports memory.high knob, which was introduced upstream by
    commit 241994ed8649 ("mm: memcontrol: default hierarchy interface for
    memory"):
    
      - memory.high configures the upper end of the cgroup's expected
        memory consumption range.  A cgroup whose consumption grows beyond
        this threshold is forced into direct reclaim, to work off the
        excess and to throttle new allocations heavily, but is generally
        allowed to continue and the OOM killer is not invoked.
    
    It may come in handy to avoid ENOMEM when hitting the memcg limit with
    GFP_NOWAIT allocations: with memory.high < memory.limit_in_bytes such a
    situation becomes less likely.
    
    This knob is supposed to be tuned by vcmmd.
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    Reviewed-by: Kirill Tkhai <ktkhai at odin.com>
---
 include/linux/swap.h |  1 +
 mm/memcontrol.c      | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
 mm/vmscan.c          |  3 ++-
 3 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index b2164be..12a0433 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -306,6 +306,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
+						  unsigned long nr_pages,
 						  gfp_t gfp_mask, bool noswap);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a0eee75..ed76b71 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -264,7 +264,9 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 
+	/* Normal memory consumption range */
 	unsigned long long low;
+	unsigned long long high;
 
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
@@ -2002,7 +2004,8 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
 		if (loop)
 			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+						      gfp_mask, noswap);
 		/*
 		 * Allow limit shrinkers, which are triggered directly
 		 * by userspace, to catch signals and stop reclaim
@@ -2704,10 +2707,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	if (likely(!ret)) {
 		if (!do_swap_account)
-			return CHARGE_OK;
+			goto done;
 		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
 		if (likely(!ret))
-			return CHARGE_OK;
+			goto done;
 
 		res_counter_uncharge(&memcg->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
@@ -2764,6 +2767,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		return CHARGE_OOM_DIE;
 
 	return CHARGE_RETRY;
+
+done:
+	/*
+	 * If the hierarchy is above the normal consumption range,
+	 * make the charging task trim their excess contribution.
+	 */
+	do {
+		if (res_counter_read_u64(&memcg->res, RES_USAGE) <= memcg->high)
+			continue;
+		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, false);
+	} while ((memcg = parent_mem_cgroup(memcg)));
+	return CHARGE_OK;
 }
 
 /*
@@ -4822,8 +4837,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-						false);
+		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+							GFP_KERNEL, false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -5169,6 +5184,33 @@ static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
 	return 0;
 }
 
+static ssize_t mem_cgroup_high_read(struct cgroup *cont, struct cftype *cft,
+				    struct file *file, char __user *buf,
+				    size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n", memcg->high);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
+				 const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long long val;
+	int ret;
+
+	ret = res_counter_memparse_write_strategy(buffer, &val);
+	if (ret)
+		return ret;
+
+	memcg->high = val;
+	return 0;
+}
+
 static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
 		struct cftype *cft, struct file *file, char __user *buf,
 		size_t nbytes, loff_t *ppos)
@@ -6086,6 +6128,12 @@ static struct cftype mem_cgroup_files[] = {
 		.read = mem_cgroup_low_read,
 	},
 	{
+		.name = "high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_high_write,
+		.read = mem_cgroup_high_read,
+	},
+	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
@@ -6353,6 +6401,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 	if (cont->parent == NULL) {
 		root_mem_cgroup = memcg;
 		res_counter_init(&memcg->res, NULL);
+		memcg->high = RESOURCE_MAX;
 		res_counter_init(&memcg->memsw, NULL);
 		res_counter_init(&memcg->kmem, NULL);
 		res_counter_init(&memcg->dcache, NULL);
@@ -6395,6 +6444,7 @@ mem_cgroup_css_online(struct cgroup *cont)
 
 	if (parent->use_hierarchy) {
 		res_counter_init(&memcg->res, &parent->res);
+		memcg->high = RESOURCE_MAX;
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
 		res_counter_init(&memcg->dcache, &parent->dcache);
@@ -6405,6 +6455,7 @@ mem_cgroup_css_online(struct cgroup *cont)
 		 */
 	} else {
 		res_counter_init(&memcg->res, NULL);
+		memcg->high = RESOURCE_MAX;
 		res_counter_init(&memcg->memsw, NULL);
 		res_counter_init(&memcg->kmem, NULL);
 		res_counter_init(&memcg->dcache, NULL);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cd97aed..ed0aade 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2731,6 +2731,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
 					   bool noswap)
 {
@@ -2741,7 +2742,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
-		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.order = 0,
 		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,



More information about the Devel mailing list