[Devel] [PATCH rh7 3/3] memcg: port memory.high

Vladimir Davydov vdavydov at parallels.com
Mon May 25 07:05:48 PDT 2015


This patch backports memory.high knob, which was introduced upstream by
commit 241994ed8649 ("mm: memcontrol: default hierarchy interface for
memory"):

  - memory.high configures the upper end of the cgroup's expected
    memory consumption range.  A cgroup whose consumption grows beyond
    this threshold is forced into direct reclaim, to work off the
    excess and to throttle new allocations heavily, but is generally
    allowed to continue and the OOM killer is not invoked.

It may come in handy to avoid ENOMEM when hitting the memcg limit with
GFP_NOWAIT allocations - with memory.high < memory.limit_in_bytes such a
situation becomes less likely.

This knob is supposed to be tuned by vcmmd.

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/swap.h |  1 +
 mm/memcontrol.c      | 61 +++++++++++++++++++++++++++++++++++++++++++++++-----
 mm/vmscan.c          |  3 ++-
 3 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index b2164bebe0ad..12a04334acbf 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -306,6 +306,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
+						  unsigned long nr_pages,
 						  gfp_t gfp_mask, bool noswap);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 144a2720b604..6409fb28f00b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -264,7 +264,9 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 
+	/* Normal memory consumption range */
 	unsigned long long low;
+	unsigned long long high;
 
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
@@ -2033,7 +2035,8 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
 		if (loop)
 			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+						      gfp_mask, noswap);
 		/*
 		 * Allow limit shrinkers, which are triggered directly
 		 * by userspace, to catch signals and stop reclaim
@@ -2735,10 +2738,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	if (likely(!ret)) {
 		if (!do_swap_account)
-			return CHARGE_OK;
+			goto done;
 		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
 		if (likely(!ret))
-			return CHARGE_OK;
+			goto done;
 
 		res_counter_uncharge(&memcg->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
@@ -2795,6 +2798,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		return CHARGE_OOM_DIE;
 
 	return CHARGE_RETRY;
+
+done:
+	/*
+	 * If the hierarchy is above the normal consumption range,
+	 * make the charging task trim their excess contribution.
+	 */
+	do {
+		if (res_counter_read_u64(&memcg->res, RES_USAGE) <= memcg->high)
+			continue;
+		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, false);
+	} while ((memcg = parent_mem_cgroup(memcg)));
+	return CHARGE_OK;
 }
 
 /*
@@ -4853,8 +4868,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-						false);
+		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+							GFP_KERNEL, false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -5200,6 +5215,33 @@ static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
 	return 0;
 }
 
+static ssize_t mem_cgroup_high_read(struct cgroup *cont, struct cftype *cft,
+				    struct file *file, char __user *buf,
+				    size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n", memcg->high);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
+				 const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long long val;
+	int ret;
+
+	ret = res_counter_memparse_write_strategy(buffer, &val);
+	if (ret)
+		return ret;
+
+	memcg->high = val;
+	return 0;
+}
+
 static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
 		struct cftype *cft, struct file *file, char __user *buf,
 		size_t nbytes, loff_t *ppos)
@@ -6117,6 +6159,12 @@ static struct cftype mem_cgroup_files[] = {
 		.read = mem_cgroup_low_read,
 	},
 	{
+		.name = "high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_high_write,
+		.read = mem_cgroup_high_read,
+	},
+	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
@@ -6384,6 +6432,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 	if (cont->parent == NULL) {
 		root_mem_cgroup = memcg;
 		res_counter_init(&memcg->res, NULL);
+		memcg->high = RESOURCE_MAX;
 		res_counter_init(&memcg->memsw, NULL);
 		res_counter_init(&memcg->kmem, NULL);
 		res_counter_init(&memcg->dcache, NULL);
@@ -6426,6 +6475,7 @@ mem_cgroup_css_online(struct cgroup *cont)
 
 	if (parent->use_hierarchy) {
 		res_counter_init(&memcg->res, &parent->res);
+		memcg->high = RESOURCE_MAX;
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
 		res_counter_init(&memcg->dcache, &parent->dcache);
@@ -6436,6 +6486,7 @@ mem_cgroup_css_online(struct cgroup *cont)
 		 */
 	} else {
 		res_counter_init(&memcg->res, NULL);
+		memcg->high = RESOURCE_MAX;
 		res_counter_init(&memcg->memsw, NULL);
 		res_counter_init(&memcg->kmem, NULL);
 		res_counter_init(&memcg->dcache, NULL);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cd97aed61d7e..ed0aadeec7f2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2731,6 +2731,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
 					   bool noswap)
 {
@@ -2741,7 +2742,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
-		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.order = 0,
 		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,
-- 
2.1.4




More information about the Devel mailing list