[Devel] [PATCH RHEL7 COMMIT] memcg: add oom_guarantee

Konstantin Khorenko khorenko at virtuozzo.com
Thu May 28 09:03:35 PDT 2015


The commit has been pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.7
------>
commit 27b2d925e69623a220156bb8c30efd40b6e8c154
Author: Vladimir Davydov <vdavydov at parallels.com>
Date:   Thu May 28 20:03:35 2015 +0400

    memcg: add oom_guarantee
    
    Patchset description:
    
    This patch set adds a memory.oom_guarantee file to the memory cgroup,
    which allows protecting a memory cgroup from the OOM killer. It works
    as follows: the OOM killer first selects victims among processes in
    cgroups that are above their OOM guarantee, and only if there are no
    such processes does it switch to scanning processes from all cgroups.
    This behavior is similar to UB_OOMGUARPAGES.
    
    It also adds an OOM kill counter to each memory cgroup and
    synchronizes the beancounters' UB_OOMGUARPAGES resource with the
    oom_guarantee/oom_kill_cnt values obtained from the mem_cgroup.
    
    Related to https://jira.sw.ru/browse/PSBM-20089
    
    Vladimir Davydov (3):
      memcg: add oom_guarantee
      memcg: count oom kills
      memcg: sync UB_OOMGUARPAGES
    
    This patch description:
    
    The OOM guarantee works exactly like the low limit, but for OOM:
    tasks inside cgroups whose usage is above the guarantee are killed
    first.
    
    The value is read and written via memory.oom_guarantee; see the usage
    sketch below.
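    
    As a usage sketch (assuming the cgroup v1 memory controller is
    mounted at /sys/fs/cgroup/memory and a group named "ct1" already
    exists -- both assumptions for illustration, not part of this patch),
    the guarantee can be set and read back like any other memcg control
    file:
    
        #include <stdio.h>
    
        /* Hypothetical path; depends on where memcg is mounted. */
        #define OOM_GUAR "/sys/fs/cgroup/memory/ct1/memory.oom_guarantee"
    
        int main(void)
        {
            FILE *f;
            unsigned long long val;
    
            /* Protect the group from the OOM killer while its
             * memory+swap usage stays under 512M. */
            f = fopen(OOM_GUAR, "w");
            if (!f) {
                perror("open " OOM_GUAR);
                return 1;
            }
            fprintf(f, "%llu\n", 512ULL << 20);  /* 512M in bytes */
            if (fclose(f) != 0) {
                perror("write " OOM_GUAR);
                return 1;
            }
    
            f = fopen(OOM_GUAR, "r");
            if (!f || fscanf(f, "%llu", &val) != 1) {
                fprintf(stderr, "read %s failed\n", OOM_GUAR);
                return 1;
            }
            fclose(f);
    
            printf("oom_guarantee = %llu bytes\n", val);
            return 0;
        }
    
    Since the write handler uses res_counter_memparse_write_strategy()
    (see the diff below), suffixed values such as "512M" should be
    accepted as well.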
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/memcontrol.h |  6 +++
 include/linux/oom.h        |  2 +-
 mm/memcontrol.c            | 97 +++++++++++++++++++++++++++++++++++++++++++++-
 mm/oom_kill.c              | 14 ++++++-
 4 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f5b3031..98bd35f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -120,6 +120,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
@@ -342,6 +343,11 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
+{
+	return false;
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index c13af3f..17100d0 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -67,7 +67,7 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill);
+		bool force_kill, bool ignore_memcg_guarantee);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bfc081c..1849b48 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,6 +292,8 @@ struct mem_cgroup {
 	atomic_long_t mem_failcnt;
 	atomic_long_t swap_failcnt;
 
+	unsigned long long oom_guarantee;
+
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -1558,6 +1560,51 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
 	return true;
 }
 
+static bool __mem_cgroup_below_oom_guarantee(struct mem_cgroup *root,
+					     struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return false;
+
+	if (memcg == root_mem_cgroup)
+		return false;
+
+	if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
+					memcg->oom_guarantee)
+		return false;
+
+	while (memcg != root) {
+		memcg = parent_mem_cgroup(memcg);
+		if (!memcg)
+			break;
+
+		if (memcg == root_mem_cgroup)
+			break;
+
+		if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
+						memcg->oom_guarantee)
+			return false;
+	}
+	return true;
+}
+
+bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
+{
+	struct mem_cgroup *memcg = NULL;
+	bool ret = false;
+
+	p = find_lock_task_mm(p);
+	if (p) {
+		memcg = try_get_mem_cgroup_from_mm(p->mm);
+		task_unlock(p);
+	}
+	if (memcg) {
+		ret = __mem_cgroup_below_oom_guarantee(root_mem_cgroup, memcg);
+		css_put(&memcg->css);
+	}
+	return ret;
+}
+
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
@@ -1846,6 +1893,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long totalpages;
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
+	bool ignore_memcg_guarantee = false;
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -1859,15 +1907,20 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
 	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+retry:
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct cgroup *cgroup = iter->css.cgroup;
 		struct cgroup_iter it;
 		struct task_struct *task;
 
+		if (!ignore_memcg_guarantee &&
+		    __mem_cgroup_below_oom_guarantee(memcg, iter))
+			continue;
+
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
 			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false)) {
+							false, true)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
@@ -1898,8 +1951,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		cgroup_iter_end(cgroup, &it);
 	}
 
-	if (!chosen)
+	if (!chosen) {
+		if (!ignore_memcg_guarantee) {
+			ignore_memcg_guarantee = true;
+			goto retry;
+		}
 		return;
+	}
 	points = chosen_points * 1000 / totalpages;
 	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
@@ -5091,6 +5149,36 @@ static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
 	return 0;
 }
 
+static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
+		struct cftype *cft, struct file *file, char __user *buf,
+		size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n", memcg->oom_guarantee);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_oom_guarantee_write(struct cgroup *cont,
+		struct cftype *cft, const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long long val;
+	int ret;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EINVAL;
+
+	ret = res_counter_memparse_write_strategy(buffer, &val);
+	if (ret)
+		return ret;
+
+	memcg->oom_guarantee = val;
+	return 0;
+}
+
 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
 {
@@ -6001,6 +6089,11 @@ static struct cftype mem_cgroup_files[] = {
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
 	{
+		.name = "oom_guarantee",
+		.write_string = mem_cgroup_oom_guarantee_write,
+		.read = mem_cgroup_oom_guarantee_read,
+	},
+	{
 		.name = "pressure_level",
 		.register_event = vmpressure_register_event,
 		.unregister_event = vmpressure_unregister_event,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 61c8693..a6928b4 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 
 enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+		bool force_kill, bool ignore_memcg_guarantee)
 {
 	if (task->exit_state)
 		return OOM_SCAN_CONTINUE;
@@ -291,6 +291,10 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
 			return OOM_SCAN_ABORT;
 	}
+
+	if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task))
+		return OOM_SCAN_CONTINUE;
+
 	return OOM_SCAN_OK;
 }
 
@@ -307,13 +311,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
+	bool ignore_memcg_guarantee = false;
 
 	rcu_read_lock();
+retry:
 	for_each_process_thread(g, p) {
 		unsigned int points;
 
 		switch (oom_scan_process_thread(p, totalpages, nodemask,
-						force_kill)) {
+					force_kill, ignore_memcg_guarantee)) {
 		case OOM_SCAN_SELECT:
 			chosen = p;
 			chosen_points = ULONG_MAX;
@@ -334,6 +340,10 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 	}
 	if (chosen)
 		get_task_struct(chosen);
+	else if (!ignore_memcg_guarantee) {
+		ignore_memcg_guarantee = true;
+		goto retry;
+	}
 	rcu_read_unlock();
 
 	*ppoints = chosen_points * 1000 / totalpages;
