[Devel] [RFC 3/4] memcg: background reclaim
KAMEZAWA Hiroyuki
kamezawa.hiroyu at jp.fujitsu.com
Mon May 26 22:08:46 PDT 2008
Do background reclaim based on high-low watermarks.
This feature helps processes under a memcg run smoothly by reclaiming memory
in a kernel thread: the number of limit failures at mem_cgroup_charge() is
dramatically reduced. The trade-off is that a CPU is used continuously for
reclaiming memory.
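For illustration only (a standalone user-space sketch, not kernel code and
not part of this patch; the numbers are made up), the two watermarks give
hysteresis like this:

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	const long hwmark = 90, lwmark = 70;	/* arbitrary units */
	const long usage[] = { 60, 85, 95, 80, 72, 65, 85 };
	bool reclaiming = false;

	for (int i = 0; i < 7; i++) {
		if (usage[i] > hwmark)
			reclaiming = true;	/* daemon is woken up */
		else if (usage[i] < lwmark)
			reclaiming = false;	/* daemon goes back to sleep */
		printf("usage=%ld reclaiming=%d\n", usage[i], reclaiming);
	}
	return 0;
}

Note that usage between the two watermarks does not change the daemon's
state; reclaim only starts above hwmark and only stops below lwmark.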
This implementation is very simple. We will need to update it whenever new
complexity is added to memcg.
Major logic:
- add high-low watermark support to the memory resource controller.
  (See the usage sketch after this list.)
- create a kernel thread for the cgroup when hwmark is first written. (once)
- stop the kernel thread at rmdir().
- start background reclaim if the res_counter is over the high watermark.
- stop background reclaim if the res_counter is below the low watermark.
- for reclaiming, just call try_to_free_mem_cgroup_pages().
- the reclaim kthread's priority is nice(0); kernel threads default to (-5).
  (would weaker be better?)
- the reclaim kthread calls yield() on each loop iteration.
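Usage sketch (not part of this patch): assuming the cgroup is mounted at
/cgroup/grp0 and the new files appear with the usual "memory." prefix, the
watermarks could be set from user space as below. The mount point and byte
values are assumptions for the example.

#include <stdio.h>

int main(void)
{
	FILE *f;

	/* wake the reclaim daemon once usage exceeds 96MB ... */
	f = fopen("/cgroup/grp0/memory.high_wmark_in_bytes", "w");
	if (!f)
		return 1;
	fprintf(f, "%lu", 96UL * 1024 * 1024);
	fclose(f);

	/* ... and let it sleep again once usage drops below 88MB */
	f = fopen("/cgroup/grp0/memory.low_wmark_in_bytes", "w");
	if (!f)
		return 1;
	fprintf(f, "%lu", 88UL * 1024 * 1024);
	fclose(f);
	return 0;
}

Per the patch, the first successful write of high_wmark_in_bytes is also
what starts the per-cgroup daemon.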
TODO:
- add an interface to start/stop the daemon?
- wiser NUMA support.
- a too-small low-watermark target just consumes CPU. Should we warn?
  And what are the best values for hwmark/lwmark in general?
Changelog: old one -> this (v1)
- start the thread when hwmark is written.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com>
---
mm/memcontrol.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 130 insertions(+), 17 deletions(-)
Index: mm-2.6.26-rc2-mm1/mm/memcontrol.c
===================================================================
--- mm-2.6.26-rc2-mm1.orig/mm/memcontrol.c
+++ mm-2.6.26-rc2-mm1/mm/memcontrol.c
@@ -32,7 +32,8 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
-
+#include <linux/freezer.h>
+#include <linux/kthread.h>
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
@@ -119,10 +120,6 @@ struct mem_cgroup_lru_info {
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
- * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark. May be even add a low water mark, such that
- * no reclaim occurs from a cgroup at it's low water mark, this is
- * a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
@@ -131,6 +128,13 @@ struct mem_cgroup {
*/
struct res_counter res;
/*
+ * background reclaim.
+ */
+ struct {
+ wait_queue_head_t waitq;
+ struct task_struct *thread;
+ } daemon;
+ /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
@@ -143,6 +147,7 @@ struct mem_cgroup {
struct mem_cgroup_stat stat;
};
static struct mem_cgroup init_mem_cgroup;
+static DEFINE_MUTEX(memcont_daemon_lock);
/*
* We use the lower bit of the page->page_cgroup pointer as a bit spin
@@ -374,6 +379,15 @@ void mem_cgroup_move_lists(struct page *
unlock_page_cgroup(page);
}
+static void mem_cgroup_schedule_reclaim(struct mem_cgroup *mem)
+{
+ if (!mem->daemon.thread)
+ return;
+ if (!waitqueue_active(&mem->daemon.waitq))
+ return;
+ wake_up_interruptible(&mem->daemon.waitq);
+}
+
/*
* Calculate mapped_ratio under memory controller. This will be used in
* vmscan.c for deteremining we have to reclaim mapped pages.
@@ -532,6 +546,7 @@ static int mem_cgroup_charge_common(stru
unsigned long flags;
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup_per_zone *mz;
+ enum res_state state;
if (mem_cgroup_subsys.disabled)
return 0;
@@ -558,23 +573,23 @@ static int mem_cgroup_charge_common(stru
mem = memcg;
css_get(&memcg->css);
}
-
- while (res_counter_charge(&mem->res, PAGE_SIZE) == RES_OVER_LIMIT) {
+retry:
+ state = res_counter_charge(&mem->res, PAGE_SIZE);
+ if (state == RES_OVER_LIMIT) {
if (!(gfp_mask & __GFP_WAIT))
goto out;
-
if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
- continue;
-
+ goto retry;
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
+ * try_to_free_mem_cgroup_pages() might not give us a
+ * full picture of reclaim. Some pages are reclaimed
+ * and might be moved to swap cache or just unmapped
+ * from the cgroup. Check the limit again to see if
+ * the reclaim reduced the current usage of the cgroup
+ * before giving up
*/
if (res_counter_check_under_limit(&mem->res))
- continue;
+ goto retry;
if (!nr_retries--) {
mem_cgroup_out_of_memory(mem, gfp_mask);
@@ -609,6 +624,9 @@ static int mem_cgroup_charge_common(stru
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(page);
+
+ if (state > RES_BELOW_HIGH)
+ mem_cgroup_schedule_reclaim(mem);
done:
return 0;
out:
@@ -891,6 +909,74 @@ out:
css_put(&mem->css);
return ret;
}
+/*
+ * background reclaim daemon.
+ */
+static int mem_cgroup_reclaim_daemon(void *data)
+{
+ DEFINE_WAIT(wait);
+ struct mem_cgroup *mem = data;
+ int ret;
+
+ css_get(&mem->css);
+ current->flags |= PF_SWAPWRITE;
+ /* we don't want to use cpu too much. */
+ set_user_nice(current, 0);
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ prepare_to_wait(&mem->daemon.waitq, &wait, TASK_INTERRUPTIBLE);
+ if (res_counter_state(&mem->res) == RES_BELOW_LOW) {
+ if (!kthread_should_stop()) {
+ schedule();
+ try_to_freeze();
+ }
+ finish_wait(&mem->daemon.waitq, &wait);
+ continue;
+ }
+ finish_wait(&mem->daemon.waitq, &wait);
+ /*
+ * memory resource controller doesn't see NUMA memory usage
+ * balancing, because we cannot know which balancing is good.
+ * TODO: some annotation or heuristics to detect which node
+ * we should start reclaim from.
+ */
+ ret = try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE);
+
+ yield();
+ }
+ css_put(&mem->css);
+ return 0;
+}
+
+static int mem_cgroup_start_daemon(struct mem_cgroup *mem)
+{
+ int ret = 0;
+ struct task_struct *thr;
+
+ mutex_lock(&memcont_daemon_lock);
+ if (!mem->daemon.thread) {
+ thr = kthread_run(mem_cgroup_reclaim_daemon, mem, "memcontd");
+ if (IS_ERR(thr))
+ ret = PTR_ERR(thr);
+ else
+ mem->daemon.thread = thr;
+ }
+ mutex_unlock(&memcont_daemon_lock);
+ return ret;
+}
+
+static void mem_cgroup_stop_daemon(struct mem_cgroup *mem)
+{
+ mutex_lock(&memcont_daemon_lock);
+ if (mem->daemon.thread) {
+ kthread_stop(mem->daemon.thread);
+ mem->daemon.thread = NULL;
+ }
+ mutex_unlock(&memcont_daemon_lock);
+ return;
+}
+
static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
@@ -915,6 +1001,19 @@ static ssize_t mem_cgroup_write(struct c
struct file *file, const char __user *userbuf,
size_t nbytes, loff_t *ppos)
{
+ int ret;
+ /*
+ * Starting the daemon can fail, but the daemon must be running
+ * whenever a change to HWMARK has succeeded. So, start the daemon
+ * before changing HWMARK, and don't stop it even if
+ * res_counter_write() fails; stopping it there would need ugly
+ * code, and leaving it running is not a big problem.
+ */
+ if (cft->private == RES_HWMARK) {
+ ret = mem_cgroup_start_daemon(mem_cgroup_from_cont(cont));
+ if (ret)
+ return ret;
+ }
return res_counter_write(&mem_cgroup_from_cont(cont)->res,
cft->private, userbuf, nbytes, ppos,
mem_cgroup_write_strategy);
@@ -1004,6 +1103,18 @@ static struct cftype mem_cgroup_files[]
.read_u64 = mem_cgroup_read,
},
{
+ .name = "high_wmark_in_bytes",
+ .private = RES_HWMARK,
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "low_wmark_in_bytes",
+ .private = RES_LWMARK,
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
.name = "force_empty",
.trigger = mem_force_empty_write,
},
@@ -1087,7 +1198,8 @@ mem_cgroup_create(struct cgroup_subsys *
return ERR_PTR(-ENOMEM);
}
- res_counter_init(&mem->res);
+ res_counter_init_wmark(&mem->res);
+ init_waitqueue_head(&mem->daemon.waitq);
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -1106,6 +1218,7 @@ static void mem_cgroup_pre_destroy(struc
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ mem_cgroup_stop_daemon(mem);
mem_cgroup_force_empty(mem);
}