[Devel] [RFC] memory controller : background reclaim and avoid excessive locking [2/5] background reclaim.
KAMEZAWA Hiroyuki
kamezawa.hiroyu at jp.fujitsu.com
Thu Feb 14 00:29:10 PST 2008
A patch for background reclaim based on the high-low watermarks in res_counter.
The daemon is called "memcontd" here.
It implements the following:
* If res->usage rises above res->hwmark, memcontd is woken up.
* memcontd calls try_to_free_mem_cgroup_pages() to reclaim pages.
* memcontd stops once res->usage falls below res->lwmark.
More tunables could be added later, but there are no extra parameters for now.
(A sketch of the assumed res_counter watermark helpers follows.)
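Below is a minimal sketch of the res_counter watermark tests this patch relies on (they come from an earlier patch in this series); the hwmark/lwmark field names and the locking shown here are assumptions for illustration, not a quote of that patch:

static inline int res_counter_above_hwmark(struct res_counter *cnt)
{
	unsigned long flags;
	int ret;

	/* usage is updated under cnt->lock; take it for a stable read. */
	spin_lock_irqsave(&cnt->lock, flags);
	ret = cnt->usage > cnt->hwmark;		/* assumed field name */
	spin_unlock_irqrestore(&cnt->lock, flags);
	return ret;
}

static inline int res_counter_below_lwmark(struct res_counter *cnt)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&cnt->lock, flags);
	ret = cnt->usage < cnt->lwmark;		/* assumed field name */
	spin_unlock_irqrestore(&cnt->lock, flags);
	return ret;
}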
ChangeLog:
- start "memcontd" at the first write to hwmark.
(In the old version, it started at cgroup creation.)
- changed the "relax" logic in the memcontd daemon.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com>
mm/memcontrol.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 109 insertions(+), 3 deletions(-)
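For illustration, the new control files added below can be driven from userspace roughly as in this sketch; the mount point (/cgroups) and group name (grp0) are assumptions, not part of this patch:

/* Sketch: set low/high watermarks for an assumed group /cgroups/grp0. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_param(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Set lwmark first; the first write to hwmark starts "memcontd". */
	if (write_param("/cgroups/grp0/memory.lwmark_in_bytes", "100000000"))
		return 1;
	if (write_param("/cgroups/grp0/memory.hwmark_in_bytes", "120000000"))
		return 1;
	return 0;
}

Writing hwmark_in_bytes is what creates the per-group daemon, so a group that never sets a high watermark gets no daemon and pays no cost for this feature.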
Index: linux-2.6.24-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.24-mm1.orig/mm/memcontrol.c
+++ linux-2.6.24-mm1/mm/memcontrol.c
@@ -30,6 +30,8 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
@@ -136,6 +138,13 @@ struct mem_cgroup {
* statistics.
*/
struct mem_cgroup_stat stat;
+	/*
+	 * background reclaim.
+	 */
+	struct {
+		wait_queue_head_t waitq;
+		struct task_struct *kthread;
+	} daemon;
};
/*
@@ -504,6 +513,14 @@ long mem_cgroup_calc_reclaim_inactive(st
return (nr_inactive >> priority);
}
+static inline void mem_cgroup_schedule_daemon(struct mem_cgroup *mem)
+{
+	if (likely(mem->daemon.kthread) &&	/* can be NULL at boot */
+	    waitqueue_active(&mem->daemon.waitq))
+		wake_up_interruptible(&mem->daemon.waitq);
+}
+
+
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
@@ -658,6 +675,9 @@ retry:
congestion_wait(WRITE, HZ/10);
}
+	if (res_counter_above_hwmark(&mem->res))
+		mem_cgroup_schedule_daemon(mem);
+
atomic_set(&pc->ref_cnt, 1);
pc->mem_cgroup = mem;
pc->page = page;
@@ -762,6 +782,50 @@ void mem_cgroup_uncharge_page(struct pag
}
/*
+ * Background page reclaim routine for a cgroup.
+ */
+static int mem_cgroup_reclaim_daemon(void *data)
+{
+	DEFINE_WAIT(wait);
+	struct mem_cgroup *mem = data;
+
+	css_get(&mem->css);
+	current->flags |= PF_SWAPWRITE;
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		prepare_to_wait(&mem->daemon.waitq, &wait, TASK_INTERRUPTIBLE);
+		if (res_counter_below_lwmark(&mem->res)) {
+			if (!kthread_should_stop()) {
+				schedule();
+				try_to_freeze();
+			}
+			finish_wait(&mem->daemon.waitq, &wait);
+			continue;
+		}
+		finish_wait(&mem->daemon.waitq, &wait);
+		try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE);
+		/* Am I in a hurry? */
+		if (!res_counter_above_hwmark(&mem->res)) {
+			/*
+			 * Extra relaxing... memory reclaim is heavy work and
+			 * we don't know whether there is I/O congestion, so
+			 * just sleep rather than using congestion_wait().
+			 * HZ/10 is a widely used value under mm/. Use the
+			 * interruptible variant: it sets the task state, so
+			 * the timeout actually sleeps.
+			 */
+			schedule_timeout_interruptible(HZ/10);
+		} else {
+			/* Avoid monopolizing the CPU. */
+			yield();
+		}
+	}
+
+	css_put(&mem->css);
+	return 0;
+}
+
+
+/*
* Returns non-zero if a page (under migration) has valid page_cgroup member.
* Refcnt of page_cgroup is incremented.
*/
@@ -931,15 +995,40 @@ static ssize_t mem_cgroup_read(struct cg
NULL);
}
+static DEFINE_MUTEX(modify_param_mutex);
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
-	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos,
+	int ret;
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	mutex_lock(&modify_param_mutex);
+	/*
+	 * Attach a new background reclaim daemon. This must be done before
+	 * changing any values, for easier error handling.
+	 */
+	if (cft->private == RES_HWMARK &&
+	    !mem->daemon.kthread) {
+		struct task_struct *thr;
+		thr = kthread_run(mem_cgroup_reclaim_daemon, mem, "memcontd");
+		if (IS_ERR(thr)) {
+			ret = PTR_ERR(thr);
+			goto out;
+		}
+		mem->daemon.kthread = thr;
+	}
+	ret = res_counter_write(&mem->res, cft->private, userbuf, nbytes, ppos,
				mem_cgroup_write_strategy);
+
+	/* Even on error, don't stop the reclaim daemon here; leaving it
+	 * running is not a problem. */
+
+out:
+	mutex_unlock(&modify_param_mutex);
+	return ret;
}
+
+
static ssize_t mem_force_empty_write(struct cgroup *cont,
struct cftype *cft, struct file *file,
const char __user *userbuf,
@@ -1032,6 +1121,20 @@ static struct cftype mem_cgroup_files[]
.write = mem_cgroup_write,
.read = mem_cgroup_read,
},
+
+	{
+		.name = "lwmark_in_bytes",
+		.private = RES_LWMARK,
+		.write = mem_cgroup_write,
+		.read = mem_cgroup_read,
+	},
+	{
+		.name = "hwmark_in_bytes",
+		.private = RES_HWMARK,
+		.write = mem_cgroup_write,
+		.read = mem_cgroup_read,
+	},
+
{
.name = "failcnt",
.private = RES_FAILCNT,
@@ -1110,7 +1213,8 @@ mem_cgroup_create(struct cgroup_subsys *
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
-
+	init_waitqueue_head(&mem->daemon.waitq);
+	mem->daemon.kthread = NULL;
return &mem->css;
free_out:
for_each_node_state(node, N_POSSIBLE)
@@ -1125,6 +1229,8 @@ static void mem_cgroup_pre_destroy(struc
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
mem_cgroup_force_empty(mem);
+	if (mem->daemon.kthread)
+		kthread_stop(mem->daemon.kthread);
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,