[Devel] [PATCH 09/12] Move mmap checkpoint/restart into mm/mmap.c

Matt Helsley matthltc at us.ibm.com
Fri Feb 26 00:45:10 PST 2010


Moving the memory pieces is more complicated because portions of them
are shared with ipc. Split the mm header bits into a separate header
file, include/linux/mm_checkpoint.h, so it's clear that these are the
only pieces we need in ipc/.
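
For example, each ipc/ checkpoint file previously pulled in both of the
checkpoint headers; after the split a single include suffices, since
mm_checkpoint.h itself includes checkpoint.h and checkpoint_hdr.h (see
the ipc/ hunks below):

	-#include <linux/checkpoint.h>
	-#include <linux/checkpoint_hdr.h>
	+#include <linux/mm_checkpoint.h>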

Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
---
 checkpoint/Makefile           |    3 +-
 checkpoint/memory.c           | 1364 -----------------------------------------
 checkpoint/process.c          |    1 +
 checkpoint/sys.c              |    1 +
 include/linux/checkpoint.h    |   50 +--
 include/linux/mm.h            |   18 +
 include/linux/mm_checkpoint.h |   45 ++
 include/linux/proc_fs.h       |   19 -
 ipc/checkpoint.c              |    3 +-
 ipc/checkpoint_msg.c          |    3 +-
 ipc/checkpoint_sem.c          |    3 +-
 ipc/checkpoint_shm.c          |    3 +-
 ipc/shm.c                     |    2 +-
 mm/filemap.c                  |    2 +-
 mm/mmap.c                     | 1348 ++++++++++++++++++++++++++++++++++++++++
 mm/shmem.c                    |    2 +-
 16 files changed, 1424 insertions(+), 1443 deletions(-)
 delete mode 100644 checkpoint/memory.c
 create mode 100644 include/linux/mm_checkpoint.h

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 5bc8468..9571af3 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -8,5 +8,4 @@ obj-$(CONFIG_CHECKPOINT) += \
 	checkpoint.o \
 	restart.o \
 	process.o \
-	namespace.o \
-	memory.o \
+	namespace.o
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
deleted file mode 100644
index e0b3b54..0000000
--- a/checkpoint/memory.c
+++ /dev/null
@@ -1,1364 +0,0 @@
-/*
- *  Checkpoint/restart memory contents
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DMEM
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/aio.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/mm_types.h>
-#include <linux/shm.h>
-#include <linux/proc_fs.h>
-#include <linux/swap.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-
-/*
- * page-array chains: each ckpt_pgarr describes a set of <struct page *,vaddr>
- * tuples (where vaddr is the virtual address of a page in a particular mm).
- * Specifically, we use separate arrays so that all vaddrs can be written
- * and read at once.
- */
-
-struct ckpt_pgarr {
-	unsigned long *vaddrs;
-	struct page **pages;
-	unsigned int nr_used;
-	struct list_head list;
-};
-
-#define CKPT_PGARR_TOTAL  (PAGE_SIZE / sizeof(void *))
-#define CKPT_PGARR_BATCH  (16 * CKPT_PGARR_TOTAL)
-
-static inline int pgarr_is_full(struct ckpt_pgarr *pgarr)
-{
-	return (pgarr->nr_used == CKPT_PGARR_TOTAL);
-}
-
-static inline int pgarr_nr_free(struct ckpt_pgarr *pgarr)
-{
-	return CKPT_PGARR_TOTAL - pgarr->nr_used;
-}
-
-/*
- * utilities to alloc, free, and handle 'struct ckpt_pgarr' (page-arrays)
- * (common to ckpt_mem.c and rstr_mem.c).
- *
- * The checkpoint context structure has two members for page-arrays:
- *   ctx->pgarr_list: list head of populated page-array chain
- *   ctx->pgarr_pool: list head of empty page-array pool chain
- *
- * During checkpoint (and restart) the chain tracks the dirty pages (page
- * pointer and virtual address) of each MM. For a particular MM, these are
- * always added to the head of the page-array chain (ctx->pgarr_list).
- * Before the next chunk of pages, the chain is reset (by dereferencing
- * all pages) but not freed; instead, empty descriptors are kept in pool.
- *
- * The head of the chain page-array ("current") advances as necessary. When
- * it gets full, a new page-array descriptor is pushed in front of it. The
- * new descriptor is taken from first empty descriptor (if one exists, for
- * instance, after a chain reset), or allocated on-demand.
- *
- * When dumping the data, the chain is traversed in reverse order.
- */
-
-/* return first page-array in the chain */
-static inline struct ckpt_pgarr *pgarr_first(struct ckpt_ctx *ctx)
-{
-	if (list_empty(&ctx->pgarr_list))
-		return NULL;
-	return list_first_entry(&ctx->pgarr_list, struct ckpt_pgarr, list);
-}
-
-/* return (and detach) first empty page-array in the pool, if exists */
-static inline struct ckpt_pgarr *pgarr_from_pool(struct ckpt_ctx *ctx)
-{
-	struct ckpt_pgarr *pgarr;
-
-	if (list_empty(&ctx->pgarr_pool))
-		return NULL;
-	pgarr = list_first_entry(&ctx->pgarr_pool, struct ckpt_pgarr, list);
-	list_del(&pgarr->list);
-	return pgarr;
-}
-
-/* release pages referenced by a page-array */
-static void pgarr_release_pages(struct ckpt_pgarr *pgarr)
-{
-	ckpt_debug("total pages %d\n", pgarr->nr_used);
-	/*
-	 * both checkpoint and restart use 'nr_used', however we only
-	 * collect pages during checkpoint; in restart we simply return
-	 * because pgarr->pages remains NULL.
-	 */
-	if (pgarr->pages) {
-		struct page **pages = pgarr->pages;
-		int nr = pgarr->nr_used;
-
-		while (nr--)
-			page_cache_release(pages[nr]);
-	}
-
-	pgarr->nr_used = 0;
-}
-
-/* free a single page-array object */
-static void pgarr_free_one(struct ckpt_pgarr *pgarr)
-{
-	pgarr_release_pages(pgarr);
-	kfree(pgarr->pages);
-	kfree(pgarr->vaddrs);
-	kfree(pgarr);
-}
-
-/* free the chains of page-arrays (populated and empty pool) */
-void ckpt_pgarr_free(struct ckpt_ctx *ctx)
-{
-	struct ckpt_pgarr *pgarr, *tmp;
-
-	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_list, list) {
-		list_del(&pgarr->list);
-		pgarr_free_one(pgarr);
-	}
-
-	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_pool, list) {
-		list_del(&pgarr->list);
-		pgarr_free_one(pgarr);
-	}
-}
-
-/* allocate a single page-array object */
-static struct ckpt_pgarr *pgarr_alloc_one(unsigned long flags)
-{
-	struct ckpt_pgarr *pgarr;
-
-	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
-	if (!pgarr)
-		return NULL;
-	pgarr->vaddrs = kmalloc(CKPT_PGARR_TOTAL * sizeof(unsigned long),
-				GFP_KERNEL);
-	if (!pgarr->vaddrs)
-		goto nomem;
-
-	/* pgarr->pages is needed only for checkpoint */
-	if (flags & CKPT_CTX_CHECKPOINT) {
-		pgarr->pages = kmalloc(CKPT_PGARR_TOTAL *
-				       sizeof(struct page *), GFP_KERNEL);
-		if (!pgarr->pages)
-			goto nomem;
-	}
-
-	return pgarr;
- nomem:
-	pgarr_free_one(pgarr);
-	return NULL;
-}
-
-/* pgarr_current - return the next available page-array in the chain
- * @ctx: checkpoint context
- *
- * Returns the first page-array in the list that has space. Otherwise,
- * try the next page-array after the last non-empty one, and move it to
- * the front of the chain. Extends the list if none has space.
- */
-static struct ckpt_pgarr *pgarr_current(struct ckpt_ctx *ctx)
-{
-	struct ckpt_pgarr *pgarr;
-
-	pgarr = pgarr_first(ctx);
-	if (pgarr && !pgarr_is_full(pgarr))
-		return pgarr;
-
-	pgarr = pgarr_from_pool(ctx);
-	if (!pgarr)
-		pgarr = pgarr_alloc_one(ctx->kflags);
-	if (!pgarr)
-		return NULL;
-
-	list_add(&pgarr->list, &ctx->pgarr_list);
-	return pgarr;
-}
-
-/* reset the page-array chain (dropping page references if necessary) */
-static void pgarr_reset_all(struct ckpt_ctx *ctx)
-{
-	struct ckpt_pgarr *pgarr;
-
-	list_for_each_entry(pgarr, &ctx->pgarr_list, list)
-		pgarr_release_pages(pgarr);
-	list_splice_init(&ctx->pgarr_list, &ctx->pgarr_pool);
-}
-
-/**************************************************************************
- * Checkpoint
- *
- * Checkpoint is outside the context of the checkpointee, so one cannot
- * simply read pages from user-space. Instead, we scan the address space
- * of the target to cherry-pick pages of interest. Selected pages are
- * enlisted in a page-array chain (attached to the checkpoint context).
- * To save their contents, each page is mapped to kernel memory and then
- * dumped to the file descriptor.
- */
-
-/**
- * consider_private_page - return page pointer for dirty pages
- * @vma - target vma
- * @addr - page address
- *
- * Looks up the page that corresponds to the address in the vma, and
- * returns the page if it was modified (and grabs a reference to it),
- * or otherwise returns NULL (or error).
- */
-static struct page *consider_private_page(struct vm_area_struct *vma,
-					  unsigned long addr)
-{
-	return __get_dirty_page(vma, addr);
-}
-
-/**
- * consider_shared_page - return page pointer for dirty pages
- * @ino - inode of shmem object
- * @idx - page index in shmem object
- *
- * Looks up the page that corresponds to the index in the shmem object,
- * and returns the page if it was modified (and grabs a reference to it),
- * or otherwise returns NULL (or error).
- */
-static struct page *consider_shared_page(struct inode *ino, unsigned long idx)
-{
-	struct page *page = NULL;
-	int ret;
-
-	/*
-	 * Inspired by do_shmem_file_read(): very simplified version.
-	 *
-	 * FIXME: consolidate with do_shmem_file_read()
-	 */
-
-	ret = shmem_getpage(ino, idx, &page, SGP_READ, NULL);
-	if (ret < 0)
-		return ERR_PTR(ret);
-
-	/*
-	 * Only care about dirty pages; shmem_getpage() only returns
-	 * pages that have been allocated, so they must be dirty. The
-	 * pages returned are locked and referenced.
-	 */
-
-	if (page) {
-		unlock_page(page);
-		/*
-		 * If users can be writing to this page using arbitrary
-		 * virtual addresses, take care about potential aliasing
-		 * before reading the page on the kernel side.
-		 */
-		if (mapping_writably_mapped(ino->i_mapping))
-			flush_dcache_page(page);
-		/*
-		 * Mark the page accessed if we read the beginning.
-		 */
-		mark_page_accessed(page);
-	}
-
-	return page;
-}
-
-/**
- * vma_fill_pgarr - fill a page-array with addr/page tuples
- * @ctx - checkpoint context
- * @vma - vma to scan
- * @start - start address (updated)
- *
- * Returns the number of pages collected
- */
-static int vma_fill_pgarr(struct ckpt_ctx *ctx,
-			  struct vm_area_struct *vma, struct inode *inode,
-			  unsigned long *start, unsigned long end)
-{
-	unsigned long addr = *start;
-	struct ckpt_pgarr *pgarr;
-	int nr_used;
-	int cnt = 0;
-
-	BUG_ON(inode && vma);
-
-	if (vma)
-		down_read(&vma->vm_mm->mmap_sem);
-	do {
-		pgarr = pgarr_current(ctx);
-		if (!pgarr) {
-			cnt = -ENOMEM;
-			goto out;
-		}
-
-		nr_used = pgarr->nr_used;
-
-		while (addr < end) {
-			struct page *page;
-
-			if (vma)
-				page = consider_private_page(vma, addr);
-			else
-				page = consider_shared_page(inode, addr);
-
-			if (IS_ERR(page)) {
-				cnt = PTR_ERR(page);
-				goto out;
-			}
-
-			if (page) {
-				_ckpt_debug(CKPT_DPAGE,
-					    "got page %#lx\n", addr);
-				pgarr->pages[pgarr->nr_used] = page;
-				pgarr->vaddrs[pgarr->nr_used] = addr;
-				pgarr->nr_used++;
-			}
-
-			if (vma)
-				addr += PAGE_SIZE;
-			else
-				addr++;
-
-			if (pgarr_is_full(pgarr))
-				break;
-		}
-
-		cnt += pgarr->nr_used - nr_used;
-
-	} while ((cnt < CKPT_PGARR_BATCH) && (addr < end));
- out:
-	if (vma)
-		up_read(&vma->vm_mm->mmap_sem);
-	*start = addr;
-	return cnt;
-}
-
-/* dump the contents of a page: use kmap_atomic() to avoid TLB flush */
-int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page)
-{
-	void *ptr;
-
-	ptr = kmap_atomic(page, KM_USER1);
-	memcpy(ctx->scratch_page, ptr, PAGE_SIZE);
-	kunmap_atomic(ptr, KM_USER1);
-
-	return ckpt_kwrite(ctx, ctx->scratch_page, PAGE_SIZE);
-}
-
-/**
- * vma_dump_pages - dump pages listed in the ctx page-array chain
- * @ctx - checkpoint context
- * @total - total number of pages
- *
- * First dump all virtual addresses, followed by the contents of all pages
- */
-static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
-{
-	struct ckpt_pgarr *pgarr;
-	int i, ret = 0;
-
-	if (!total)
-		return 0;
-
-	i =  total * (sizeof(unsigned long) + PAGE_SIZE);
-	ret = ckpt_write_obj_type(ctx, NULL, i, CKPT_HDR_BUFFER);
-	if (ret < 0)
-		return ret;
-
-	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
-		ret = ckpt_kwrite(ctx, pgarr->vaddrs,
-				  pgarr->nr_used * sizeof(unsigned long));
-		if (ret < 0)
-			return ret;
-	}
-
-	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
-		for (i = 0; i < pgarr->nr_used; i++) {
-			ret = checkpoint_dump_page(ctx, pgarr->pages[i]);
-			if (ret < 0)
-				return ret;
-		}
-	}
-
-	return ret;
-}
-
-/**
- * checkpoint_memory_contents - dump contents of a memory region
- * @ctx - checkpoint context
- * @vma - vma to scan (--or--)
- * @inode - inode to scan
- *
- * Collect lists of pages that need to be dumped, and corresponding
- * virtual addresses into ctx->pgarr_list page-array chain. Then dump
- * the addresses, followed by the page contents.
- */
-int checkpoint_memory_contents(struct ckpt_ctx *ctx,
-			       struct vm_area_struct *vma,
-			       struct inode *inode)
-{
-	struct ckpt_hdr_pgarr *h;
-	unsigned long addr, end;
-	int cnt, ret;
-
-	BUG_ON(vma && inode);
-
-	if (vma) {
-		addr = vma->vm_start;
-		end = vma->vm_end;
-	} else {
-		addr = 0;
-		end = PAGE_ALIGN(i_size_read(inode)) >> PAGE_CACHE_SHIFT;
-	}
-
-	/*
-	 * Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
- * in each round. Each iteration is divided into two steps:
-	 *
-	 * (1) scan: scan through the PTEs of the vma to collect the pages
-	 * to dump (later we'll also make them COW), while keeping a list
-	 * of pages and their corresponding addresses on ctx->pgarr_list.
-	 *
-	 * (2) dump: write out a header specifying how many pages, followed
-	 * by the addresses of all pages in ctx->pgarr_list, followed by
-	 * the actual contents of all pages. (Then, release the references
-	 * to the pages and reset the page-array chain).
-	 *
-	 * (This split makes the logic simpler by first counting the pages
-	 * that need saving. More importantly, it allows for a future
-	 * optimization that will reduce application downtime by deferring
-	 * the actual write-out of the data to after the application is
-	 * allowed to resume execution).
-	 *
-	 * After dumping the entire contents, conclude with a header that
-	 * specifies 0 pages to mark the end of the contents.
-	 */
-
-	while (addr < end) {
-		cnt = vma_fill_pgarr(ctx, vma, inode, &addr, end);
-		if (cnt == 0)
-			break;
-		else if (cnt < 0)
-			return cnt;
-
-		ckpt_debug("collected %d pages\n", cnt);
-
-		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
-		if (!h)
-			return -ENOMEM;
-
-		h->nr_pages = cnt;
-		ret = ckpt_write_obj(ctx, &h->h);
-		ckpt_hdr_put(ctx, h);
-		if (ret < 0)
-			return ret;
-
-		ret = vma_dump_pages(ctx, cnt);
-		if (ret < 0)
-			return ret;
-
-		pgarr_reset_all(ctx);
-	}
-
-	/* mark end of contents with header saying "0" pages */
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
-	if (!h)
-		return -ENOMEM;
-	h->nr_pages = 0;
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-
-	return ret;
-}
-
-/**
- * generic_vma_checkpoint - dump metadata of vma
- * @ctx: checkpoint context
- * @vma: vma object
- * @type: vma type
- * @vma_objref: vma objref
- */
-int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
-			   enum vma_type type, int vma_objref, int ino_objref)
-{
-	struct ckpt_hdr_vma *h;
-	int ret;
-
-	ckpt_debug("vma %#lx-%#lx flags %#lx type %d\n",
-		 vma->vm_start, vma->vm_end, vma->vm_flags, type);
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_VMA);
-	if (!h)
-		return -ENOMEM;
-
-	h->vma_type = type;
-	h->vma_objref = vma_objref;
-	h->ino_objref = ino_objref;
-
-	if (vma->vm_file)
-		h->ino_size = i_size_read(vma->vm_file->f_dentry->d_inode);
-	else
-		h->ino_size = 0;
-
-	h->vm_start = vma->vm_start;
-	h->vm_end = vma->vm_end;
-	h->vm_page_prot = pgprot_val(vma->vm_page_prot);
-	h->vm_flags = vma->vm_flags;
-	h->vm_pgoff = vma->vm_pgoff;
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-
-	return ret;
-}
-
-/**
- * private_vma_checkpoint - dump contents of private (anon, file) vma
- * @ctx: checkpoint context
- * @vma: vma object
- * @type: vma type
- * @vma_objref: vma objref
- */
-int private_vma_checkpoint(struct ckpt_ctx *ctx,
-			   struct vm_area_struct *vma,
-			   enum vma_type type, int vma_objref)
-{
-	int ret;
-
-	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
-
-	ret = generic_vma_checkpoint(ctx, vma, type, vma_objref, 0);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_memory_contents(ctx, vma, NULL);
- out:
-	return ret;
-}
-
-/**
- * shmem_vma_checkpoint - dump contents of shared (shmem) vma
- * @ctx: checkpoint context
- * @vma: vma object
- * @type: vma type
- * @ino_objref: inode object id
- */
-int shmem_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
-			 enum vma_type type, int ino_objref)
-{
-	struct file *file = vma->vm_file;
-	int ret;
-
-	ckpt_debug("type %d, ino_ref %d\n", type, ino_objref);
-	BUG_ON(!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)));
-	BUG_ON(!file);
-
-	ret = generic_vma_checkpoint(ctx, vma, type, 0, ino_objref);
-	if (ret < 0)
-		goto out;
-	if (type == CKPT_VMA_SHM_ANON_SKIP)
-		goto out;
-	ret = checkpoint_memory_contents(ctx, NULL, file->f_dentry->d_inode);
- out:
-	return ret;
-}
-
-/**
- * anonymous_checkpoint - dump contents of private-anonymous vma
- * @ctx: checkpoint context
- * @vma: vma object
- */
-static int anonymous_checkpoint(struct ckpt_ctx *ctx,
-				struct vm_area_struct *vma)
-{
-	/* should be private anonymous ... verify that this is the case */
-	BUG_ON(vma->vm_flags & VM_MAYSHARE);
-	BUG_ON(vma->vm_file);
-
-	return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
-}
-
-static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
-	struct vm_area_struct *vma, *next;
-	int map_count = 0;
-	int ret = 0;
-
-	vma = kzalloc(sizeof(*vma), GFP_KERNEL);
-	if (!vma)
-		return -ENOMEM;
-
-	/*
-	 * Must not hold mm->mmap_sem when writing to image file, so
-	 * can't simply traverse the vma list. Instead, use find_vma()
-	 * to get the @next and make a local "copy" of it.
-	 */
-	while (1) {
-		down_read(&mm->mmap_sem);
-		next = find_vma(mm, vma->vm_end);
-		if (!next) {
-			up_read(&mm->mmap_sem);
-			break;
-		}
-		if (vma->vm_file)
-			fput(vma->vm_file);
-		*vma = *next;
-		if (vma->vm_file)
-			get_file(vma->vm_file);
-		up_read(&mm->mmap_sem);
-
-		map_count++;
-
-		ckpt_debug("vma %#lx-%#lx flags %#lx\n",
-			 vma->vm_start, vma->vm_end, vma->vm_flags);
-
-		if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
-			ckpt_err(ctx, -ENOSYS, "%(T)vma: bad flags (%#lx)\n",
-					vma->vm_flags);
-			ret = -ENOSYS;
-			break;
-		}
-
-		if (!vma->vm_ops)
-			ret = anonymous_checkpoint(ctx, vma);
-		else if (vma->vm_ops->checkpoint)
-			ret = (*vma->vm_ops->checkpoint)(ctx, vma);
-		else
-			ret = -ENOSYS;
-		if (ret < 0) {
-			ckpt_err(ctx, ret, "%(T)vma: failed\n");
-			break;
-		}
-		/*
-		 * The file was collected, but not always checkpointed;
-		 * be safe and mark as visited to appease leak detection
-		 */
-		if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
-			ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
-			if (ret < 0)
-				break;
-		}
-	}
-
-	if (vma->vm_file)
-		fput(vma->vm_file);
-
-	kfree(vma);
-
-	return ret < 0 ? ret : map_count;
-}
-
-#define CKPT_AT_SZ (AT_VECTOR_SIZE * sizeof(u64))
-/*
- * We always write saved_auxv out as an array of u64s, though it is
- * an array of u32s on 32-bit arch.
- */
-static int ckpt_write_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
-	int i, ret;
-	u64 *buf = kzalloc(CKPT_AT_SZ, GFP_KERNEL);
-
-	if (!buf)
-		return -ENOMEM;
-	for (i = 0; i < AT_VECTOR_SIZE; i++)
-		buf[i] = mm->saved_auxv[i];
-	ret = ckpt_write_buffer(ctx, buf, CKPT_AT_SZ);
-	kfree(buf);
-	return ret;
-}
-
-static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
-	struct ckpt_hdr_mm *h;
-	struct file *exe_file = NULL;
-	int ret;
-
-	if (check_for_outstanding_aio(mm)) {
-		ckpt_err(ctx, -EBUSY, "%(T)Outstanding aio\n");
-		return -EBUSY;
-	}
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
-	if (!h)
-		return -ENOMEM;
-
-	down_read(&mm->mmap_sem);
-
-	h->flags = mm->flags;
-	h->def_flags = mm->def_flags;
-
-	h->start_code = mm->start_code;
-	h->end_code = mm->end_code;
-	h->start_data = mm->start_data;
-	h->end_data = mm->end_data;
-	h->start_brk = mm->start_brk;
-	h->brk = mm->brk;
-	h->start_stack = mm->start_stack;
-	h->arg_start = mm->arg_start;
-	h->arg_end = mm->arg_end;
-	h->env_start = mm->env_start;
-	h->env_end = mm->env_end;
-
-	h->map_count = mm->map_count;
-
-	if (mm->exe_file) {  /* checkpoint the ->exe_file */
-		exe_file = mm->exe_file;
-		get_file(exe_file);
-	}
-
-	/*
-	 * Drop mm->mmap_sem before writing data to checkpoint image
-	 * to avoid reverse locking order (inode must come before mm).
-	 */
-	up_read(&mm->mmap_sem);
-
-	if (exe_file) {
-		h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
-		if (h->exe_objref < 0) {
-			ret = h->exe_objref;
-			goto out;
-		}
-	}
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	if (ret < 0)
-		goto out;
-
-	ret = ckpt_write_auxv(ctx, mm);
-	if (ret < 0)
-		return ret;
-
-	ret = checkpoint_vmas(ctx, mm);
-	if (ret != h->map_count && ret >= 0)
-		ret = -EBUSY; /* checkpoint mm leak */
-	if (ret < 0)
-		goto out;
-
-	ret = checkpoint_mm_context(ctx, mm);
- out:
-	if (exe_file)
-		fput(exe_file);
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr)
-{
-	return do_checkpoint_mm(ctx, (struct mm_struct *) ptr);
-}
-
-int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct mm_struct *mm;
-	int objref;
-
-	mm = get_task_mm(t);
-	objref = checkpoint_obj(ctx, mm, CKPT_OBJ_MM);
-	mmput(mm);
-
-	return objref;
-}
-
-/***********************************************************************
- * Collect
- */
-
-static int collect_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
-	struct vm_area_struct *vma;
-	struct file *file;
-	int ret;
-
-	/* if already exists (ret == 0), nothing to do */
-	ret = ckpt_obj_collect(ctx, mm, CKPT_OBJ_MM);
-	if (ret <= 0)
-		return ret;
-
-	/* if first time for this mm (ret > 0), proceed inside */
-	down_read(&mm->mmap_sem);
-	if (mm->exe_file) {
-		ret = ckpt_collect_file(ctx, mm->exe_file);
-		if (ret < 0) {
-			ckpt_err(ctx, ret, "%(T)mm: collect exe_file\n");
-			goto out;
-		}
-	}
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		file = vma->vm_file;
-		if (!file)
-			continue;
-		ret = ckpt_collect_file(ctx, file);
-		if (ret < 0) {
-			ckpt_err(ctx, ret, "%(T)mm: collect vm_file\n");
-			break;
-		}
-	}
- out:
-	up_read(&mm->mmap_sem);
-	return ret;
-
-}
-
-int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct mm_struct *mm;
-	int ret;
-
-	mm = get_task_mm(t);
-	ret = collect_mm(ctx, mm);
-	mmput(mm);
-
-	return ret;
-}
-
-/***********************************************************************
- * Restart
- *
- * Unlike checkpoint, restart is executed in the context of each restarting
- * process: vma regions are restored via a call to mmap(), and the data is
- * read into the address space of the current process.
- */
-
-/**
- * read_pages_vaddrs - read addresses of pages to page-array chain
- * @ctx - restart context
- * @nr_pages - number of address to read
- */
-static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
-{
-	struct ckpt_pgarr *pgarr;
-	unsigned long *vaddrp;
-	int nr, ret;
-
-	while (nr_pages) {
-		pgarr = pgarr_current(ctx);
-		if (!pgarr)
-			return -ENOMEM;
-		nr = pgarr_nr_free(pgarr);
-		if (nr > nr_pages)
-			nr = nr_pages;
-		vaddrp = &pgarr->vaddrs[pgarr->nr_used];
-		ret = ckpt_kread(ctx, vaddrp, nr * sizeof(unsigned long));
-		if (ret < 0)
-			return ret;
-		pgarr->nr_used += nr;
-		nr_pages -= nr;
-	}
-	return 0;
-}
-
-int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
-{
-	void *ptr;
-	int ret;
-
-	ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
-	if (ret < 0)
-		return ret;
-
-	ptr = kmap_atomic(page, KM_USER1);
-	memcpy(ptr, ctx->scratch_page, PAGE_SIZE);
-	kunmap_atomic(ptr, KM_USER1);
-
-	return 0;
-}
-
-static struct page *bring_private_page(unsigned long addr)
-{
-	struct page *page;
-	int ret;
-
-	ret = get_user_pages(current, current->mm, addr, 1, 1, 1, &page, NULL);
-	if (ret < 0)
-		page = ERR_PTR(ret);
-	return page;
-}
-
-static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
-{
-	struct page *page = NULL;
-	int ret;
-
-	ret = shmem_getpage(ino, idx, &page, SGP_WRITE, NULL);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	if (page)
-		unlock_page(page);
-	return page;
-}
-
-/**
- * read_pages_contents - read in data of pages in page-array chain
- * @ctx - restart context
- */
-static int read_pages_contents(struct ckpt_ctx *ctx, struct inode *inode)
-{
-	struct ckpt_pgarr *pgarr;
-	unsigned long *vaddrs;
-	int i, ret;
-
-	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
-		vaddrs = pgarr->vaddrs;
-		for (i = 0; i < pgarr->nr_used; i++) {
-			struct page *page;
-
-			/* TODO: do in chunks to reduce mmap_sem overhead */
-			_ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
-			down_read(&current->mm->mmap_sem);
-			if (inode)
-				page = bring_shared_page(vaddrs[i], inode);
-			else
-				page = bring_private_page(vaddrs[i]);
-			up_read(&current->mm->mmap_sem);
-
-			if (IS_ERR(page))
-				return PTR_ERR(page);
-
-			ret = restore_read_page(ctx, page);
-			page_cache_release(page);
-
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/**
- * restore_memory_contents - restore contents of a memory region
- * @ctx - restart context
- * @inode - backing inode
- *
- * Reads a header that specifies how many pages will follow, then reads
- * a list of virtual addresses into ctx->pgarr_list page-array chain,
- * followed by the actual contents of the corresponding pages. Iterates
- * these steps until reaching a header specifying "0" pages, which marks
- * the end of the contents.
- */
-int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode)
-{
-	struct ckpt_hdr_pgarr *h;
-	unsigned long nr_pages;
-	int len, ret = 0;
-
-	while (1) {
-		h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
-		if (IS_ERR(h))
-			break;
-
-		ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
-
-		nr_pages = h->nr_pages;
-		ckpt_hdr_put(ctx, h);
-
-		if (!nr_pages)
-			break;
-
-		len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
-		ret = _ckpt_read_buffer(ctx, NULL, len);
-		if (ret < 0)
-			break;
-
-		ret = read_pages_vaddrs(ctx, nr_pages);
-		if (ret < 0)
-			break;
-		ret = read_pages_contents(ctx, inode);
-		if (ret < 0)
-			break;
-		pgarr_reset_all(ctx);
-	}
-
-	return ret;
-}
-
-/**
- * calc_map_prot_bits - convert vm_flags to mmap protection
- * orig_vm_flags: source vm_flags
- */
-static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
-{
-	unsigned long vm_prot = 0;
-
-	if (orig_vm_flags & VM_READ)
-		vm_prot |= PROT_READ;
-	if (orig_vm_flags & VM_WRITE)
-		vm_prot |= PROT_WRITE;
-	if (orig_vm_flags & VM_EXEC)
-		vm_prot |= PROT_EXEC;
-	if (orig_vm_flags & PROT_SEM)   /* only (?) with IPC-SHM  */
-		vm_prot |= PROT_SEM;
-
-	return vm_prot;
-}
-
-/**
- * calc_map_flags_bits - convert vm_flags to mmap flags
- * orig_vm_flags: source vm_flags
- */
-static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
-{
-	unsigned long vm_flags = 0;
-
-	vm_flags = MAP_FIXED;
-	if (orig_vm_flags & VM_GROWSDOWN)
-		vm_flags |= MAP_GROWSDOWN;
-	if (orig_vm_flags & VM_DENYWRITE)
-		vm_flags |= MAP_DENYWRITE;
-	if (orig_vm_flags & VM_EXECUTABLE)
-		vm_flags |= MAP_EXECUTABLE;
-	if (orig_vm_flags & VM_MAYSHARE)
-		vm_flags |= MAP_SHARED;
-	else
-		vm_flags |= MAP_PRIVATE;
-
-	return vm_flags;
-}
-
-/**
- * generic_vma_restore - restore a vma
- * @mm - address space
- * @file - file to map (NULL for anonymous)
- * @h - vma header data
- */
-unsigned long generic_vma_restore(struct mm_struct *mm,
-				  struct file *file,
-				  struct ckpt_hdr_vma *h)
-{
-	unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
-	unsigned long addr;
-
-	if (h->vm_end < h->vm_start)
-		return -EINVAL;
-	if (h->vma_objref < 0)
-		return -EINVAL;
-
-	vm_start = h->vm_start;
-	vm_pgoff = h->vm_pgoff;
-	vm_size = h->vm_end - h->vm_start;
-	vm_prot = calc_map_prot_bits(h->vm_flags);
-	vm_flags = calc_map_flags_bits(h->vm_flags);
-
-	down_write(&mm->mmap_sem);
-	addr = do_mmap_pgoff(file, vm_start, vm_size,
-			     vm_prot, vm_flags, vm_pgoff);
-	up_write(&mm->mmap_sem);
-	ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
-		 vm_size, vm_prot, vm_flags, vm_pgoff, addr);
-
-	return addr;
-}
-
-/**
- * private_vma_restore - read vma data, recreate it and read contents
- * @ctx: checkpoint context
- * @mm: memory address space
- * @file: file to use for mapping
- * @h - vma header data
- */
-int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
-			struct file *file, struct ckpt_hdr_vma *h)
-{
-	unsigned long addr;
-
-	if (h->vm_flags & (VM_SHARED | VM_MAYSHARE))
-		return -EINVAL;
-
-	addr = generic_vma_restore(mm, file, h);
-	if (IS_ERR((void *) addr))
-		return PTR_ERR((void *) addr);
-
-	return restore_memory_contents(ctx, NULL);
-}
-
-/**
- * anon_private_restore - read vma data, recreate it and read contents
- * @ctx: checkpoint context
- * @mm: memory address space
- * @h - vma header data
- */
-static int anon_private_restore(struct ckpt_ctx *ctx,
-				     struct mm_struct *mm,
-				     struct ckpt_hdr_vma *h)
-{
-	/*
-	 * vm_pgoff for anonymous mapping is the "global" page
-	 * offset (namely from addr 0x0), so we force a zero
-	 */
-	h->vm_pgoff = 0;
-
-	return private_vma_restore(ctx, mm, NULL, h);
-}
-
-static int bad_vma_restore(struct ckpt_ctx *ctx,
-			   struct mm_struct *mm,
-			   struct ckpt_hdr_vma *h)
-{
-	return -EINVAL;
-}
-
-/* callbacks to restore vma per its type: */
-struct restore_vma_ops {
-	char *vma_name;
-	enum vma_type vma_type;
-	int (*restore) (struct ckpt_ctx *ctx,
-			struct mm_struct *mm,
-			struct ckpt_hdr_vma *ptr);
-};
-
-static struct restore_vma_ops restore_vma_ops[] = {
-	/* ignored vma */
-	{
-		.vma_name = "IGNORE",
-		.vma_type = CKPT_VMA_IGNORE,
-		.restore = NULL,
-	},
-	/* special mapping (vdso) */
-	{
-		.vma_name = "VDSO",
-		.vma_type = CKPT_VMA_VDSO,
-		.restore = special_mapping_restore,
-	},
-	/* anonymous private */
-	{
-		.vma_name = "ANON PRIVATE",
-		.vma_type = CKPT_VMA_ANON,
-		.restore = anon_private_restore,
-	},
-	/* file-mapped private */
-	{
-		.vma_name = "FILE PRIVATE",
-		.vma_type = CKPT_VMA_FILE,
-		.restore = filemap_restore,
-	},
-	/* anonymous shared */
-	{
-		.vma_name = "ANON SHARED",
-		.vma_type = CKPT_VMA_SHM_ANON,
-		.restore = shmem_restore,
-	},
-	/* anonymous shared (skipped) */
-	{
-		.vma_name = "ANON SHARED (skip)",
-		.vma_type = CKPT_VMA_SHM_ANON_SKIP,
-		.restore = shmem_restore,
-	},
-	/* file-mapped shared */
-	{
-		.vma_name = "FILE SHARED",
-		.vma_type = CKPT_VMA_SHM_FILE,
-		.restore = filemap_restore,
-	},
-	/* sysvipc shared */
-	{
-		.vma_name = "IPC SHARED",
-		.vma_type = CKPT_VMA_SHM_IPC,
-		/* ipc inode itself is restored by restore_ipc_ns()... */
-		.restore = bad_vma_restore,
-
-	},
-	/* sysvipc shared (skip) */
-	{
-		.vma_name = "IPC SHARED (skip)",
-		.vma_type = CKPT_VMA_SHM_IPC_SKIP,
-		.restore = ipcshm_restore,
-	},
-};
-
-/**
- * restore_vma - read vma data, recreate it and read contents
- * @ctx: checkpoint context
- * @mm: memory address space
- */
-static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
-	struct ckpt_hdr_vma *h;
-	struct restore_vma_ops *ops;
-	int ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d inoref %d\n",
-		   (unsigned long) h->vm_start, (unsigned long) h->vm_end,
-		   (unsigned long) h->vm_flags, (int) h->vma_type,
-		   (int) h->vma_objref, (int) h->ino_objref);
-
-	ret = -EINVAL;
-	if (h->vm_end < h->vm_start)
-		goto out;
-	if (h->vma_objref < 0 || h->ino_objref < 0)
-		goto out;
-	if (h->vma_type >= CKPT_VMA_MAX)
-		goto out;
-	if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED)
-		return -ENOSYS;
-
-	ops = &restore_vma_ops[h->vma_type];
-
-	/* make sure we don't change this accidentally */
-	BUG_ON(ops->vma_type != h->vma_type);
-
-	if (ops->restore) {
-		ckpt_debug("vma type %s\n", ops->vma_name);
-		ret = ops->restore(ctx, mm, h);
-	} else {
-		ckpt_debug("vma ignored\n");
-		ret = 0;
-	}
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static int ckpt_read_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
-	int i, ret;
-	u64 *buf = kmalloc(CKPT_AT_SZ, GFP_KERNEL);
-
-	if (!buf)
-		return -ENOMEM;
-	ret = _ckpt_read_buffer(ctx, buf, CKPT_AT_SZ);
-	if (ret < 0)
-		goto out;
-
-	ret = -E2BIG;
-	for (i = 0; i < AT_VECTOR_SIZE; i++)
-		if (buf[i] > (u64) ULONG_MAX)
-			goto out;
-
-	for (i = 0; i < AT_VECTOR_SIZE - 1; i++)
-		mm->saved_auxv[i] = buf[i];
-	/* sanitize the input: force AT_NULL in last entry  */
-	mm->saved_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
-
-	ret = 0;
- out:
-	kfree(buf);
-	return ret;
-}
-
-static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_mm *h;
-	struct mm_struct *mm = NULL;
-	struct file *file;
-	unsigned int nr;
-	int ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
-	if (IS_ERR(h))
-		return (struct mm_struct *) h;
-
-	ckpt_debug("map_count %d\n", h->map_count);
-
-	/* XXX need more sanity checks */
-
-	ret = -EINVAL;
-	if ((h->start_code > h->end_code) ||
-	    (h->start_data > h->end_data))
-		goto out;
-	if (h->exe_objref < 0)
-		goto out;
-	if (h->def_flags & ~VM_LOCKED)
-		goto out;
-	if (h->flags & ~(MMF_DUMP_FILTER_MASK |
-			 ((1 << MMF_DUMP_FILTER_BITS) - 1)))
-		goto out;
-
-	mm = current->mm;
-
-	/* point of no return -- destruct current mm */
-	down_write(&mm->mmap_sem);
-	ret = destroy_mm(mm);
-	if (ret < 0) {
-		up_write(&mm->mmap_sem);
-		goto out;
-	}
-
-	mm->flags = h->flags;
-	mm->def_flags = h->def_flags;
-
-	mm->start_code = h->start_code;
-	mm->end_code = h->end_code;
-	mm->start_data = h->start_data;
-	mm->end_data = h->end_data;
-	mm->start_brk = h->start_brk;
-	mm->brk = h->brk;
-	mm->start_stack = h->start_stack;
-	mm->arg_start = h->arg_start;
-	mm->arg_end = h->arg_end;
-	mm->env_start = h->env_start;
-	mm->env_end = h->env_end;
-
-	/* restore the ->exe_file */
-	if (h->exe_objref) {
-		file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
-		if (IS_ERR(file)) {
-			up_write(&mm->mmap_sem);
-			ret = PTR_ERR(file);
-			goto out;
-		}
-		set_mm_exe_file(mm, file);
-	}
-	up_write(&mm->mmap_sem);
-
-	ret = ckpt_read_auxv(ctx, mm);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "Error restoring auxv\n");
-		goto out;
-	}
-
-	for (nr = h->map_count; nr; nr--) {
-		ret = restore_vma(ctx, mm);
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = restore_mm_context(ctx, mm);
- out:
-	ckpt_hdr_put(ctx, h);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	/* restore_obj() expects an extra reference */
-	atomic_inc(&mm->mm_users);
-	return mm;
-}
-
-void *restore_mm(struct ckpt_ctx *ctx)
-{
-	return (void *) do_restore_mm(ctx);
-}
-
-int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
-{
-	struct mm_struct *mm;
-	int ret;
-
-	mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
-	if (IS_ERR(mm))
-		return PTR_ERR(mm);
-
-	if (mm == current->mm)
-		return 0;
-
-	ret = exec_mmap(mm);
-	if (ret < 0)
-		return ret;
-
-	atomic_inc(&mm->mm_users);
-	return 0;
-}
diff --git a/checkpoint/process.c b/checkpoint/process.c
index f917112..6e3e382 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -21,6 +21,7 @@
 #include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
 #include <linux/syscalls.h>
 
 
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index d34ff98..a420c02 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/capability.h>
 #include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h> /* for ckpt_pgarr_free() */
 #include <linux/deferqueue.h>
 
 /*
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 7a06272..0fc3f70 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -101,9 +101,6 @@ extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
 extern char *ckpt_fill_fname(struct path *path, struct path *root,
 			     char *buf, int *len);
 
-extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
-extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);
-
 /* pids */
 extern pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid);
 extern struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid);
@@ -174,6 +171,9 @@ extern int ckpt_obj_reserve(struct ckpt_ctx *ctx);
 extern struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx);
 extern void ckpt_ctx_put(struct ckpt_ctx *ctx);
 
+extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
+extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);
+
 extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
 extern long do_restart(struct ckpt_ctx *ctx, pid_t pid, unsigned long flags);
 
@@ -189,12 +189,10 @@ extern void post_restore_task(void);
 extern int checkpoint_write_header_arch(struct ckpt_ctx *ctx);
 extern int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t);
-extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
 
 extern int restore_read_header_arch(struct ckpt_ctx *ctx);
 extern int restore_thread(struct ckpt_ctx *ctx);
 extern int restore_cpu(struct ckpt_ctx *ctx);
-extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
 
 extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
 				    struct task_struct *t);
@@ -261,48 +259,6 @@ extern void *restore_cred(struct ckpt_ctx *ctx);
 
 extern int checkpoint_userns(struct ckpt_ctx *ctx, void *ptr);
 extern void *restore_userns(struct ckpt_ctx *ctx);
-
-/* memory */
-extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
-
-extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
-				  struct vm_area_struct *vma,
-				  enum vma_type type,
-				  int vma_objref, int ino_objref);
-extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
-				  struct vm_area_struct *vma,
-				  enum vma_type type,
-				  int vma_objref);
-extern int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
-				struct vm_area_struct *vma,
-				enum vma_type type,
-				int ino_objref);
-
-extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
-extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
-
-extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
-extern int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr);
-extern void *restore_mm(struct ckpt_ctx *ctx);
-
-extern unsigned long generic_vma_restore(struct mm_struct *mm,
-					 struct file *file,
-					 struct ckpt_hdr_vma *h);
-
-extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
-			       struct file *file, struct ckpt_hdr_vma *h);
-
-extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
-				      struct vm_area_struct *vma,
-				      struct inode *inode);
-extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
-
-
-#define CKPT_VMA_NOT_SUPPORTED						\
-	(VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP |		\
-	 VM_RESERVED | VM_NORESERVE | VM_HUGETLB | VM_NONLINEAR |	\
-	 VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
-
 /* signals */
 extern int checkpoint_obj_sighand(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int restore_obj_sighand(struct ckpt_ctx *ctx, int sighand_objref);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 210d8e3..2459d1d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1303,8 +1303,26 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
 
 #ifdef CONFIG_PROC_FS
+/* Set/Get/Dup a reference to the file /proc/<pid>/exe symlinks to. */
+extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
+extern struct file *get_mm_exe_file(struct mm_struct *mm);
+extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
+
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
+static inline void set_mm_exe_file(struct mm_struct *mm,
+				   struct file *new_exe_file)
+{}
+
+static inline struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	return NULL;
+}
+
+static inline void dup_mm_exe_file(struct mm_struct *oldmm,
+				   struct mm_struct *newmm)
+{}
+
 static inline void vm_stat_account(struct mm_struct *mm,
 			unsigned long flags, struct file *file, long pages)
 {
diff --git a/include/linux/mm_checkpoint.h b/include/linux/mm_checkpoint.h
new file mode 100644
index 0000000..0092321
--- /dev/null
+++ b/include/linux/mm_checkpoint.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_MM_CHECKPOINT_H
+#define _LINUX_MM_CHECKPOINT_H
+
+#include <linux/checkpoint.h> /* for ckpt_obj_fetch, restore_read_page */
+#include <linux/checkpoint_hdr.h> /* for struct ckpt_hdr_vma */
+#include <linux/checkpoint_types.h> /* for struct ckpt_ctx */
+
+extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
+
+extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
+
+extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+
+extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+				      struct vm_area_struct *vma,
+				      struct inode *inode);
+extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
+
+/* common vma checkpoint/restore operations */
+extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
+				  struct vm_area_struct *vma,
+				  enum vma_type type,
+				  int vma_objref, int ino_objref);
+extern unsigned long generic_vma_restore(struct mm_struct *mm,
+					 struct file *file,
+					 struct ckpt_hdr_vma *h);
+extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
+				  struct vm_area_struct *vma,
+				  enum vma_type type,
+				  int vma_objref);
+extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			       struct file *file, struct ckpt_hdr_vma *h);
+extern int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
+				struct vm_area_struct *vma,
+				enum vma_type type,
+				int ino_objref);
+
+
+#define CKPT_VMA_NOT_SUPPORTED						\
+	(VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP |		\
+	 VM_RESERVED | VM_NORESERVE |					\
+	 VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
+
+#endif /* _LINUX_MM_CHECKPOINT_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..e187078 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -173,12 +173,6 @@ extern void proc_net_remove(struct net *net, const char *name);
 extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
 	struct proc_dir_entry *parent);
 
-/* While the {get|set|dup}_mm_exe_file functions are for mm_structs, they are
- * only needed to implement /proc/<pid>|self/exe so we define them here. */
-extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
-extern struct file *get_mm_exe_file(struct mm_struct *mm);
-extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
-
 #else
 
 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
@@ -226,19 +220,6 @@ static inline void pid_ns_release_proc(struct pid_namespace *ns)
 {
 }
 
-static inline void set_mm_exe_file(struct mm_struct *mm,
-				   struct file *new_exe_file)
-{}
-
-static inline struct file *get_mm_exe_file(struct mm_struct *mm)
-{
-	return NULL;
-}
-
-static inline void dup_mm_exe_file(struct mm_struct *oldmm,
-	       			   struct mm_struct *newmm)
-{}
-
 #endif /* CONFIG_PROC_FS */
 
 #if !defined(CONFIG_PROC_KCORE)
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index 4322dea..2b05067 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -15,8 +15,7 @@
 #include <linux/msg.h>
 #include <linux/sched.h>
 #include <linux/ipc_namespace.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
 
 #include "util.h"
 
diff --git a/ipc/checkpoint_msg.c b/ipc/checkpoint_msg.c
index 61b3d78..51fb712 100644
--- a/ipc/checkpoint_msg.c
+++ b/ipc/checkpoint_msg.c
@@ -22,8 +22,7 @@
 
 #include "util.h"
 
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
 
 /************************************************************************
  * ipc checkpoint
diff --git a/ipc/checkpoint_sem.c b/ipc/checkpoint_sem.c
index 395c84d..d1a9ba1 100644
--- a/ipc/checkpoint_sem.c
+++ b/ipc/checkpoint_sem.c
@@ -22,8 +22,7 @@
 struct msg_msg;
 #include "util.h"
 
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
 
 /************************************************************************
  * ipc checkpoint
diff --git a/ipc/checkpoint_shm.c b/ipc/checkpoint_shm.c
index 01091d9..f06cfda 100644
--- a/ipc/checkpoint_shm.c
+++ b/ipc/checkpoint_shm.c
@@ -26,8 +26,7 @@
 #include <linux/msg.h>	/* needed for util.h that uses 'struct msg_msg' */
 #include "util.h"
 
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
 
 /************************************************************************
  * ipc checkpoint
diff --git a/ipc/shm.c b/ipc/shm.c
index 18ae1b8..f4c3cea 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -39,7 +39,7 @@
 #include <linux/nsproxy.h>
 #include <linux/mount.h>
 #include <linux/ipc_namespace.h>
-#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h>
 
 #include <asm/uaccess.h>
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ea28e6..1495ec0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,7 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
-#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h>
 #include "internal.h"
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index b19a754..c934021 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,8 @@
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
 #include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -2624,6 +2626,1352 @@ void mm_drop_all_locks(struct mm_struct *mm)
 }
 
 #ifdef CONFIG_CHECKPOINT
+/* default debug level for output */
+#undef CKPT_DFLAG
+#define CKPT_DFLAG  CKPT_DMEM
+/*
+ * page-array chains: each ckpt_pgarr describes a set of <struct page *,vaddr>
+ * tuples (where vaddr is the virtual address of a page in a particular mm).
+ * Specifically, we use separate arrays so that all vaddrs can be written
+ * and read at once.
+ */
+
+struct ckpt_pgarr {
+	unsigned long *vaddrs;
+	struct page **pages;
+	unsigned int nr_used;
+	struct list_head list;
+};
+
+#define CKPT_PGARR_TOTAL  (PAGE_SIZE / sizeof(void *))
+#define CKPT_PGARR_BATCH  (16 * CKPT_PGARR_TOTAL)
+
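+/*
+ * Sizing note: with 4 KB pages and 8-byte pointers, CKPT_PGARR_TOTAL is
+ * 512 entries per descriptor, so CKPT_PGARR_BATCH covers 16 * 512 = 8192
+ * pages, i.e. 32 MB of address space per scan/dump round.
+ */
+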
+static inline int pgarr_is_full(struct ckpt_pgarr *pgarr)
+{
+	return (pgarr->nr_used == CKPT_PGARR_TOTAL);
+}
+
+static inline int pgarr_nr_free(struct ckpt_pgarr *pgarr)
+{
+	return CKPT_PGARR_TOTAL - pgarr->nr_used;
+}
+
+/*
+ * utilities to alloc, free, and handle 'struct ckpt_pgarr' (page-arrays)
+ * (common to the checkpoint and restart code paths).
+ *
+ * The checkpoint context structure has two members for page-arrays:
+ *   ctx->pgarr_list: list head of populated page-array chain
+ *   ctx->pgarr_pool: list head of empty page-array pool chain
+ *
+ * During checkpoint (and restart) the chain tracks the dirty pages (page
+ * pointer and virtual address) of each MM. For a particular MM, these are
+ * always added to the head of the page-array chain (ctx->pgarr_list).
+ * Before the next chunk of pages, the chain is reset (by dereferencing
+ * all pages) but not freed; instead, empty descriptors are kept in pool.
+ *
+ * The head of the chain page-array ("current") advances as necessary. When
+ * it gets full, a new page-array descriptor is pushed in front of it. The
+ * new descriptor is taken from first empty descriptor (if one exists, for
+ * instance, after a chain reset), or allocated on-demand.
+ *
+ * When dumping the data, the chain is traversed in reverse order.
+ */
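+
+/*
+ * Sketch of a typical checkpoint round using these helpers:
+ *
+ *	pgarr_current() -> fill entries -> dump -> pgarr_reset_all()
+ *
+ * pgarr_reset_all() splices all descriptors onto ctx->pgarr_pool, so
+ * steady-state rounds reuse descriptors instead of allocating new ones.
+ */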
+
+/* return first page-array in the chain */
+static inline struct ckpt_pgarr *pgarr_first(struct ckpt_ctx *ctx)
+{
+	if (list_empty(&ctx->pgarr_list))
+		return NULL;
+	return list_first_entry(&ctx->pgarr_list, struct ckpt_pgarr, list);
+}
+
+/* return (and detach) first empty page-array in the pool, if exists */
+static inline struct ckpt_pgarr *pgarr_from_pool(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr;
+
+	if (list_empty(&ctx->pgarr_pool))
+		return NULL;
+	pgarr = list_first_entry(&ctx->pgarr_pool, struct ckpt_pgarr, list);
+	list_del(&pgarr->list);
+	return pgarr;
+}
+
+/* release pages referenced by a page-array */
+static void pgarr_release_pages(struct ckpt_pgarr *pgarr)
+{
+	ckpt_debug("total pages %d\n", pgarr->nr_used);
+	/*
+	 * both checkpoint and restart use 'nr_used', however we only
+	 * collect pages during checkpoint; in restart we simply return
+	 * because pgarr->pages remains NULL.
+	 */
+	if (pgarr->pages) {
+		struct page **pages = pgarr->pages;
+		int nr = pgarr->nr_used;
+
+		while (nr--)
+			page_cache_release(pages[nr]);
+	}
+
+	pgarr->nr_used = 0;
+}
+
+/* free a single page-array object */
+static void pgarr_free_one(struct ckpt_pgarr *pgarr)
+{
+	pgarr_release_pages(pgarr);
+	kfree(pgarr->pages);
+	kfree(pgarr->vaddrs);
+	kfree(pgarr);
+}
+
+/* free the chains of page-arrays (populated and empty pool) */
+void ckpt_pgarr_free(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr, *tmp;
+
+	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_list, list) {
+		list_del(&pgarr->list);
+		pgarr_free_one(pgarr);
+	}
+
+	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_pool, list) {
+		list_del(&pgarr->list);
+		pgarr_free_one(pgarr);
+	}
+}
+
+/* allocate a single page-array object */
+static struct ckpt_pgarr *pgarr_alloc_one(unsigned long flags)
+{
+	struct ckpt_pgarr *pgarr;
+
+	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+	if (!pgarr)
+		return NULL;
+	pgarr->vaddrs = kmalloc(CKPT_PGARR_TOTAL * sizeof(unsigned long),
+				GFP_KERNEL);
+	if (!pgarr->vaddrs)
+		goto nomem;
+
+	/* pgarr->pages is needed only for checkpoint */
+	if (flags & CKPT_CTX_CHECKPOINT) {
+		pgarr->pages = kmalloc(CKPT_PGARR_TOTAL *
+				       sizeof(struct page *), GFP_KERNEL);
+		if (!pgarr->pages)
+			goto nomem;
+	}
+
+	return pgarr;
+ nomem:
+	pgarr_free_one(pgarr);
+	return NULL;
+}
+
+/* pgarr_current - return the next available page-array in the chain
+ * @ctx: checkpoint context
+ *
+ * Returns the first page-array in the list that has space. Otherwise,
+ * try the next page-array after the last non-empty one, and move it to
+ * the front of the chain. Extends the list if none has space.
+ */
+static struct ckpt_pgarr *pgarr_current(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr;
+
+	pgarr = pgarr_first(ctx);
+	if (pgarr && !pgarr_is_full(pgarr))
+		return pgarr;
+
+	pgarr = pgarr_from_pool(ctx);
+	if (!pgarr)
+		pgarr = pgarr_alloc_one(ctx->kflags);
+	if (!pgarr)
+		return NULL;
+
+	list_add(&pgarr->list, &ctx->pgarr_list);
+	return pgarr;
+}
+
+/* reset the page-array chain (dropping page references if necessary) */
+static void pgarr_reset_all(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr;
+
+	list_for_each_entry(pgarr, &ctx->pgarr_list, list)
+		pgarr_release_pages(pgarr);
+	list_splice_init(&ctx->pgarr_list, &ctx->pgarr_pool);
+}
+
+/**************************************************************************
+ * Checkpoint
+ *
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * consider_private_page - return page pointer for dirty pages
+ * @vma - target vma
+ * @addr - page address
+ *
+ * Looks up the page that corresponds to the address in the vma, and
+ * returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_private_page(struct vm_area_struct *vma,
+					  unsigned long addr)
+{
+	return __get_dirty_page(vma, addr);
+}
+
+/**
+ * consider_shared_page - return page pointer for dirty pages
+ * @ino - inode of shmem object
+ * @idx - page index in shmem object
+ *
+ * Looks up the page that corresponds to the index in the shmem object,
+ * and returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_shared_page(struct inode *ino, unsigned long idx)
+{
+	struct page *page = NULL;
+	int ret;
+
+	/*
+	 * Inspired by do_shmem_file_read(): very simplified version.
+	 *
+	 * FIXME: consolidate with do_shmem_file_read()
+	 */
+
+	ret = shmem_getpage(ino, idx, &page, SGP_READ, NULL);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	/*
+	 * Only care about dirty pages; shmem_getpage() only returns
+	 * pages that have been allocated, so they must be dirty. The
+	 * pages returned are locked and referenced.
+	 */
+
+	if (page) {
+		unlock_page(page);
+		/*
+		 * If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping_writably_mapped(ino->i_mapping))
+			flush_dcache_page(page);
+		/*
+		 * Mark the page accessed if we read the beginning.
+		 */
+		mark_page_accessed(page);
+	}
+
+	return page;
+}
+
+/**
+ * vma_fill_pgarr - fill a page-array with addr/page tuples
+ * @ctx - checkpoint context
+ * @vma - vma to scan (NULL when scanning a shmem object)
+ * @inode - shmem inode to scan (NULL when scanning a vma)
+ * @start - start address (updated): a virtual address for a vma, or a
+ *	page index for a shmem inode
+ * @end - end address (exclusive)
+ *
+ * Returns the number of pages collected
+ */
+static int vma_fill_pgarr(struct ckpt_ctx *ctx,
+			  struct vm_area_struct *vma, struct inode *inode,
+			  unsigned long *start, unsigned long end)
+{
+	unsigned long addr = *start;
+	struct ckpt_pgarr *pgarr;
+	int nr_used;
+	int cnt = 0;
+
+	BUG_ON(inode && vma);
+
+	if (vma)
+		down_read(&vma->vm_mm->mmap_sem);
+	do {
+		pgarr = pgarr_current(ctx);
+		if (!pgarr) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		nr_used = pgarr->nr_used;
+
+		while (addr < end) {
+			struct page *page;
+
+			if (vma)
+				page = consider_private_page(vma, addr);
+			else
+				page = consider_shared_page(inode, addr);
+
+			if (IS_ERR(page)) {
+				cnt = PTR_ERR(page);
+				goto out;
+			}
+
+			if (page) {
+				_ckpt_debug(CKPT_DPAGE,
+					    "got page %#lx\n", addr);
+				pgarr->pages[pgarr->nr_used] = page;
+				pgarr->vaddrs[pgarr->nr_used] = addr;
+				pgarr->nr_used++;
+			}
+
+			if (vma)
+				addr += PAGE_SIZE;
+			else
+				addr++;
+
+			if (pgarr_is_full(pgarr))
+				break;
+		}
+
+		cnt += pgarr->nr_used - nr_used;
+
+	} while ((cnt < CKPT_PGARR_BATCH) && (addr < end));
+ out:
+	if (vma)
+		up_read(&vma->vm_mm->mmap_sem);
+	*start = addr;
+	return cnt;
+}
+
+/*
+ * Dump the contents of a page: use kmap_atomic() to avoid a TLB flush.
+ * The page is first copied into ctx->scratch_page because ckpt_kwrite()
+ * may sleep, which is not allowed inside a kmap_atomic section.
+ */
+int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page)
+{
+	void *ptr;
+
+	ptr = kmap_atomic(page, KM_USER1);
+	memcpy(ctx->scratch_page, ptr, PAGE_SIZE);
+	kunmap_atomic(ptr, KM_USER1);
+
+	return ckpt_kwrite(ctx, ctx->scratch_page, PAGE_SIZE);
+}
+
+/**
+ * vma_dump_pages - dump pages listed in the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ *
+ * First dump all virtual addresses, followed by the contents of all pages
+ */
+static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
+{
+	struct ckpt_pgarr *pgarr;
+	int i, ret = 0;
+
+	if (!total)
+		return 0;
+
+	i =  total * (sizeof(unsigned long) + PAGE_SIZE);
+	ret = ckpt_write_obj_type(ctx, NULL, i, CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		ret = ckpt_kwrite(ctx, pgarr->vaddrs,
+				  pgarr->nr_used * sizeof(unsigned long));
+		if (ret < 0)
+			return ret;
+	}
+
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		for (i = 0; i < pgarr->nr_used; i++) {
+			ret = checkpoint_dump_page(ctx, pgarr->pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * checkpoint_memory_contents - dump contents of a memory region
+ * @ctx - checkpoint context
+ * @vma - vma to scan (--or--)
+ * @inode - inode to scan
+ *
+ * Collect the pages that need to be dumped, and their corresponding
+ * virtual addresses, into the ctx->pgarr_list page-array chain. Then
+ * dump the addresses, followed by the page contents.
+ */
+int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+			       struct vm_area_struct *vma,
+			       struct inode *inode)
+{
+	struct ckpt_hdr_pgarr *h;
+	unsigned long addr, end;
+	int cnt, ret;
+
+	BUG_ON(vma && inode);
+
+	if (vma) {
+		addr = vma->vm_start;
+		end = vma->vm_end;
+	} else {
+		addr = 0;
+		end = PAGE_ALIGN(i_size_read(inode)) >> PAGE_CACHE_SHIFT;
+	}
+
+	/*
+	 * Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
+	 * pages in each round. Each iteration is divided into two steps:
+	 *
+	 * (1) scan: scan through the PTEs of the vma to collect the pages
+	 * to dump (later we'll also make them COW), while keeping a list
+	 * of pages and their corresponding addresses on ctx->pgarr_list.
+	 *
+	 * (2) dump: write out a header specifying how many pages, followed
+	 * by the addresses of all pages in ctx->pgarr_list, followed by
+	 * the actual contents of all pages. (Then, release the references
+	 * to the pages and reset the page-array chain).
+	 *
+	 * (This split makes the logic simpler by first counting the pages
+	 * that need saving. More importantly, it allows for a future
+	 * optimization that will reduce application downtime by deferring
+	 * the actual write-out of the data to after the application is
+	 * allowed to resume execution).
+	 *
+	 * After dumping the entire contents, conclude with a header that
+	 * specifies 0 pages to mark the end of the contents.
+	 */
+
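+	/*
+	 * Illustration (not normative): dumping two batches of N1 and N2
+	 * pages yields the stream
+	 *   [hdr N1][N1 vaddrs][N1 pages][hdr N2][N2 vaddrs][N2 pages][hdr 0]
+	 */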
+	while (addr < end) {
+		cnt = vma_fill_pgarr(ctx, vma, inode, &addr, end);
+		if (cnt == 0)
+			break;
+		else if (cnt < 0)
+			return cnt;
+
+		ckpt_debug("collected %d pages\n", cnt);
+
+		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+		if (!h)
+			return -ENOMEM;
+
+		h->nr_pages = cnt;
+		ret = ckpt_write_obj(ctx, &h->h);
+		ckpt_hdr_put(ctx, h);
+		if (ret < 0)
+			return ret;
+
+		ret = vma_dump_pages(ctx, cnt);
+		if (ret < 0)
+			return ret;
+
+		pgarr_reset_all(ctx);
+	}
+
+	/* mark end of contents with header saying "0" pages */
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+	if (!h)
+		return -ENOMEM;
+	h->nr_pages = 0;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/**
+ * generic_vma_checkpoint - dump metadata of vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ * @ino_objref: inode objref
+ */
+int generic_vma_checkpoint(struct ckpt_ctx *ctx,
+			   struct vm_area_struct *vma,
+			   enum vma_type type, int vma_objref,
+			   int ino_objref)
+{
+	struct ckpt_hdr_vma *h;
+	int ret;
+
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d\n",
+		 vma->vm_start, vma->vm_end, vma->vm_flags, type);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+	if (!h)
+		return -ENOMEM;
+
+	h->vma_type = type;
+	h->vma_objref = vma_objref;
+	h->ino_objref = ino_objref;
+
+	if (vma->vm_file)
+		h->ino_size = i_size_read(vma->vm_file->f_dentry->d_inode);
+	else
+		h->ino_size = 0;
+
+	h->vm_start = vma->vm_start;
+	h->vm_end = vma->vm_end;
+	h->vm_page_prot = pgprot_val(vma->vm_page_prot);
+	h->vm_flags = vma->vm_flags;
+	h->vm_pgoff = vma->vm_pgoff;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/**
+ * private_vma_checkpoint - dump contents of private (anon, file) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ */
+int private_vma_checkpoint(struct ckpt_ctx *ctx,
+			   struct vm_area_struct *vma,
+			   enum vma_type type, int vma_objref)
+{
+	int ret;
+
+	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+
+	ret = generic_vma_checkpoint(ctx, vma, type, vma_objref, 0);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_memory_contents(ctx, vma, NULL);
+ out:
+	return ret;
+}
+
+/**
+ * shmem_vma_checkpoint - dump contents of shared (shmem) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @ino_objref: inode object id
+ */
+int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
+			 struct vm_area_struct *vma,
+			 enum vma_type type, int ino_objref)
+{
+	struct file *file = vma->vm_file;
+	int ret;
+
+	ckpt_debug("type %d, ino_ref %d\n", type, ino_objref);
+	BUG_ON(!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)));
+	BUG_ON(!file);
+
+	ret = generic_vma_checkpoint(ctx, vma, type, 0, ino_objref);
+	if (ret < 0)
+		goto out;
+	if (type == CKPT_VMA_SHM_ANON_SKIP)
+		goto out;
+	ret = checkpoint_memory_contents(ctx, NULL, file->f_dentry->d_inode);
+ out:
+	return ret;
+}
+
+/**
+ * anonymous_checkpoint - dump contents of private-anonymous vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ */
+static int anonymous_checkpoint(struct ckpt_ctx *ctx,
+				struct vm_area_struct *vma)
+{
+	/* should be private anonymous ... verify that this is the case */
+	BUG_ON(vma->vm_flags & VM_MAYSHARE);
+	BUG_ON(vma->vm_file);
+
+	return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
+}
+
+static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct vm_area_struct *vma, *next;
+	int map_count = 0;
+	int ret = 0;
+
+	vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+	if (!vma)
+		return -ENOMEM;
+
+	/*
+	 * Must not hold mm->mmap_sem while writing to the image file, so
+	 * we can't simply traverse the vma list. Instead, use find_vma()
+	 * to look up @next and make a local "copy" of it. The copy starts
+	 * out zeroed, so the first find_vma(mm, 0) returns the first vma.
+	 */
+	while (1) {
+		down_read(&mm->mmap_sem);
+		next = find_vma(mm, vma->vm_end);
+		if (!next) {
+			up_read(&mm->mmap_sem);
+			break;
+		}
+		if (vma->vm_file)
+			fput(vma->vm_file);
+		*vma = *next;
+		if (vma->vm_file)
+			get_file(vma->vm_file);
+		up_read(&mm->mmap_sem);
+
+		map_count++;
+
+		ckpt_debug("vma %#lx-%#lx flags %#lx\n",
+			 vma->vm_start, vma->vm_end, vma->vm_flags);
+
+		if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+			ckpt_err(ctx, -ENOSYS, "%(T)vma: bad flags (%#lx)\n",
+					vma->vm_flags);
+			ret = -ENOSYS;
+			break;
+		}
+
+		if (!vma->vm_ops)
+			ret = anonymous_checkpoint(ctx, vma);
+		else if (vma->vm_ops->checkpoint)
+			ret = (*vma->vm_ops->checkpoint)(ctx, vma);
+		else
+			ret = -ENOSYS;
+		if (ret < 0) {
+			ckpt_err(ctx, ret, "%(T)vma: failed\n");
+			break;
+		}
+		/*
+		 * The file was collected, but not always checkpointed;
+		 * be safe and mark as visited to appease leak detection
+		 */
+		if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
+			ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+			if (ret < 0)
+				break;
+		}
+	}
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+
+	kfree(vma);
+
+	return ret < 0 ? ret : map_count;
+}
+
+#define CKPT_AT_SZ (AT_VECTOR_SIZE * sizeof(u64))
+/*
+ * We always write saved_auxv out as an array of u64s, even though it
+ * is an array of u32s on 32-bit architectures.
+ */
+static int ckpt_write_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	int i, ret;
+	u64 *buf = kzalloc(CKPT_AT_SZ, GFP_KERNEL);
+
+	if (!buf)
+		return -ENOMEM;
+	for (i = 0; i < AT_VECTOR_SIZE; i++)
+		buf[i] = mm->saved_auxv[i];
+	ret = ckpt_write_buffer(ctx, buf, CKPT_AT_SZ);
+	kfree(buf);
+	return ret;
+}
+
+/* Defined by arch */
+extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
+
+static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm *h;
+	struct file *exe_file = NULL;
+	int ret;
+
+	if (check_for_outstanding_aio(mm)) {
+		ckpt_err(ctx, -EBUSY, "%(T)Outstanding aio\n");
+		return -EBUSY;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
+	if (!h)
+		return -ENOMEM;
+
+	down_read(&mm->mmap_sem);
+
+	h->flags = mm->flags;
+	h->def_flags = mm->def_flags;
+
+	h->start_code = mm->start_code;
+	h->end_code = mm->end_code;
+	h->start_data = mm->start_data;
+	h->end_data = mm->end_data;
+	h->start_brk = mm->start_brk;
+	h->brk = mm->brk;
+	h->start_stack = mm->start_stack;
+	h->arg_start = mm->arg_start;
+	h->arg_end = mm->arg_end;
+	h->env_start = mm->env_start;
+	h->env_end = mm->env_end;
+
+	h->map_count = mm->map_count;
+
+	if (mm->exe_file) {  /* checkpoint the ->exe_file */
+		exe_file = mm->exe_file;
+		get_file(exe_file);
+	}
+
+	/*
+	 * Drop mm->mmap_sem before writing data to checkpoint image
+	 * to avoid reverse locking order (inode must come before mm).
+	 */
+	up_read(&mm->mmap_sem);
+
+	if (exe_file) {
+		h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
+		if (h->exe_objref < 0) {
+			ret = h->exe_objref;
+			goto out;
+		}
+	}
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_auxv(ctx, mm);
+	if (ret < 0)
+		goto out;
+
+	ret = checkpoint_vmas(ctx, mm);
+	if (ret != h->map_count && ret >= 0)
+		ret = -EBUSY; /* checkpoint mm leak */
+	if (ret < 0)
+		goto out;
+
+	ret = checkpoint_mm_context(ctx, mm);
+ out:
+	if (exe_file)
+		fput(exe_file);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_checkpoint_mm(ctx, (struct mm_struct *) ptr);
+}
+
+int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct mm_struct *mm;
+	int objref;
+
+	mm = get_task_mm(t);
+	objref = checkpoint_obj(ctx, mm, CKPT_OBJ_MM);
+	mmput(mm);
+
+	return objref;
+}
+
+/***********************************************************************
+ * Collect
+ */
+
+static int collect_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	struct file *file;
+	int ret;
+
+	/* if already exists (ret == 0), nothing to do */
+	ret = ckpt_obj_collect(ctx, mm, CKPT_OBJ_MM);
+	if (ret <= 0)
+		return ret;
+
+	/* if first time for this mm (ret > 0), proceed inside */
+	down_read(&mm->mmap_sem);
+	if (mm->exe_file) {
+		ret = ckpt_collect_file(ctx, mm->exe_file);
+		if (ret < 0) {
+			ckpt_err(ctx, ret, "%(T)mm: collect exe_file\n");
+			goto out;
+		}
+	}
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		file = vma->vm_file;
+		if (!file)
+			continue;
+		ret = ckpt_collect_file(ctx, file);
+		if (ret < 0) {
+			ckpt_err(ctx, ret, "%(T)mm: collect vm_file\n");
+			break;
+		}
+	}
+ out:
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+
+int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	mm = get_task_mm(t);
+	ret = collect_mm(ctx, mm);
+	mmput(mm);
+
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ *
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read into the address space of the current process.
+ */
+
+/**
+ * read_pages_vaddrs - read addresses of pages to page-array chain
+ * @ctx - restart context
+ * @nr_pages - number of addresses to read
+ */
+static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
+{
+	struct ckpt_pgarr *pgarr;
+	unsigned long *vaddrp;
+	int nr, ret;
+
+	while (nr_pages) {
+		pgarr = pgarr_current(ctx);
+		if (!pgarr)
+			return -ENOMEM;
+		nr = pgarr_nr_free(pgarr);
+		if (nr > nr_pages)
+			nr = nr_pages;
+		vaddrp = &pgarr->vaddrs[pgarr->nr_used];
+		ret = ckpt_kread(ctx, vaddrp, nr * sizeof(unsigned long));
+		if (ret < 0)
+			return ret;
+		pgarr->nr_used += nr;
+		nr_pages -= nr;
+	}
+	return 0;
+}
+
+int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
+{
+	void *ptr;
+	int ret;
+
+	ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
+	if (ret < 0)
+		return ret;
+
+	ptr = kmap_atomic(page, KM_USER1);
+	memcpy(ptr, ctx->scratch_page, PAGE_SIZE);
+	kunmap_atomic(ptr, KM_USER1);
+
+	return 0;
+}
+
+static struct page *bring_private_page(unsigned long addr)
+{
+	struct page *page;
+	int ret;
+
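+	/* write=1, force=1: break COW so the data lands in this mm's copy */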
+	ret = get_user_pages(current, current->mm, addr, 1, 1, 1, &page, NULL);
+	if (ret < 0)
+		page = ERR_PTR(ret);
+	return page;
+}
+
+static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
+{
+	struct page *page = NULL;
+	int ret;
+
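+	/* SGP_WRITE allocates the page if it is not already present */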
+	ret = shmem_getpage(ino, idx, &page, SGP_WRITE, NULL);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (page)
+		unlock_page(page);
+	return page;
+}
+
+/**
+ * read_pages_contents - read in data of pages in page-array chain
+ * @ctx - restart context
+ * @inode - backing inode (NULL for private memory)
+ */
+static int read_pages_contents(struct ckpt_ctx *ctx, struct inode *inode)
+{
+	struct ckpt_pgarr *pgarr;
+	unsigned long *vaddrs;
+	int i, ret;
+
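+	/* for shared memory the "vaddrs" are actually page-cache indices */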
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		vaddrs = pgarr->vaddrs;
+		for (i = 0; i < pgarr->nr_used; i++) {
+			struct page *page;
+
+			/* TODO: do in chunks to reduce mmap_sem overhead */
+			_ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
+			down_read(&current->mm->mmap_sem);
+			if (inode)
+				page = bring_shared_page(vaddrs[i], inode);
+			else
+				page = bring_private_page(vaddrs[i]);
+			up_read(&current->mm->mmap_sem);
+
+			if (IS_ERR(page))
+				return PTR_ERR(page);
+
+			ret = restore_read_page(ctx, page);
+			page_cache_release(page);
+
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/**
+ * restore_memory_contents - restore contents of a memory region
+ * @ctx - restart context
+ * @inode - backing inode
+ *
+ * Reads a header that specifies how many pages will follow, then reads
+ * a list of virtual addresses into ctx->pgarr_list page-array chain,
+ * followed by the actual contents of the corresponding pages. Iterates
+ * these steps until reaching a header specifying "0" pages, which marks
+ * the end of the contents.
+ */
+int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode)
+{
+	struct ckpt_hdr_pgarr *h;
+	unsigned long nr_pages;
+	int len, ret = 0;
+
+	while (1) {
+		h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+		if (IS_ERR(h)) {
+			ret = PTR_ERR(h);
+			break;
+		}
+
+		ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
+
+		nr_pages = h->nr_pages;
+		ckpt_hdr_put(ctx, h);
+
+		if (!nr_pages)
+			break;
+
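+		/* vaddrs and page contents follow as one buffer of @len bytes */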
+		len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
+		ret = _ckpt_read_buffer(ctx, NULL, len);
+		if (ret < 0)
+			break;
+
+		ret = read_pages_vaddrs(ctx, nr_pages);
+		if (ret < 0)
+			break;
+		ret = read_pages_contents(ctx, inode);
+		if (ret < 0)
+			break;
+		pgarr_reset_all(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ * calc_map_prot_bits - convert vm_flags to mmap protection
+ * @orig_vm_flags: source vm_flags
+ */
+static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+	unsigned long vm_prot = 0;
+
+	if (orig_vm_flags & VM_READ)
+		vm_prot |= PROT_READ;
+	if (orig_vm_flags & VM_WRITE)
+		vm_prot |= PROT_WRITE;
+	if (orig_vm_flags & VM_EXEC)
+		vm_prot |= PROT_EXEC;
+	if (orig_vm_flags & PROT_SEM)   /* only (?) with IPC-SHM  */
+		vm_prot |= PROT_SEM;
+
+	return vm_prot;
+}
+
+/**
+ * calc_map_flags_bits - convert vm_flags to mmap flags
+ * @orig_vm_flags: source vm_flags
+ */
+static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+	unsigned long vm_flags = 0;
+
+	vm_flags = MAP_FIXED;
+	if (orig_vm_flags & VM_GROWSDOWN)
+		vm_flags |= MAP_GROWSDOWN;
+	if (orig_vm_flags & VM_DENYWRITE)
+		vm_flags |= MAP_DENYWRITE;
+	if (orig_vm_flags & VM_EXECUTABLE)
+		vm_flags |= MAP_EXECUTABLE;
+	if (orig_vm_flags & VM_MAYSHARE)
+		vm_flags |= MAP_SHARED;
+	else
+		vm_flags |= MAP_PRIVATE;
+
+	return vm_flags;
+}
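+
+/*
+ * Example (illustrative): a vma saved with VM_READ|VM_WRITE|VM_MAYSHARE
+ * comes back with prot PROT_READ|PROT_WRITE and flags MAP_SHARED|MAP_FIXED.
+ */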
+
+/**
+ * generic_vma_restore - restore a vma
+ * @mm - address space
+ * @file - file to map (NULL for anonymous)
+ * @h - vma header data
+ */
+unsigned long generic_vma_restore(struct mm_struct *mm,
+				  struct file *file,
+				  struct ckpt_hdr_vma *h)
+{
+	unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
+	unsigned long addr;
+
+	if (h->vm_end < h->vm_start)
+		return -EINVAL;
+	if (h->vma_objref < 0)
+		return -EINVAL;
+
+	vm_start = h->vm_start;
+	vm_pgoff = h->vm_pgoff;
+	vm_size = h->vm_end - h->vm_start;
+	vm_prot = calc_map_prot_bits(h->vm_flags);
+	vm_flags = calc_map_flags_bits(h->vm_flags);
+
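+	/* MAP_FIXED (from calc_map_flags_bits) restores the original address */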
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, vm_start, vm_size,
+			     vm_prot, vm_flags, vm_pgoff);
+	up_write(&mm->mmap_sem);
+	ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
+		 vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+
+	return addr;
+}
+
+/**
+ * private_vma_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @file: file to use for mapping
+ * @h: vma header data
+ */
+int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			struct file *file, struct ckpt_hdr_vma *h)
+{
+	unsigned long addr;
+
+	if (h->vm_flags & (VM_SHARED | VM_MAYSHARE))
+		return -EINVAL;
+
+	addr = generic_vma_restore(mm, file, h);
+	if (IS_ERR((void *) addr))
+		return PTR_ERR((void *) addr);
+
+	return restore_memory_contents(ctx, NULL);
+}
+
+/**
+ * anon_private_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @h: vma header data
+ */
+static int anon_private_restore(struct ckpt_ctx *ctx,
+				     struct mm_struct *mm,
+				     struct ckpt_hdr_vma *h)
+{
+	/*
+	 * vm_pgoff of an anonymous mapping is the "global" page offset
+	 * (i.e. measured from address 0x0), so force it to zero here.
+	 */
+	h->vm_pgoff = 0;
+
+	return private_vma_restore(ctx, mm, NULL, h);
+}
+
+static int bad_vma_restore(struct ckpt_ctx *ctx,
+			   struct mm_struct *mm,
+			   struct ckpt_hdr_vma *h)
+{
+	return -EINVAL;
+}
+
+/* callbacks to restore vma per its type; order must match enum vma_type */
+struct restore_vma_ops {
+	char *vma_name;
+	enum vma_type vma_type;
+	int (*restore) (struct ckpt_ctx *ctx,
+			struct mm_struct *mm,
+			struct ckpt_hdr_vma *ptr);
+};
+
+static struct restore_vma_ops restore_vma_ops[] = {
+	/* ignored vma */
+	{
+		.vma_name = "IGNORE",
+		.vma_type = CKPT_VMA_IGNORE,
+		.restore = NULL,
+	},
+	/* special mapping (vdso) */
+	{
+		.vma_name = "VDSO",
+		.vma_type = CKPT_VMA_VDSO,
+		.restore = special_mapping_restore,
+	},
+	/* anonymous private */
+	{
+		.vma_name = "ANON PRIVATE",
+		.vma_type = CKPT_VMA_ANON,
+		.restore = anon_private_restore,
+	},
+	/* file-mapped private */
+	{
+		.vma_name = "FILE PRIVATE",
+		.vma_type = CKPT_VMA_FILE,
+		.restore = filemap_restore,
+	},
+	/* anonymous shared */
+	{
+		.vma_name = "ANON SHARED",
+		.vma_type = CKPT_VMA_SHM_ANON,
+		.restore = shmem_restore,
+	},
+	/* anonymous shared (skipped) */
+	{
+		.vma_name = "ANON SHARED (skip)",
+		.vma_type = CKPT_VMA_SHM_ANON_SKIP,
+		.restore = shmem_restore,
+	},
+	/* file-mapped shared */
+	{
+		.vma_name = "FILE SHARED",
+		.vma_type = CKPT_VMA_SHM_FILE,
+		.restore = filemap_restore,
+	},
+	/* sysvipc shared */
+	{
+		.vma_name = "IPC SHARED",
+		.vma_type = CKPT_VMA_SHM_IPC,
+		/* the ipc inode itself is restored by restore_ipc_ns()... */
+		.restore = bad_vma_restore,
+	},
+	/* sysvipc shared (skip) */
+	{
+		.vma_name = "IPC SHARED (skip)",
+		.vma_type = CKPT_VMA_SHM_IPC_SKIP,
+		.restore = ipcshm_restore,
+	},
+};
+
+/**
+ * restore_vma - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ */
+static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_vma *h;
+	struct restore_vma_ops *ops;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d inoref %d\n",
+		   (unsigned long) h->vm_start, (unsigned long) h->vm_end,
+		   (unsigned long) h->vm_flags, (int) h->vma_type,
+		   (int) h->vma_objref, (int) h->ino_objref);
+
+	ret = -EINVAL;
+	if (h->vm_end < h->vm_start)
+		goto out;
+	if (h->vma_objref < 0 || h->ino_objref < 0)
+		goto out;
+	if (h->vma_type >= CKPT_VMA_MAX)
+		goto out;
+	if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED)
+		return -ENOSYS;
+
+	ops = &restore_vma_ops[h->vma_type];
+
+	/* make sure we don't change this accidentally */
+	BUG_ON(ops->vma_type != h->vma_type);
+
+	if (ops->restore) {
+		ckpt_debug("vma type %s\n", ops->vma_name);
+		ret = ops->restore(ctx, mm, h);
+	} else {
+		ckpt_debug("vma ignored\n");
+		ret = 0;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int ckpt_read_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	int i, ret;
+	u64 *buf = kmalloc(CKPT_AT_SZ, GFP_KERNEL);
+
+	if (!buf)
+		return -ENOMEM;
+	ret = _ckpt_read_buffer(ctx, buf, CKPT_AT_SZ);
+	if (ret < 0)
+		goto out;
+
+	ret = -E2BIG;
+	for (i = 0; i < AT_VECTOR_SIZE; i++)
+		if (buf[i] > (u64) ULONG_MAX)
+			goto out;
+
+	for (i = 0; i < AT_VECTOR_SIZE - 1; i++)
+		mm->saved_auxv[i] = buf[i];
+	/* sanitize the input: force AT_NULL in the last entry */
+	mm->saved_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+
+	ret = 0;
+ out:
+	kfree(buf);
+	return ret;
+}
+
+extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
+
+static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_mm *h;
+	struct mm_struct *mm = NULL;
+	struct file *file;
+	unsigned int nr;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
+	if (IS_ERR(h))
+		return (struct mm_struct *) h;
+
+	ckpt_debug("map_count %d\n", h->map_count);
+
+	/* XXX need more sanity checks */
+
+	ret = -EINVAL;
+	if ((h->start_code > h->end_code) ||
+	    (h->start_data > h->end_data))
+		goto out;
+	if (h->exe_objref < 0)
+		goto out;
+	if (h->def_flags & ~VM_LOCKED)
+		goto out;
+	if (h->flags & ~(MMF_DUMP_FILTER_MASK |
+			 ((1 << MMF_DUMP_FILTER_BITS) - 1)))
+		goto out;
+
+	mm = current->mm;
+
+	/* point of no return -- destruct current mm */
+	down_write(&mm->mmap_sem);
+	ret = destroy_mm(mm);
+	if (ret < 0) {
+		up_write(&mm->mmap_sem);
+		goto out;
+	}
+
+	mm->flags = h->flags;
+	mm->def_flags = h->def_flags;
+
+	mm->start_code = h->start_code;
+	mm->end_code = h->end_code;
+	mm->start_data = h->start_data;
+	mm->end_data = h->end_data;
+	mm->start_brk = h->start_brk;
+	mm->brk = h->brk;
+	mm->start_stack = h->start_stack;
+	mm->arg_start = h->arg_start;
+	mm->arg_end = h->arg_end;
+	mm->env_start = h->env_start;
+	mm->env_end = h->env_end;
+
+	/* restore the ->exe_file */
+	if (h->exe_objref) {
+		file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
+		if (IS_ERR(file)) {
+			up_write(&mm->mmap_sem);
+			ret = PTR_ERR(file);
+			goto out;
+		}
+		set_mm_exe_file(mm, file);
+	}
+	up_write(&mm->mmap_sem);
+
+	ret = ckpt_read_auxv(ctx, mm);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "Error restoring auxv\n");
+		goto out;
+	}
+
+	for (nr = h->map_count; nr; nr--) {
+		ret = restore_vma(ctx, mm);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = restore_mm_context(ctx, mm);
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	/* restore_obj() expects an extra reference */
+	atomic_inc(&mm->mm_users);
+	return mm;
+}
+
+static void *restore_mm(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_mm(ctx);
+}
+
+int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+
+	if (mm == current->mm)
+		return 0;
+
+	ret = exec_mmap(mm);
+	if (ret < 0)
+		return ret;
+
+	atomic_inc(&mm->mm_users);
+	return 0;
+}
 static int obj_mm_grab(void *ptr)
 {
 	atomic_inc(&((struct mm_struct *) ptr)->mm_users);
diff --git a/mm/shmem.c b/mm/shmem.c
index e103155..3696342 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
-#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h>
 
 static struct vfsmount *shm_mnt;
 
-- 
1.6.3.3
