[Devel] [PATCH 09/12] Move mmap checkpoint/restart into mm/mmap.c
Matt Helsley
matthltc at us.ibm.com
Fri Feb 26 00:45:10 PST 2010
Moving the memory pieces is more complicated because portions of it are
shared with ipc. Split the mm header bits into a separate header file,
include/linux/mm_checkpoint.h, so it's clear that these
are the only pieces we need in ipc/
Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
---
checkpoint/Makefile | 3 +-
checkpoint/memory.c | 1364 -----------------------------------------
checkpoint/process.c | 1 +
checkpoint/sys.c | 1 +
include/linux/checkpoint.h | 50 +--
include/linux/mm.h | 18 +
include/linux/mm_checkpoint.h | 45 ++
include/linux/proc_fs.h | 19 -
ipc/checkpoint.c | 3 +-
ipc/checkpoint_msg.c | 3 +-
ipc/checkpoint_sem.c | 3 +-
ipc/checkpoint_shm.c | 3 +-
ipc/shm.c | 2 +-
mm/filemap.c | 2 +-
mm/mmap.c | 1348 ++++++++++++++++++++++++++++++++++++++++
mm/shmem.c | 2 +-
16 files changed, 1424 insertions(+), 1443 deletions(-)
delete mode 100644 checkpoint/memory.c
create mode 100644 include/linux/mm_checkpoint.h
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 5bc8468..9571af3 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -8,5 +8,4 @@ obj-$(CONFIG_CHECKPOINT) += \
checkpoint.o \
restart.o \
process.o \
- namespace.o \
- memory.o \
+ namespace.o
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
deleted file mode 100644
index e0b3b54..0000000
--- a/checkpoint/memory.c
+++ /dev/null
@@ -1,1364 +0,0 @@
-/*
- * Checkpoint/restart memory contents
- *
- * Copyright (C) 2008-2009 Oren Laadan
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file COPYING in the main directory of the Linux
- * distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG CKPT_DMEM
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/aio.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/mm_types.h>
-#include <linux/shm.h>
-#include <linux/proc_fs.h>
-#include <linux/swap.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-
-/*
- * page-array chains: each ckpt_pgarr describes a set of <struct page *,vaddr>
- * tuples (where vaddr is the virtual address of a page in a particular mm).
- * Specifically, we use separate arrays so that all vaddrs can be written
- * and read at once.
- */
-
-struct ckpt_pgarr {
- unsigned long *vaddrs;
- struct page **pages;
- unsigned int nr_used;
- struct list_head list;
-};
-
-#define CKPT_PGARR_TOTAL (PAGE_SIZE / sizeof(void *))
-#define CKPT_PGARR_BATCH (16 * CKPT_PGARR_TOTAL)
-
-static inline int pgarr_is_full(struct ckpt_pgarr *pgarr)
-{
- return (pgarr->nr_used == CKPT_PGARR_TOTAL);
-}
-
-static inline int pgarr_nr_free(struct ckpt_pgarr *pgarr)
-{
- return CKPT_PGARR_TOTAL - pgarr->nr_used;
-}
-
-/*
- * utilities to alloc, free, and handle 'struct ckpt_pgarr' (page-arrays)
- * (common to ckpt_mem.c and rstr_mem.c).
- *
- * The checkpoint context structure has two members for page-arrays:
- * ctx->pgarr_list: list head of populated page-array chain
- * ctx->pgarr_pool: list head of empty page-array pool chain
- *
- * During checkpoint (and restart) the chain tracks the dirty pages (page
- * pointer and virtual address) of each MM. For a particular MM, these are
- * always added to the head of the page-array chain (ctx->pgarr_list).
- * Before the next chunk of pages, the chain is reset (by dereferencing
- * all pages) but not freed; instead, empty descsriptors are kept in pool.
- *
- * The head of the chain page-array ("current") advances as necessary. When
- * it gets full, a new page-array descriptor is pushed in front of it. The
- * new descriptor is taken from first empty descriptor (if one exists, for
- * instance, after a chain reset), or allocated on-demand.
- *
- * When dumping the data, the chain is traversed in reverse order.
- */
-
-/* return first page-array in the chain */
-static inline struct ckpt_pgarr *pgarr_first(struct ckpt_ctx *ctx)
-{
- if (list_empty(&ctx->pgarr_list))
- return NULL;
- return list_first_entry(&ctx->pgarr_list, struct ckpt_pgarr, list);
-}
-
-/* return (and detach) first empty page-array in the pool, if exists */
-static inline struct ckpt_pgarr *pgarr_from_pool(struct ckpt_ctx *ctx)
-{
- struct ckpt_pgarr *pgarr;
-
- if (list_empty(&ctx->pgarr_pool))
- return NULL;
- pgarr = list_first_entry(&ctx->pgarr_pool, struct ckpt_pgarr, list);
- list_del(&pgarr->list);
- return pgarr;
-}
-
-/* release pages referenced by a page-array */
-static void pgarr_release_pages(struct ckpt_pgarr *pgarr)
-{
- ckpt_debug("total pages %d\n", pgarr->nr_used);
- /*
- * both checkpoint and restart use 'nr_used', however we only
- * collect pages during checkpoint; in restart we simply return
- * because pgarr->pages remains NULL.
- */
- if (pgarr->pages) {
- struct page **pages = pgarr->pages;
- int nr = pgarr->nr_used;
-
- while (nr--)
- page_cache_release(pages[nr]);
- }
-
- pgarr->nr_used = 0;
-}
-
-/* free a single page-array object */
-static void pgarr_free_one(struct ckpt_pgarr *pgarr)
-{
- pgarr_release_pages(pgarr);
- kfree(pgarr->pages);
- kfree(pgarr->vaddrs);
- kfree(pgarr);
-}
-
-/* free the chains of page-arrays (populated and empty pool) */
-void ckpt_pgarr_free(struct ckpt_ctx *ctx)
-{
- struct ckpt_pgarr *pgarr, *tmp;
-
- list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_list, list) {
- list_del(&pgarr->list);
- pgarr_free_one(pgarr);
- }
-
- list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_pool, list) {
- list_del(&pgarr->list);
- pgarr_free_one(pgarr);
- }
-}
-
-/* allocate a single page-array object */
-static struct ckpt_pgarr *pgarr_alloc_one(unsigned long flags)
-{
- struct ckpt_pgarr *pgarr;
-
- pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
- if (!pgarr)
- return NULL;
- pgarr->vaddrs = kmalloc(CKPT_PGARR_TOTAL * sizeof(unsigned long),
- GFP_KERNEL);
- if (!pgarr->vaddrs)
- goto nomem;
-
- /* pgarr->pages is needed only for checkpoint */
- if (flags & CKPT_CTX_CHECKPOINT) {
- pgarr->pages = kmalloc(CKPT_PGARR_TOTAL *
- sizeof(struct page *), GFP_KERNEL);
- if (!pgarr->pages)
- goto nomem;
- }
-
- return pgarr;
- nomem:
- pgarr_free_one(pgarr);
- return NULL;
-}
-
-/* pgarr_current - return the next available page-array in the chain
- * @ctx: checkpoint context
- *
- * Returns the first page-array in the list that has space. Otherwise,
- * try the next page-array after the last non-empty one, and move it to
- * the front of the chain. Extends the list if none has space.
- */
-static struct ckpt_pgarr *pgarr_current(struct ckpt_ctx *ctx)
-{
- struct ckpt_pgarr *pgarr;
-
- pgarr = pgarr_first(ctx);
- if (pgarr && !pgarr_is_full(pgarr))
- return pgarr;
-
- pgarr = pgarr_from_pool(ctx);
- if (!pgarr)
- pgarr = pgarr_alloc_one(ctx->kflags);
- if (!pgarr)
- return NULL;
-
- list_add(&pgarr->list, &ctx->pgarr_list);
- return pgarr;
-}
-
-/* reset the page-array chain (dropping page references if necessary) */
-static void pgarr_reset_all(struct ckpt_ctx *ctx)
-{
- struct ckpt_pgarr *pgarr;
-
- list_for_each_entry(pgarr, &ctx->pgarr_list, list)
- pgarr_release_pages(pgarr);
- list_splice_init(&ctx->pgarr_list, &ctx->pgarr_pool);
-}
-
-/**************************************************************************
- * Checkpoint
- *
- * Checkpoint is outside the context of the checkpointee, so one cannot
- * simply read pages from user-space. Instead, we scan the address space
- * of the target to cherry-pick pages of interest. Selected pages are
- * enlisted in a page-array chain (attached to the checkpoint context).
- * To save their contents, each page is mapped to kernel memory and then
- * dumped to the file descriptor.
- */
-
-/**
- * consider_private_page - return page pointer for dirty pages
- * @vma - target vma
- * @addr - page address
- *
- * Looks up the page that correspond to the address in the vma, and
- * returns the page if it was modified (and grabs a reference to it),
- * or otherwise returns NULL (or error).
- */
-static struct page *consider_private_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- return __get_dirty_page(vma, addr);
-}
-
-/**
- * consider_shared_page - return page pointer for dirty pages
- * @ino - inode of shmem object
- * @idx - page index in shmem object
- *
- * Looks up the page that corresponds to the index in the shmem object,
- * and returns the page if it was modified (and grabs a reference to it),
- * or otherwise returns NULL (or error).
- */
-static struct page *consider_shared_page(struct inode *ino, unsigned long idx)
-{
- struct page *page = NULL;
- int ret;
-
- /*
- * Inspired by do_shmem_file_read(): very simplified version.
- *
- * FIXME: consolidate with do_shmem_file_read()
- */
-
- ret = shmem_getpage(ino, idx, &page, SGP_READ, NULL);
- if (ret < 0)
- return ERR_PTR(ret);
-
- /*
- * Only care about dirty pages; shmem_getpage() only returns
- * pages that have been allocated, so they must be dirty. The
- * pages returned are locked and referenced.
- */
-
- if (page) {
- unlock_page(page);
- /*
- * If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(ino->i_mapping))
- flush_dcache_page(page);
- /*
- * Mark the page accessed if we read the beginning.
- */
- mark_page_accessed(page);
- }
-
- return page;
-}
-
-/**
- * vma_fill_pgarr - fill a page-array with addr/page tuples
- * @ctx - checkpoint context
- * @vma - vma to scan
- * @start - start address (updated)
- *
- * Returns the number of pages collected
- */
-static int vma_fill_pgarr(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma, struct inode *inode,
- unsigned long *start, unsigned long end)
-{
- unsigned long addr = *start;
- struct ckpt_pgarr *pgarr;
- int nr_used;
- int cnt = 0;
-
- BUG_ON(inode && vma);
-
- if (vma)
- down_read(&vma->vm_mm->mmap_sem);
- do {
- pgarr = pgarr_current(ctx);
- if (!pgarr) {
- cnt = -ENOMEM;
- goto out;
- }
-
- nr_used = pgarr->nr_used;
-
- while (addr < end) {
- struct page *page;
-
- if (vma)
- page = consider_private_page(vma, addr);
- else
- page = consider_shared_page(inode, addr);
-
- if (IS_ERR(page)) {
- cnt = PTR_ERR(page);
- goto out;
- }
-
- if (page) {
- _ckpt_debug(CKPT_DPAGE,
- "got page %#lx\n", addr);
- pgarr->pages[pgarr->nr_used] = page;
- pgarr->vaddrs[pgarr->nr_used] = addr;
- pgarr->nr_used++;
- }
-
- if (vma)
- addr += PAGE_SIZE;
- else
- addr++;
-
- if (pgarr_is_full(pgarr))
- break;
- }
-
- cnt += pgarr->nr_used - nr_used;
-
- } while ((cnt < CKPT_PGARR_BATCH) && (addr < end));
- out:
- if (vma)
- up_read(&vma->vm_mm->mmap_sem);
- *start = addr;
- return cnt;
-}
-
-/* dump contents of a pages: use kmap_atomic() to avoid TLB flush */
-int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page)
-{
- void *ptr;
-
- ptr = kmap_atomic(page, KM_USER1);
- memcpy(ctx->scratch_page, ptr, PAGE_SIZE);
- kunmap_atomic(ptr, KM_USER1);
-
- return ckpt_kwrite(ctx, ctx->scratch_page, PAGE_SIZE);
-}
-
-/**
- * vma_dump_pages - dump pages listed in the ctx page-array chain
- * @ctx - checkpoint context
- * @total - total number of pages
- *
- * First dump all virtual addresses, followed by the contents of all pages
- */
-static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
-{
- struct ckpt_pgarr *pgarr;
- int i, ret = 0;
-
- if (!total)
- return 0;
-
- i = total * (sizeof(unsigned long) + PAGE_SIZE);
- ret = ckpt_write_obj_type(ctx, NULL, i, CKPT_HDR_BUFFER);
- if (ret < 0)
- return ret;
-
- list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
- ret = ckpt_kwrite(ctx, pgarr->vaddrs,
- pgarr->nr_used * sizeof(unsigned long));
- if (ret < 0)
- return ret;
- }
-
- list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
- for (i = 0; i < pgarr->nr_used; i++) {
- ret = checkpoint_dump_page(ctx, pgarr->pages[i]);
- if (ret < 0)
- return ret;
- }
- }
-
- return ret;
-}
-
-/**
- * checkpoint_memory_contents - dump contents of a memory region
- * @ctx - checkpoint context
- * @vma - vma to scan (--or--)
- * @inode - inode to scan
- *
- * Collect lists of pages that needs to be dumped, and corresponding
- * virtual addresses into ctx->pgarr_list page-array chain. Then dump
- * the addresses, followed by the page contents.
- */
-int checkpoint_memory_contents(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- struct inode *inode)
-{
- struct ckpt_hdr_pgarr *h;
- unsigned long addr, end;
- int cnt, ret;
-
- BUG_ON(vma && inode);
-
- if (vma) {
- addr = vma->vm_start;
- end = vma->vm_end;
- } else {
- addr = 0;
- end = PAGE_ALIGN(i_size_read(inode)) >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
- * in each round. Each iterations is divided into two steps:
- *
- * (1) scan: scan through the PTEs of the vma to collect the pages
- * to dump (later we'll also make them COW), while keeping a list
- * of pages and their corresponding addresses on ctx->pgarr_list.
- *
- * (2) dump: write out a header specifying how many pages, followed
- * by the addresses of all pages in ctx->pgarr_list, followed by
- * the actual contents of all pages. (Then, release the references
- * to the pages and reset the page-array chain).
- *
- * (This split makes the logic simpler by first counting the pages
- * that need saving. More importantly, it allows for a future
- * optimization that will reduce application downtime by deferring
- * the actual write-out of the data to after the application is
- * allowed to resume execution).
- *
- * After dumping the entire contents, conclude with a header that
- * specifies 0 pages to mark the end of the contents.
- */
-
- while (addr < end) {
- cnt = vma_fill_pgarr(ctx, vma, inode, &addr, end);
- if (cnt == 0)
- break;
- else if (cnt < 0)
- return cnt;
-
- ckpt_debug("collected %d pages\n", cnt);
-
- h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
- if (!h)
- return -ENOMEM;
-
- h->nr_pages = cnt;
- ret = ckpt_write_obj(ctx, &h->h);
- ckpt_hdr_put(ctx, h);
- if (ret < 0)
- return ret;
-
- ret = vma_dump_pages(ctx, cnt);
- if (ret < 0)
- return ret;
-
- pgarr_reset_all(ctx);
- }
-
- /* mark end of contents with header saying "0" pages */
- h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
- if (!h)
- return -ENOMEM;
- h->nr_pages = 0;
- ret = ckpt_write_obj(ctx, &h->h);
- ckpt_hdr_put(ctx, h);
-
- return ret;
-}
-
-/**
- * generic_vma_checkpoint - dump metadata of vma
- * @ctx: checkpoint context
- * @vma: vma object
- * @type: vma type
- * @vma_objref: vma objref
- */
-int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
- enum vma_type type, int vma_objref, int ino_objref)
-{
- struct ckpt_hdr_vma *h;
- int ret;
-
- ckpt_debug("vma %#lx-%#lx flags %#lx type %d\n",
- vma->vm_start, vma->vm_end, vma->vm_flags, type);
-
- h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_VMA);
- if (!h)
- return -ENOMEM;
-
- h->vma_type = type;
- h->vma_objref = vma_objref;
- h->ino_objref = ino_objref;
-
- if (vma->vm_file)
- h->ino_size = i_size_read(vma->vm_file->f_dentry->d_inode);
- else
- h->ino_size = 0;
-
- h->vm_start = vma->vm_start;
- h->vm_end = vma->vm_end;
- h->vm_page_prot = pgprot_val(vma->vm_page_prot);
- h->vm_flags = vma->vm_flags;
- h->vm_pgoff = vma->vm_pgoff;
-
- ret = ckpt_write_obj(ctx, &h->h);
- ckpt_hdr_put(ctx, h);
-
- return ret;
-}
-
-/**
- * private_vma_checkpoint - dump contents of private (anon, file) vma
- * @ctx: checkpoint context
- * @vma: vma object
- * @type: vma type
- * @vma_objref: vma objref
- */
-int private_vma_checkpoint(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- enum vma_type type, int vma_objref)
-{
- int ret;
-
- BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
-
- ret = generic_vma_checkpoint(ctx, vma, type, vma_objref, 0);
- if (ret < 0)
- goto out;
- ret = checkpoint_memory_contents(ctx, vma, NULL);
- out:
- return ret;
-}
-
-/**
- * shmem_vma_checkpoint - dump contents of private (anon, file) vma
- * @ctx: checkpoint context
- * @vma: vma object
- * @type: vma type
- * @objref: vma object id
- */
-int shmem_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
- enum vma_type type, int ino_objref)
-{
- struct file *file = vma->vm_file;
- int ret;
-
- ckpt_debug("type %d, ino_ref %d\n", type, ino_objref);
- BUG_ON(!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)));
- BUG_ON(!file);
-
- ret = generic_vma_checkpoint(ctx, vma, type, 0, ino_objref);
- if (ret < 0)
- goto out;
- if (type == CKPT_VMA_SHM_ANON_SKIP)
- goto out;
- ret = checkpoint_memory_contents(ctx, NULL, file->f_dentry->d_inode);
- out:
- return ret;
-}
-
-/**
- * anonymous_checkpoint - dump contents of private-anonymous vma
- * @ctx: checkpoint context
- * @vma: vma object
- */
-static int anonymous_checkpoint(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma)
-{
- /* should be private anonymous ... verify that this is the case */
- BUG_ON(vma->vm_flags & VM_MAYSHARE);
- BUG_ON(vma->vm_file);
-
- return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
-}
-
-static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
- struct vm_area_struct *vma, *next;
- int map_count = 0;
- int ret = 0;
-
- vma = kzalloc(sizeof(*vma), GFP_KERNEL);
- if (!vma)
- return -ENOMEM;
-
- /*
- * Must not hold mm->mmap_sem when writing to image file, so
- * can't simply traverse the vma list. Instead, use find_vma()
- * to get the @next and make a local "copy" of it.
- */
- while (1) {
- down_read(&mm->mmap_sem);
- next = find_vma(mm, vma->vm_end);
- if (!next) {
- up_read(&mm->mmap_sem);
- break;
- }
- if (vma->vm_file)
- fput(vma->vm_file);
- *vma = *next;
- if (vma->vm_file)
- get_file(vma->vm_file);
- up_read(&mm->mmap_sem);
-
- map_count++;
-
- ckpt_debug("vma %#lx-%#lx flags %#lx\n",
- vma->vm_start, vma->vm_end, vma->vm_flags);
-
- if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
- ckpt_err(ctx, -ENOSYS, "%(T)vma: bad flags (%#lx)\n",
- vma->vm_flags);
- ret = -ENOSYS;
- break;
- }
-
- if (!vma->vm_ops)
- ret = anonymous_checkpoint(ctx, vma);
- else if (vma->vm_ops->checkpoint)
- ret = (*vma->vm_ops->checkpoint)(ctx, vma);
- else
- ret = -ENOSYS;
- if (ret < 0) {
- ckpt_err(ctx, ret, "%(T)vma: failed\n");
- break;
- }
- /*
- * The file was collected, but not always checkpointed;
- * be safe and mark as visited to appease leak detection
- */
- if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
- ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
- if (ret < 0)
- break;
- }
- }
-
- if (vma->vm_file)
- fput(vma->vm_file);
-
- kfree(vma);
-
- return ret < 0 ? ret : map_count;
-}
-
-#define CKPT_AT_SZ (AT_VECTOR_SIZE * sizeof(u64))
-/*
- * We always write saved_auxv out as an array of u64s, though it is
- * an array of u32s on 32-bit arch.
- */
-static int ckpt_write_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
- int i, ret;
- u64 *buf = kzalloc(CKPT_AT_SZ, GFP_KERNEL);
-
- if (!buf)
- return -ENOMEM;
- for (i = 0; i < AT_VECTOR_SIZE; i++)
- buf[i] = mm->saved_auxv[i];
- ret = ckpt_write_buffer(ctx, buf, CKPT_AT_SZ);
- kfree(buf);
- return ret;
-}
-
-static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
- struct ckpt_hdr_mm *h;
- struct file *exe_file = NULL;
- int ret;
-
- if (check_for_outstanding_aio(mm)) {
- ckpt_err(ctx, -EBUSY, "(%T)Outstanding aio\n");
- return -EBUSY;
- }
-
- h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
- if (!h)
- return -ENOMEM;
-
- down_read(&mm->mmap_sem);
-
- h->flags = mm->flags;
- h->def_flags = mm->def_flags;
-
- h->start_code = mm->start_code;
- h->end_code = mm->end_code;
- h->start_data = mm->start_data;
- h->end_data = mm->end_data;
- h->start_brk = mm->start_brk;
- h->brk = mm->brk;
- h->start_stack = mm->start_stack;
- h->arg_start = mm->arg_start;
- h->arg_end = mm->arg_end;
- h->env_start = mm->env_start;
- h->env_end = mm->env_end;
-
- h->map_count = mm->map_count;
-
- if (mm->exe_file) { /* checkpoint the ->exe_file */
- exe_file = mm->exe_file;
- get_file(exe_file);
- }
-
- /*
- * Drop mm->mmap_sem before writing data to checkpoint image
- * to avoid reverse locking order (inode must come before mm).
- */
- up_read(&mm->mmap_sem);
-
- if (exe_file) {
- h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
- if (h->exe_objref < 0) {
- ret = h->exe_objref;
- goto out;
- }
- }
-
- ret = ckpt_write_obj(ctx, &h->h);
- if (ret < 0)
- goto out;
-
- ret = ckpt_write_auxv(ctx, mm);
- if (ret < 0)
- return ret;
-
- ret = checkpoint_vmas(ctx, mm);
- if (ret != h->map_count && ret >= 0)
- ret = -EBUSY; /* checkpoint mm leak */
- if (ret < 0)
- goto out;
-
- ret = checkpoint_mm_context(ctx, mm);
- out:
- if (exe_file)
- fput(exe_file);
- ckpt_hdr_put(ctx, h);
- return ret;
-}
-
-int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr)
-{
- return do_checkpoint_mm(ctx, (struct mm_struct *) ptr);
-}
-
-int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
-{
- struct mm_struct *mm;
- int objref;
-
- mm = get_task_mm(t);
- objref = checkpoint_obj(ctx, mm, CKPT_OBJ_MM);
- mmput(mm);
-
- return objref;
-}
-
-/***********************************************************************
- * Collect
- */
-
-static int collect_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- struct file *file;
- int ret;
-
- /* if already exists (ret == 0), nothing to do */
- ret = ckpt_obj_collect(ctx, mm, CKPT_OBJ_MM);
- if (ret <= 0)
- return ret;
-
- /* if first time for this mm (ret > 0), proceed inside */
- down_read(&mm->mmap_sem);
- if (mm->exe_file) {
- ret = ckpt_collect_file(ctx, mm->exe_file);
- if (ret < 0) {
- ckpt_err(ctx, ret, "%(T)mm: collect exe_file\n");
- goto out;
- }
- }
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- file = vma->vm_file;
- if (!file)
- continue;
- ret = ckpt_collect_file(ctx, file);
- if (ret < 0) {
- ckpt_err(ctx, ret, "%(T)mm: collect vm_file\n");
- break;
- }
- }
- out:
- up_read(&mm->mmap_sem);
- return ret;
-
-}
-
-int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t)
-{
- struct mm_struct *mm;
- int ret;
-
- mm = get_task_mm(t);
- ret = collect_mm(ctx, mm);
- mmput(mm);
-
- return ret;
-}
-
-/***********************************************************************
- * Restart
- *
- * Unlike checkpoint, restart is executed in the context of each restarting
- * process: vma regions are restored via a call to mmap(), and the data is
- * read into the address space of the current process.
- */
-
-/**
- * read_pages_vaddrs - read addresses of pages to page-array chain
- * @ctx - restart context
- * @nr_pages - number of address to read
- */
-static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
-{
- struct ckpt_pgarr *pgarr;
- unsigned long *vaddrp;
- int nr, ret;
-
- while (nr_pages) {
- pgarr = pgarr_current(ctx);
- if (!pgarr)
- return -ENOMEM;
- nr = pgarr_nr_free(pgarr);
- if (nr > nr_pages)
- nr = nr_pages;
- vaddrp = &pgarr->vaddrs[pgarr->nr_used];
- ret = ckpt_kread(ctx, vaddrp, nr * sizeof(unsigned long));
- if (ret < 0)
- return ret;
- pgarr->nr_used += nr;
- nr_pages -= nr;
- }
- return 0;
-}
-
-int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
-{
- void *ptr;
- int ret;
-
- ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
- if (ret < 0)
- return ret;
-
- ptr = kmap_atomic(page, KM_USER1);
- memcpy(ptr, ctx->scratch_page, PAGE_SIZE);
- kunmap_atomic(ptr, KM_USER1);
-
- return 0;
-}
-
-static struct page *bring_private_page(unsigned long addr)
-{
- struct page *page;
- int ret;
-
- ret = get_user_pages(current, current->mm, addr, 1, 1, 1, &page, NULL);
- if (ret < 0)
- page = ERR_PTR(ret);
- return page;
-}
-
-static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
-{
- struct page *page = NULL;
- int ret;
-
- ret = shmem_getpage(ino, idx, &page, SGP_WRITE, NULL);
- if (ret < 0)
- return ERR_PTR(ret);
- if (page)
- unlock_page(page);
- return page;
-}
-
-/**
- * read_pages_contents - read in data of pages in page-array chain
- * @ctx - restart context
- */
-static int read_pages_contents(struct ckpt_ctx *ctx, struct inode *inode)
-{
- struct ckpt_pgarr *pgarr;
- unsigned long *vaddrs;
- int i, ret;
-
- list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
- vaddrs = pgarr->vaddrs;
- for (i = 0; i < pgarr->nr_used; i++) {
- struct page *page;
-
- /* TODO: do in chunks to reduce mmap_sem overhead */
- _ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
- down_read(&current->mm->mmap_sem);
- if (inode)
- page = bring_shared_page(vaddrs[i], inode);
- else
- page = bring_private_page(vaddrs[i]);
- up_read(&current->mm->mmap_sem);
-
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- ret = restore_read_page(ctx, page);
- page_cache_release(page);
-
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/**
- * restore_memory_contents - restore contents of a memory region
- * @ctx - restart context
- * @inode - backing inode
- *
- * Reads a header that specifies how many pages will follow, then reads
- * a list of virtual addresses into ctx->pgarr_list page-array chain,
- * followed by the actual contents of the corresponding pages. Iterates
- * these steps until reaching a header specifying "0" pages, which marks
- * the end of the contents.
- */
-int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode)
-{
- struct ckpt_hdr_pgarr *h;
- unsigned long nr_pages;
- int len, ret = 0;
-
- while (1) {
- h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
- if (IS_ERR(h))
- break;
-
- ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
-
- nr_pages = h->nr_pages;
- ckpt_hdr_put(ctx, h);
-
- if (!nr_pages)
- break;
-
- len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
- ret = _ckpt_read_buffer(ctx, NULL, len);
- if (ret < 0)
- break;
-
- ret = read_pages_vaddrs(ctx, nr_pages);
- if (ret < 0)
- break;
- ret = read_pages_contents(ctx, inode);
- if (ret < 0)
- break;
- pgarr_reset_all(ctx);
- }
-
- return ret;
-}
-
-/**
- * calc_map_prot_bits - convert vm_flags to mmap protection
- * orig_vm_flags: source vm_flags
- */
-static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
-{
- unsigned long vm_prot = 0;
-
- if (orig_vm_flags & VM_READ)
- vm_prot |= PROT_READ;
- if (orig_vm_flags & VM_WRITE)
- vm_prot |= PROT_WRITE;
- if (orig_vm_flags & VM_EXEC)
- vm_prot |= PROT_EXEC;
- if (orig_vm_flags & PROT_SEM) /* only (?) with IPC-SHM */
- vm_prot |= PROT_SEM;
-
- return vm_prot;
-}
-
-/**
- * calc_map_flags_bits - convert vm_flags to mmap flags
- * orig_vm_flags: source vm_flags
- */
-static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
-{
- unsigned long vm_flags = 0;
-
- vm_flags = MAP_FIXED;
- if (orig_vm_flags & VM_GROWSDOWN)
- vm_flags |= MAP_GROWSDOWN;
- if (orig_vm_flags & VM_DENYWRITE)
- vm_flags |= MAP_DENYWRITE;
- if (orig_vm_flags & VM_EXECUTABLE)
- vm_flags |= MAP_EXECUTABLE;
- if (orig_vm_flags & VM_MAYSHARE)
- vm_flags |= MAP_SHARED;
- else
- vm_flags |= MAP_PRIVATE;
-
- return vm_flags;
-}
-
-/**
- * generic_vma_restore - restore a vma
- * @mm - address space
- * @file - file to map (NULL for anonymous)
- * @h - vma header data
- */
-unsigned long generic_vma_restore(struct mm_struct *mm,
- struct file *file,
- struct ckpt_hdr_vma *h)
-{
- unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
- unsigned long addr;
-
- if (h->vm_end < h->vm_start)
- return -EINVAL;
- if (h->vma_objref < 0)
- return -EINVAL;
-
- vm_start = h->vm_start;
- vm_pgoff = h->vm_pgoff;
- vm_size = h->vm_end - h->vm_start;
- vm_prot = calc_map_prot_bits(h->vm_flags);
- vm_flags = calc_map_flags_bits(h->vm_flags);
-
- down_write(&mm->mmap_sem);
- addr = do_mmap_pgoff(file, vm_start, vm_size,
- vm_prot, vm_flags, vm_pgoff);
- up_write(&mm->mmap_sem);
- ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
- vm_size, vm_prot, vm_flags, vm_pgoff, addr);
-
- return addr;
-}
-
-/**
- * private_vma_restore - read vma data, recreate it and read contents
- * @ctx: checkpoint context
- * @mm: memory address space
- * @file: file to use for mapping
- * @h - vma header data
- */
-int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
- struct file *file, struct ckpt_hdr_vma *h)
-{
- unsigned long addr;
-
- if (h->vm_flags & (VM_SHARED | VM_MAYSHARE))
- return -EINVAL;
-
- addr = generic_vma_restore(mm, file, h);
- if (IS_ERR((void *) addr))
- return PTR_ERR((void *) addr);
-
- return restore_memory_contents(ctx, NULL);
-}
-
-/**
- * anon_private_restore - read vma data, recreate it and read contents
- * @ctx: checkpoint context
- * @mm: memory address space
- * @h - vma header data
- */
-static int anon_private_restore(struct ckpt_ctx *ctx,
- struct mm_struct *mm,
- struct ckpt_hdr_vma *h)
-{
- /*
- * vm_pgoff for anonymous mapping is the "global" page
- * offset (namely from addr 0x0), so we force a zero
- */
- h->vm_pgoff = 0;
-
- return private_vma_restore(ctx, mm, NULL, h);
-}
-
-static int bad_vma_restore(struct ckpt_ctx *ctx,
- struct mm_struct *mm,
- struct ckpt_hdr_vma *h)
-{
- return -EINVAL;
-}
-
-/* callbacks to restore vma per its type: */
-struct restore_vma_ops {
- char *vma_name;
- enum vma_type vma_type;
- int (*restore) (struct ckpt_ctx *ctx,
- struct mm_struct *mm,
- struct ckpt_hdr_vma *ptr);
-};
-
-static struct restore_vma_ops restore_vma_ops[] = {
- /* ignored vma */
- {
- .vma_name = "IGNORE",
- .vma_type = CKPT_VMA_IGNORE,
- .restore = NULL,
- },
- /* special mapping (vdso) */
- {
- .vma_name = "VDSO",
- .vma_type = CKPT_VMA_VDSO,
- .restore = special_mapping_restore,
- },
- /* anonymous private */
- {
- .vma_name = "ANON PRIVATE",
- .vma_type = CKPT_VMA_ANON,
- .restore = anon_private_restore,
- },
- /* file-mapped private */
- {
- .vma_name = "FILE PRIVATE",
- .vma_type = CKPT_VMA_FILE,
- .restore = filemap_restore,
- },
- /* anonymous shared */
- {
- .vma_name = "ANON SHARED",
- .vma_type = CKPT_VMA_SHM_ANON,
- .restore = shmem_restore,
- },
- /* anonymous shared (skipped) */
- {
- .vma_name = "ANON SHARED (skip)",
- .vma_type = CKPT_VMA_SHM_ANON_SKIP,
- .restore = shmem_restore,
- },
- /* file-mapped shared */
- {
- .vma_name = "FILE SHARED",
- .vma_type = CKPT_VMA_SHM_FILE,
- .restore = filemap_restore,
- },
- /* sysvipc shared */
- {
- .vma_name = "IPC SHARED",
- .vma_type = CKPT_VMA_SHM_IPC,
- /* ipc inode itself is restore by restore_ipc_ns()... */
- .restore = bad_vma_restore,
-
- },
- /* sysvipc shared (skip) */
- {
- .vma_name = "IPC SHARED (skip)",
- .vma_type = CKPT_VMA_SHM_IPC_SKIP,
- .restore = ipcshm_restore,
- },
-};
-
-/**
- * restore_vma - read vma data, recreate it and read contents
- * @ctx: checkpoint context
- * @mm: memory address space
- */
-static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
- struct ckpt_hdr_vma *h;
- struct restore_vma_ops *ops;
- int ret;
-
- h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
- if (IS_ERR(h))
- return PTR_ERR(h);
-
- ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d inoref %d\n",
- (unsigned long) h->vm_start, (unsigned long) h->vm_end,
- (unsigned long) h->vm_flags, (int) h->vma_type,
- (int) h->vma_objref, (int) h->ino_objref);
-
- ret = -EINVAL;
- if (h->vm_end < h->vm_start)
- goto out;
- if (h->vma_objref < 0 || h->ino_objref < 0)
- goto out;
- if (h->vma_type >= CKPT_VMA_MAX)
- goto out;
- if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED)
- return -ENOSYS;
-
- ops = &restore_vma_ops[h->vma_type];
-
- /* make sure we don't change this accidentally */
- BUG_ON(ops->vma_type != h->vma_type);
-
- if (ops->restore) {
- ckpt_debug("vma type %s\n", ops->vma_name);
- ret = ops->restore(ctx, mm, h);
- } else {
- ckpt_debug("vma ignored\n");
- ret = 0;
- }
- out:
- ckpt_hdr_put(ctx, h);
- return ret;
-}
-
-static int ckpt_read_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
-{
- int i, ret;
- u64 *buf = kmalloc(CKPT_AT_SZ, GFP_KERNEL);
-
- if (!buf)
- return -ENOMEM;
- ret = _ckpt_read_buffer(ctx, buf, CKPT_AT_SZ);
- if (ret < 0)
- goto out;
-
- ret = -E2BIG;
- for (i = 0; i < AT_VECTOR_SIZE; i++)
- if (buf[i] > (u64) ULONG_MAX)
- goto out;
-
- for (i = 0; i < AT_VECTOR_SIZE - 1; i++)
- mm->saved_auxv[i] = buf[i];
- /* sanitize the input: force AT_NULL in last entry */
- mm->saved_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
-
- ret = 0;
- out:
- kfree(buf);
- return ret;
-}
-
-static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
-{
- struct ckpt_hdr_mm *h;
- struct mm_struct *mm = NULL;
- struct file *file;
- unsigned int nr;
- int ret;
-
- h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
- if (IS_ERR(h))
- return (struct mm_struct *) h;
-
- ckpt_debug("map_count %d\n", h->map_count);
-
- /* XXX need more sanity checks */
-
- ret = -EINVAL;
- if ((h->start_code > h->end_code) ||
- (h->start_data > h->end_data))
- goto out;
- if (h->exe_objref < 0)
- goto out;
- if (h->def_flags & ~VM_LOCKED)
- goto out;
- if (h->flags & ~(MMF_DUMP_FILTER_MASK |
- ((1 << MMF_DUMP_FILTER_BITS) - 1)))
- goto out;
-
- mm = current->mm;
-
- /* point of no return -- destruct current mm */
- down_write(&mm->mmap_sem);
- ret = destroy_mm(mm);
- if (ret < 0) {
- up_write(&mm->mmap_sem);
- goto out;
- }
-
- mm->flags = h->flags;
- mm->def_flags = h->def_flags;
-
- mm->start_code = h->start_code;
- mm->end_code = h->end_code;
- mm->start_data = h->start_data;
- mm->end_data = h->end_data;
- mm->start_brk = h->start_brk;
- mm->brk = h->brk;
- mm->start_stack = h->start_stack;
- mm->arg_start = h->arg_start;
- mm->arg_end = h->arg_end;
- mm->env_start = h->env_start;
- mm->env_end = h->env_end;
-
- /* restore the ->exe_file */
- if (h->exe_objref) {
- file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
- if (IS_ERR(file)) {
- up_write(&mm->mmap_sem);
- ret = PTR_ERR(file);
- goto out;
- }
- set_mm_exe_file(mm, file);
- }
- up_write(&mm->mmap_sem);
-
- ret = ckpt_read_auxv(ctx, mm);
- if (ret < 0) {
- ckpt_err(ctx, ret, "Error restoring auxv\n");
- goto out;
- }
-
- for (nr = h->map_count; nr; nr--) {
- ret = restore_vma(ctx, mm);
- if (ret < 0)
- goto out;
- }
-
- ret = restore_mm_context(ctx, mm);
- out:
- ckpt_hdr_put(ctx, h);
- if (ret < 0)
- return ERR_PTR(ret);
- /* restore_obj() expect an extra reference */
- atomic_inc(&mm->mm_users);
- return mm;
-}
-
-void *restore_mm(struct ckpt_ctx *ctx)
-{
- return (void *) do_restore_mm(ctx);
-}
-
-int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
-{
- struct mm_struct *mm;
- int ret;
-
- mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
-
- if (mm == current->mm)
- return 0;
-
- ret = exec_mmap(mm);
- if (ret < 0)
- return ret;
-
- atomic_inc(&mm->mm_users);
- return 0;
-}
diff --git a/checkpoint/process.c b/checkpoint/process.c
index f917112..6e3e382 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -21,6 +21,7 @@
#include <linux/user_namespace.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
#include <linux/syscalls.h>
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index d34ff98..a420c02 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h> /* for ckpt_pgarr_free() */
#include <linux/deferqueue.h>
/*
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 7a06272..0fc3f70 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -101,9 +101,6 @@ extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
extern char *ckpt_fill_fname(struct path *path, struct path *root,
char *buf, int *len);
-extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
-extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);
-
/* pids */
extern pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid);
extern struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid);
@@ -174,6 +171,9 @@ extern int ckpt_obj_reserve(struct ckpt_ctx *ctx);
extern struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx);
extern void ckpt_ctx_put(struct ckpt_ctx *ctx);
+extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
+extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);
+
extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
extern long do_restart(struct ckpt_ctx *ctx, pid_t pid, unsigned long flags);
@@ -189,12 +189,10 @@ extern void post_restore_task(void);
extern int checkpoint_write_header_arch(struct ckpt_ctx *ctx);
extern int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t);
extern int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t);
-extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
extern int restore_read_header_arch(struct ckpt_ctx *ctx);
extern int restore_thread(struct ckpt_ctx *ctx);
extern int restore_cpu(struct ckpt_ctx *ctx);
-extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
struct task_struct *t);
@@ -261,48 +259,6 @@ extern void *restore_cred(struct ckpt_ctx *ctx);
extern int checkpoint_userns(struct ckpt_ctx *ctx, void *ptr);
extern void *restore_userns(struct ckpt_ctx *ctx);
-
-/* memory */
-extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
-
-extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- enum vma_type type,
- int vma_objref, int ino_objref);
-extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- enum vma_type type,
- int vma_objref);
-extern int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- enum vma_type type,
- int ino_objref);
-
-extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
-extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
-
-extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
-extern int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr);
-extern void *restore_mm(struct ckpt_ctx *ctx);
-
-extern unsigned long generic_vma_restore(struct mm_struct *mm,
- struct file *file,
- struct ckpt_hdr_vma *h);
-
-extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
- struct file *file, struct ckpt_hdr_vma *h);
-
-extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- struct inode *inode);
-extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
-
-
-#define CKPT_VMA_NOT_SUPPORTED \
- (VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP | \
- VM_RESERVED | VM_NORESERVE | VM_HUGETLB | VM_NONLINEAR | \
- VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
-
/* signals */
extern int checkpoint_obj_sighand(struct ckpt_ctx *ctx, struct task_struct *t);
extern int restore_obj_sighand(struct ckpt_ctx *ctx, int sighand_objref);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 210d8e3..2459d1d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1303,8 +1303,26 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
#ifdef CONFIG_PROC_FS
+/* Set/Get/Dup a reference to the file /proc/<pid>/exe symlinks to. */
+extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
+extern struct file *get_mm_exe_file(struct mm_struct *mm);
+extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
+
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
#else
+static inline void set_mm_exe_file(struct mm_struct *mm,
+ struct file *new_exe_file)
+{}
+
+static inline struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline void dup_mm_exe_file(struct mm_struct *oldmm,
+ struct mm_struct *newmm)
+{}
+
static inline void vm_stat_account(struct mm_struct *mm,
unsigned long flags, struct file *file, long pages)
{
diff --git a/include/linux/mm_checkpoint.h b/include/linux/mm_checkpoint.h
new file mode 100644
index 0000000..0092321
--- /dev/null
+++ b/include/linux/mm_checkpoint.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_MM_CHECKPOINT_H
+#define _LINUX_MM_CHECKPOINT_H
+
+#include <linux/checkpoint.h> /* for ckpt_obj_fetch, restore_read_page */
+#include <linux/checkpoint_hdr.h> /* for struct ckpt_hdr_vma */
+#include <linux/checkpoint_types.h> /* for struct ckpt_ctx */
+
+extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
+
+extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
+
+extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+
+extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ struct inode *inode);
+extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
+
+/* common vma checkpoint/restore operations */
+extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type,
+ int vma_objref, int ino_objref);
+extern unsigned long generic_vma_restore(struct mm_struct *mm,
+ struct file *file,
+ struct ckpt_hdr_vma *h);
+extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type,
+ int vma_objref);
+extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct file *file, struct ckpt_hdr_vma *h);
+extern int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type,
+ int ino_objref);
+
+
+#define CKPT_VMA_NOT_SUPPORTED \
+ (VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP | \
+ VM_RESERVED | VM_NORESERVE | VM_HUGETLB | VM_NONLINEAR | \
+ VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
+
+#endif /* _LINUX_MM_CHECKPOINT_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..e187078 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -173,12 +173,6 @@ extern void proc_net_remove(struct net *net, const char *name);
extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
struct proc_dir_entry *parent);
-/* While the {get|set|dup}_mm_exe_file functions are for mm_structs, they are
- * only needed to implement /proc/<pid>|self/exe so we define them here. */
-extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
-extern struct file *get_mm_exe_file(struct mm_struct *mm);
-extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
-
#else
#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
@@ -226,19 +220,6 @@ static inline void pid_ns_release_proc(struct pid_namespace *ns)
{
}
-static inline void set_mm_exe_file(struct mm_struct *mm,
- struct file *new_exe_file)
-{}
-
-static inline struct file *get_mm_exe_file(struct mm_struct *mm)
-{
- return NULL;
-}
-
-static inline void dup_mm_exe_file(struct mm_struct *oldmm,
- struct mm_struct *newmm)
-{}
-
#endif /* CONFIG_PROC_FS */
#if !defined(CONFIG_PROC_KCORE)
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index 4322dea..2b05067 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -15,8 +15,7 @@
#include <linux/msg.h>
#include <linux/sched.h>
#include <linux/ipc_namespace.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
#include "util.h"
diff --git a/ipc/checkpoint_msg.c b/ipc/checkpoint_msg.c
index 61b3d78..51fb712 100644
--- a/ipc/checkpoint_msg.c
+++ b/ipc/checkpoint_msg.c
@@ -22,8 +22,7 @@
#include "util.h"
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
/************************************************************************
* ipc checkpoint
diff --git a/ipc/checkpoint_sem.c b/ipc/checkpoint_sem.c
index 395c84d..d1a9ba1 100644
--- a/ipc/checkpoint_sem.c
+++ b/ipc/checkpoint_sem.c
@@ -22,8 +22,7 @@
struct msg_msg;
#include "util.h"
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
/************************************************************************
* ipc checkpoint
diff --git a/ipc/checkpoint_shm.c b/ipc/checkpoint_shm.c
index 01091d9..f06cfda 100644
--- a/ipc/checkpoint_shm.c
+++ b/ipc/checkpoint_shm.c
@@ -26,8 +26,7 @@
#include <linux/msg.h> /* needed for util.h that uses 'struct msg_msg' */
#include "util.h"
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
/************************************************************************
* ipc checkpoint
diff --git a/ipc/shm.c b/ipc/shm.c
index 18ae1b8..f4c3cea 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -39,7 +39,7 @@
#include <linux/nsproxy.h>
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
-#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h>
#include <asm/uaccess.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ea28e6..1495ec0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,7 +34,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
-#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h>
#include "internal.h"
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index b19a754..c934021 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,8 @@
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -2624,6 +2626,1352 @@ void mm_drop_all_locks(struct mm_struct *mm)
}
#ifdef CONFIG_CHECKPOINT
+/* default debug level for output */
+#undef CKPT_DFLAG
+#define CKPT_DFLAG CKPT_DMEM
+/*
+ * page-array chains: each ckpt_pgarr describes a set of <struct page *,vaddr>
+ * tuples (where vaddr is the virtual address of a page in a particular mm).
+ * Specifically, we use separate arrays so that all vaddrs can be written
+ * and read at once.
+ */
+
+struct ckpt_pgarr {
+ unsigned long *vaddrs;
+ struct page **pages;
+ unsigned int nr_used;
+ struct list_head list;
+};
+
+#define CKPT_PGARR_TOTAL (PAGE_SIZE / sizeof(void *))
+#define CKPT_PGARR_BATCH (16 * CKPT_PGARR_TOTAL)
+
+static inline int pgarr_is_full(struct ckpt_pgarr *pgarr)
+{
+ return (pgarr->nr_used == CKPT_PGARR_TOTAL);
+}
+
+static inline int pgarr_nr_free(struct ckpt_pgarr *pgarr)
+{
+ return CKPT_PGARR_TOTAL - pgarr->nr_used;
+}
+
+/*
+ * utilities to alloc, free, and handle 'struct ckpt_pgarr' (page-arrays)
+ * (common to the checkpoint and restart paths below).
+ *
+ * The checkpoint context structure has two members for page-arrays:
+ * ctx->pgarr_list: list head of populated page-array chain
+ * ctx->pgarr_pool: list head of empty page-array pool chain
+ *
+ * During checkpoint (and restart) the chain tracks the dirty pages (page
+ * pointer and virtual address) of each MM. For a particular MM, these are
+ * always added to the head of the page-array chain (ctx->pgarr_list).
+ * Before the next chunk of pages, the chain is reset (by dereferencing
+ * all pages) but not freed; instead, empty descriptors are kept in pool.
+ *
+ * The head of the chain page-array ("current") advances as necessary. When
+ * it gets full, a new page-array descriptor is pushed in front of it. The
+ * new descriptor is taken from first empty descriptor (if one exists, for
+ * instance, after a chain reset), or allocated on-demand.
+ *
+ * When dumping the data, the chain is traversed in reverse order.
+ */
+
+/* return first page-array in the chain */
+static inline struct ckpt_pgarr *pgarr_first(struct ckpt_ctx *ctx)
+{
+ if (list_empty(&ctx->pgarr_list))
+ return NULL;
+ return list_first_entry(&ctx->pgarr_list, struct ckpt_pgarr, list);
+}
+
+/* return (and detach) first empty page-array in the pool, if exists */
+static inline struct ckpt_pgarr *pgarr_from_pool(struct ckpt_ctx *ctx)
+{
+ struct ckpt_pgarr *pgarr;
+
+ if (list_empty(&ctx->pgarr_pool))
+ return NULL;
+ pgarr = list_first_entry(&ctx->pgarr_pool, struct ckpt_pgarr, list);
+ list_del(&pgarr->list);
+ return pgarr;
+}
+
+/* release pages referenced by a page-array */
+static void pgarr_release_pages(struct ckpt_pgarr *pgarr)
+{
+ ckpt_debug("total pages %d\n", pgarr->nr_used);
+ /*
+ * both checkpoint and restart use 'nr_used', however we only
+ * collect pages during checkpoint; in restart we simply return
+ * because pgarr->pages remains NULL.
+ */
+ if (pgarr->pages) {
+ struct page **pages = pgarr->pages;
+ int nr = pgarr->nr_used;
+
+ while (nr--)
+ page_cache_release(pages[nr]);
+ }
+
+ pgarr->nr_used = 0;
+}
+
+/* free a single page-array object */
+static void pgarr_free_one(struct ckpt_pgarr *pgarr)
+{
+ pgarr_release_pages(pgarr);
+ kfree(pgarr->pages);
+ kfree(pgarr->vaddrs);
+ kfree(pgarr);
+}
+
+/* free the chains of page-arrays (populated and empty pool) */
+void ckpt_pgarr_free(struct ckpt_ctx *ctx)
+{
+ struct ckpt_pgarr *pgarr, *tmp;
+
+ list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_list, list) {
+ list_del(&pgarr->list);
+ pgarr_free_one(pgarr);
+ }
+
+ list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_pool, list) {
+ list_del(&pgarr->list);
+ pgarr_free_one(pgarr);
+ }
+}
+
+/* allocate a single page-array object */
+static struct ckpt_pgarr *pgarr_alloc_one(unsigned long flags)
+{
+ struct ckpt_pgarr *pgarr;
+
+ pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+ if (!pgarr)
+ return NULL;
+ pgarr->vaddrs = kmalloc(CKPT_PGARR_TOTAL * sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!pgarr->vaddrs)
+ goto nomem;
+
+ /* pgarr->pages is needed only for checkpoint */
+ if (flags & CKPT_CTX_CHECKPOINT) {
+ pgarr->pages = kmalloc(CKPT_PGARR_TOTAL *
+ sizeof(struct page *), GFP_KERNEL);
+ if (!pgarr->pages)
+ goto nomem;
+ }
+
+ return pgarr;
+ nomem:
+ pgarr_free_one(pgarr);
+ return NULL;
+}
+
+/* pgarr_current - return the next available page-array in the chain
+ * @ctx: checkpoint context
+ *
+ * Returns the first page-array in the list that has space. Otherwise,
+ * try the next page-array after the last non-empty one, and move it to
+ * the front of the chain. Extends the list if none has space.
+ */
+static struct ckpt_pgarr *pgarr_current(struct ckpt_ctx *ctx)
+{
+ struct ckpt_pgarr *pgarr;
+
+ pgarr = pgarr_first(ctx);
+ if (pgarr && !pgarr_is_full(pgarr))
+ return pgarr;
+
+ pgarr = pgarr_from_pool(ctx);
+ if (!pgarr)
+ pgarr = pgarr_alloc_one(ctx->kflags);
+ if (!pgarr)
+ return NULL;
+
+ list_add(&pgarr->list, &ctx->pgarr_list);
+ return pgarr;
+}
+
+/* reset the page-array chain (dropping page references if necessary) */
+static void pgarr_reset_all(struct ckpt_ctx *ctx)
+{
+ struct ckpt_pgarr *pgarr;
+
+ list_for_each_entry(pgarr, &ctx->pgarr_list, list)
+ pgarr_release_pages(pgarr);
+ list_splice_init(&ctx->pgarr_list, &ctx->pgarr_pool);
+}
+
+/**************************************************************************
+ * Checkpoint
+ *
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * consider_private_page - return page pointer for dirty pages
+ * @vma - target vma
+ * @addr - page address
+ *
+ * Looks up the page that corresponds to the address in the vma, and
+ * returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_private_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return __get_dirty_page(vma, addr);
+}
+
+/**
+ * consider_shared_page - return page pointer for dirty pages
+ * @ino - inode of shmem object
+ * @idx - page index in shmem object
+ *
+ * Looks up the page that corresponds to the index in the shmem object,
+ * and returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_shared_page(struct inode *ino, unsigned long idx)
+{
+ struct page *page = NULL;
+ int ret;
+
+ /*
+ * Inspired by do_shmem_file_read(): very simplified version.
+ *
+ * FIXME: consolidate with do_shmem_file_read()
+ */
+
+ ret = shmem_getpage(ino, idx, &page, SGP_READ, NULL);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ /*
+ * Only care about dirty pages; shmem_getpage() only returns
+ * pages that have been allocated, so they must be dirty. The
+ * pages returned are locked and referenced.
+ */
+
+ if (page) {
+ unlock_page(page);
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(ino->i_mapping))
+ flush_dcache_page(page);
+ /*
+ * Mark the page accessed if we read the beginning.
+ */
+ mark_page_accessed(page);
+ }
+
+ return page;
+}
+
+/**
+ * vma_fill_pgarr - fill a page-array with addr/page tuples
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ * @start - start address (updated)
+ *
+ * Returns the number of pages collected
+ */
+static int vma_fill_pgarr(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma, struct inode *inode,
+ unsigned long *start, unsigned long end)
+{
+ unsigned long addr = *start;
+ struct ckpt_pgarr *pgarr;
+ int nr_used;
+ int cnt = 0;
+
+ BUG_ON(inode && vma);
+
+ if (vma)
+ down_read(&vma->vm_mm->mmap_sem);
+ do {
+ pgarr = pgarr_current(ctx);
+ if (!pgarr) {
+ cnt = -ENOMEM;
+ goto out;
+ }
+
+ nr_used = pgarr->nr_used;
+
+ while (addr < end) {
+ struct page *page;
+
+ if (vma)
+ page = consider_private_page(vma, addr);
+ else
+ page = consider_shared_page(inode, addr);
+
+ if (IS_ERR(page)) {
+ cnt = PTR_ERR(page);
+ goto out;
+ }
+
+ if (page) {
+ _ckpt_debug(CKPT_DPAGE,
+ "got page %#lx\n", addr);
+ pgarr->pages[pgarr->nr_used] = page;
+ pgarr->vaddrs[pgarr->nr_used] = addr;
+ pgarr->nr_used++;
+ }
+
+ if (vma)
+ addr += PAGE_SIZE;
+ else
+ addr++;
+
+ if (pgarr_is_full(pgarr))
+ break;
+ }
+
+ cnt += pgarr->nr_used - nr_used;
+
+ } while ((cnt < CKPT_PGARR_BATCH) && (addr < end));
+ out:
+ if (vma)
+ up_read(&vma->vm_mm->mmap_sem);
+ *start = addr;
+ return cnt;
+}
+
+/* dump contents of a page: use kmap_atomic() to avoid TLB flush */
+int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page)
+{
+ void *ptr;
+
+ ptr = kmap_atomic(page, KM_USER1);
+ memcpy(ctx->scratch_page, ptr, PAGE_SIZE);
+ kunmap_atomic(ptr, KM_USER1);
+
+ return ckpt_kwrite(ctx, ctx->scratch_page, PAGE_SIZE);
+}
+
+/**
+ * vma_dump_pages - dump pages listed in the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ *
+ * First dump all virtual addresses, followed by the contents of all pages
+ */
+static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
+{
+ struct ckpt_pgarr *pgarr;
+ int i, ret = 0;
+
+ if (!total)
+ return 0;
+
+ i = total * (sizeof(unsigned long) + PAGE_SIZE);
+ ret = ckpt_write_obj_type(ctx, NULL, i, CKPT_HDR_BUFFER);
+ if (ret < 0)
+ return ret;
+
+ list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+ ret = ckpt_kwrite(ctx, pgarr->vaddrs,
+ pgarr->nr_used * sizeof(unsigned long));
+ if (ret < 0)
+ return ret;
+ }
+
+ list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+ for (i = 0; i < pgarr->nr_used; i++) {
+ ret = checkpoint_dump_page(ctx, pgarr->pages[i]);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * checkpoint_memory_contents - dump contents of a memory region
+ * @ctx - checkpoint context
+ * @vma - vma to scan (--or--)
+ * @inode - inode to scan
+ *
+ * Collect lists of pages that needs to be dumped, and corresponding
+ * virtual addresses into ctx->pgarr_list page-array chain. Then dump
+ * the addresses, followed by the page contents.
+ */
+int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ struct inode *inode)
+{
+ struct ckpt_hdr_pgarr *h;
+ unsigned long addr, end;
+ int cnt, ret;
+
+ BUG_ON(vma && inode);
+
+ if (vma) {
+ addr = vma->vm_start;
+ end = vma->vm_end;
+ } else {
+ addr = 0;
+ end = PAGE_ALIGN(i_size_read(inode)) >> PAGE_CACHE_SHIFT;
+ }
+
+ /*
+ * Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
+ * in each round. Each iteration is divided into two steps:
+ *
+ * (1) scan: scan through the PTEs of the vma to collect the pages
+ * to dump (later we'll also make them COW), while keeping a list
+ * of pages and their corresponding addresses on ctx->pgarr_list.
+ *
+ * (2) dump: write out a header specifying how many pages, followed
+ * by the addresses of all pages in ctx->pgarr_list, followed by
+ * the actual contents of all pages. (Then, release the references
+ * to the pages and reset the page-array chain).
+ *
+ * (This split makes the logic simpler by first counting the pages
+ * that need saving. More importantly, it allows for a future
+ * optimization that will reduce application downtime by deferring
+ * the actual write-out of the data to after the application is
+ * allowed to resume execution).
+ *
+ * After dumping the entire contents, conclude with a header that
+ * specifies 0 pages to mark the end of the contents.
+ */
+
+ while (addr < end) {
+ cnt = vma_fill_pgarr(ctx, vma, inode, &addr, end);
+ if (cnt == 0)
+ break;
+ else if (cnt < 0)
+ return cnt;
+
+ ckpt_debug("collected %d pages\n", cnt);
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+ if (!h)
+ return -ENOMEM;
+
+ h->nr_pages = cnt;
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+ if (ret < 0)
+ return ret;
+
+ ret = vma_dump_pages(ctx, cnt);
+ if (ret < 0)
+ return ret;
+
+ pgarr_reset_all(ctx);
+ }
+
+ /* mark end of contents with header saying "0" pages */
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+ if (!h)
+ return -ENOMEM;
+ h->nr_pages = 0;
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+/**
+ * generic_vma_checkpoint - dump metadata of vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ */
+int generic_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type, int vma_objref,
+ int ino_objref)
+{
+ struct ckpt_hdr_vma *h;
+ int ret;
+
+ ckpt_debug("vma %#lx-%#lx flags %#lx type %d\n",
+ vma->vm_start, vma->vm_end, vma->vm_flags, type);
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+ if (!h)
+ return -ENOMEM;
+
+ h->vma_type = type;
+ h->vma_objref = vma_objref;
+ h->ino_objref = ino_objref;
+
+ if (vma->vm_file)
+ h->ino_size = i_size_read(vma->vm_file->f_dentry->d_inode);
+ else
+ h->ino_size = 0;
+
+ h->vm_start = vma->vm_start;
+ h->vm_end = vma->vm_end;
+ h->vm_page_prot = pgprot_val(vma->vm_page_prot);
+ h->vm_flags = vma->vm_flags;
+ h->vm_pgoff = vma->vm_pgoff;
+
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+/**
+ * private_vma_checkpoint - dump contents of private (anon, file) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ */
+int private_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type, int vma_objref)
+{
+ int ret;
+
+ BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+
+ ret = generic_vma_checkpoint(ctx, vma, type, vma_objref, 0);
+ if (ret < 0)
+ goto out;
+ ret = checkpoint_memory_contents(ctx, vma, NULL);
+ out:
+ return ret;
+}
+
+/**
+ * shmem_vma_checkpoint - dump contents of a shared (shmem) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @ino_objref: shmem inode object id
+ */
+int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type, int ino_objref)
+{
+ struct file *file = vma->vm_file;
+ int ret;
+
+ ckpt_debug("type %d, ino_ref %d\n", type, ino_objref);
+ BUG_ON(!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)));
+ BUG_ON(!file);
+
+ ret = generic_vma_checkpoint(ctx, vma, type, 0, ino_objref);
+ if (ret < 0)
+ goto out;
+ if (type == CKPT_VMA_SHM_ANON_SKIP)
+ goto out;
+ ret = checkpoint_memory_contents(ctx, NULL, file->f_dentry->d_inode);
+ out:
+ return ret;
+}
+
+/**
+ * anonymous_checkpoint - dump contents of private-anonymous vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ */
+static int anonymous_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma)
+{
+ /* should be private anonymous ... verify that this is the case */
+ BUG_ON(vma->vm_flags & VM_MAYSHARE);
+ BUG_ON(vma->vm_file);
+
+ return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
+}
+
+static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma, *next;
+ int map_count = 0;
+ int ret = 0;
+
+ vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+ if (!vma)
+ return -ENOMEM;
+
+ /*
+ * Must not hold mm->mmap_sem when writing to image file, so
+ * can't simply traverse the vma list. Instead, use find_vma()
+ * to get the @next and make a local "copy" of it.
+ */
+ while (1) {
+ down_read(&mm->mmap_sem);
+ next = find_vma(mm, vma->vm_end);
+ if (!next) {
+ up_read(&mm->mmap_sem);
+ break;
+ }
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ *vma = *next;
+ if (vma->vm_file)
+ get_file(vma->vm_file);
+ up_read(&mm->mmap_sem);
+
+ map_count++;
+
+ ckpt_debug("vma %#lx-%#lx flags %#lx\n",
+ vma->vm_start, vma->vm_end, vma->vm_flags);
+
+ if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ ckpt_err(ctx, -ENOSYS, "%(T)vma: bad flags (%#lx)\n",
+ vma->vm_flags);
+ ret = -ENOSYS;
+ break;
+ }
+
+ if (!vma->vm_ops)
+ ret = anonymous_checkpoint(ctx, vma);
+ else if (vma->vm_ops->checkpoint)
+ ret = (*vma->vm_ops->checkpoint)(ctx, vma);
+ else
+ ret = -ENOSYS;
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T)vma: failed\n");
+ break;
+ }
+ /*
+ * The file was collected, but not always checkpointed;
+ * be safe and mark as visited to appease leak detection
+ */
+ if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
+ ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ if (vma->vm_file)
+ fput(vma->vm_file);
+
+ kfree(vma);
+
+ return ret < 0 ? ret : map_count;
+}
+
+#define CKPT_AT_SZ (AT_VECTOR_SIZE * sizeof(u64))
+/*
+ * We always write saved_auxv out as an array of u64s, though it is
+ * an array of u32s on 32-bit arch.
+ */
+static int ckpt_write_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ int i, ret;
+ u64 *buf = kzalloc(CKPT_AT_SZ, GFP_KERNEL);
+
+ if (!buf)
+ return -ENOMEM;
+ for (i = 0; i < AT_VECTOR_SIZE; i++)
+ buf[i] = mm->saved_auxv[i];
+ ret = ckpt_write_buffer(ctx, buf, CKPT_AT_SZ);
+ kfree(buf);
+ return ret;
+}
+
+/* Defined by arch */
+extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
+
+static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct ckpt_hdr_mm *h;
+ struct file *exe_file = NULL;
+ int ret;
+
+ if (check_for_outstanding_aio(mm)) {
+ ckpt_err(ctx, -EBUSY, "(%T)Outstanding aio\n");
+ return -EBUSY;
+ }
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
+ if (!h)
+ return -ENOMEM;
+
+ down_read(&mm->mmap_sem);
+
+ h->flags = mm->flags;
+ h->def_flags = mm->def_flags;
+
+ h->start_code = mm->start_code;
+ h->end_code = mm->end_code;
+ h->start_data = mm->start_data;
+ h->end_data = mm->end_data;
+ h->start_brk = mm->start_brk;
+ h->brk = mm->brk;
+ h->start_stack = mm->start_stack;
+ h->arg_start = mm->arg_start;
+ h->arg_end = mm->arg_end;
+ h->env_start = mm->env_start;
+ h->env_end = mm->env_end;
+
+ h->map_count = mm->map_count;
+
+ if (mm->exe_file) { /* checkpoint the ->exe_file */
+ exe_file = mm->exe_file;
+ get_file(exe_file);
+ }
+
+ /*
+ * Drop mm->mmap_sem before writing data to checkpoint image
+ * to avoid reverse locking order (inode must come before mm).
+ */
+ up_read(&mm->mmap_sem);
+
+ if (exe_file) {
+ h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
+ if (h->exe_objref < 0) {
+ ret = h->exe_objref;
+ goto out;
+ }
+ }
+
+ ret = ckpt_write_obj(ctx, &h->h);
+ if (ret < 0)
+ goto out;
+
+ ret = ckpt_write_auxv(ctx, mm);
+ if (ret < 0)
+ return ret;
+
+ ret = checkpoint_vmas(ctx, mm);
+ if (ret != h->map_count && ret >= 0)
+ ret = -EBUSY; /* checkpoint mm leak */
+ if (ret < 0)
+ goto out;
+
+ ret = checkpoint_mm_context(ctx, mm);
+ out:
+ if (exe_file)
+ fput(exe_file);
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr)
+{
+ return do_checkpoint_mm(ctx, (struct mm_struct *) ptr);
+}
+
+int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+ struct mm_struct *mm;
+ int objref;
+
+ mm = get_task_mm(t);
+ objref = checkpoint_obj(ctx, mm, CKPT_OBJ_MM);
+ mmput(mm);
+
+ return objref;
+}
+
+/***********************************************************************
+ * Collect
+ */
+
+static int collect_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct file *file;
+ int ret;
+
+ /* if already exists (ret == 0), nothing to do */
+ ret = ckpt_obj_collect(ctx, mm, CKPT_OBJ_MM);
+ if (ret <= 0)
+ return ret;
+
+ /* if first time for this mm (ret > 0), proceed inside */
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file) {
+ ret = ckpt_collect_file(ctx, mm->exe_file);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T)mm: collect exe_file\n");
+ goto out;
+ }
+ }
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ file = vma->vm_file;
+ if (!file)
+ continue;
+ ret = ckpt_collect_file(ctx, file);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T)mm: collect vm_file\n");
+ break;
+ }
+ }
+ out:
+ up_read(&mm->mmap_sem);
+ return ret;
+
+}
+
+int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(t);
+ ret = collect_mm(ctx, mm);
+ mmput(mm);
+
+ return ret;
+}
+
+/***********************************************************************
+ * Restart
+ *
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read into the address space of the current process.
+ */
+
+/**
+ * read_pages_vaddrs - read addresses of pages to page-array chain
+ * @ctx - restart context
+ * @nr_pages - number of address to read
+ */
+static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
+{
+ struct ckpt_pgarr *pgarr;
+ unsigned long *vaddrp;
+ int nr, ret;
+
+ while (nr_pages) {
+ pgarr = pgarr_current(ctx);
+ if (!pgarr)
+ return -ENOMEM;
+ nr = pgarr_nr_free(pgarr);
+ if (nr > nr_pages)
+ nr = nr_pages;
+ vaddrp = &pgarr->vaddrs[pgarr->nr_used];
+ ret = ckpt_kread(ctx, vaddrp, nr * sizeof(unsigned long));
+ if (ret < 0)
+ return ret;
+ pgarr->nr_used += nr;
+ nr_pages -= nr;
+ }
+ return 0;
+}
+
+/*
+ * restore_read_page - fill one page with data from the checkpoint image
+ * @ctx: restart context (provides a pre-allocated scratch page)
+ * @page: target page to populate
+ *
+ * Reads PAGE_SIZE bytes into the context's scratch buffer, then copies
+ * them into @page through a temporary atomic kmap.  Returns 0 on
+ * success, negative on read failure.
+ */
+int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
+{
+ void *ptr;
+ int ret;
+
+ ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
+ if (ret < 0)
+ return ret;
+
+ /* KM_USER1: page may be kmapped while caller holds KM_USER0 */
+ ptr = kmap_atomic(page, KM_USER1);
+ memcpy(ptr, ctx->scratch_page, PAGE_SIZE);
+ kunmap_atomic(ptr, KM_USER1);
+
+ return 0;
+}
+
+/*
+ * bring_private_page - fault in one writable page of current's mm
+ * @addr: user virtual address of the page
+ *
+ * Uses get_user_pages() on the current task to fault the page in and
+ * take a reference on it.  Returns the page or ERR_PTR on failure.
+ * NOTE(review): if get_user_pages() returned 0 (no page, no error),
+ * *page would be returned uninitialized -- presumably the single-page,
+ * force=1 call cannot return 0 here; confirm against the GUP contract.
+ */
+static struct page *bring_private_page(unsigned long addr)
+{
+ struct page *page;
+ int ret;
+
+ ret = get_user_pages(current, current->mm, addr, 1, 1, 1, &page, NULL);
+ if (ret < 0)
+ page = ERR_PTR(ret);
+ return page;
+}
+
+/*
+ * bring_shared_page - look up (or create) a page in a shmem inode
+ * @idx: page index within the shmem object
+ * @ino: backing shmem inode
+ *
+ * Fetches the page via shmem_getpage() with intent to write; the page
+ * comes back locked and is unlocked before returning.  Returns the page
+ * (possibly NULL), or ERR_PTR on failure.
+ */
+static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
+{
+ struct page *page = NULL;
+ int ret;
+
+ ret = shmem_getpage(ino, idx, &page, SGP_WRITE, NULL);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (page)
+ unlock_page(page);
+ return page;
+}
+
+/**
+ * read_pages_contents - read in data of pages in page-array chain
+ * @ctx - restart context
+ */
+/**
+ * read_pages_contents - read in data of pages in page-array chain
+ * @ctx - restart context
+ * @inode - backing shmem inode for shared mappings, NULL for private
+ *
+ * Walks the page-array chain in reverse (the order the addresses were
+ * queued), faults each page into the current address space and fills it
+ * with data from the checkpoint image.  Returns 0 or negative error.
+ *
+ * Note: the archived patch had "&current" mangled into "currency-sign t"
+ * by HTML entity decoding (&curren;); restored here.
+ */
+static int read_pages_contents(struct ckpt_ctx *ctx, struct inode *inode)
+{
+ struct ckpt_pgarr *pgarr;
+ unsigned long *vaddrs;
+ int i, ret;
+
+ list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+ vaddrs = pgarr->vaddrs;
+ for (i = 0; i < pgarr->nr_used; i++) {
+ struct page *page;
+
+ /* TODO: do in chunks to reduce mmap_sem overhead */
+ _ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
+ down_read(&current->mm->mmap_sem);
+ if (inode)
+ page = bring_shared_page(vaddrs[i], inode);
+ else
+ page = bring_private_page(vaddrs[i]);
+ up_read(&current->mm->mmap_sem);
+
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ ret = restore_read_page(ctx, page);
+ page_cache_release(page);
+
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * restore_memory_contents - restore contents of a memory region
+ * @ctx - restart context
+ * @inode - backing inode
+ *
+ * Reads a header that specifies how many pages will follow, then reads
+ * a list of virtual addresses into ctx->pgarr_list page-array chain,
+ * followed by the actual contents of the corresponding pages. Iterates
+ * these steps until reaching a header specifying "0" pages, which marks
+ * the end of the contents.
+ */
+/**
+ * restore_memory_contents - restore contents of a memory region
+ * @ctx - restart context
+ * @inode - backing inode
+ *
+ * Reads a header that specifies how many pages will follow, then reads
+ * a list of virtual addresses into ctx->pgarr_list page-array chain,
+ * followed by the actual contents of the corresponding pages. Iterates
+ * these steps until reaching a header specifying "0" pages, which marks
+ * the end of the contents.
+ */
+int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode)
+{
+ struct ckpt_hdr_pgarr *h;
+ unsigned long nr_pages;
+ int len, ret = 0;
+
+ while (1) {
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+ if (IS_ERR(h)) {
+ /*
+ * Previously this broke out with ret still 0,
+ * silently swallowing the read error; propagate it.
+ */
+ ret = PTR_ERR(h);
+ break;
+ }
+
+ ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
+
+ nr_pages = h->nr_pages;
+ ckpt_hdr_put(ctx, h);
+
+ /* a zero-page header terminates the contents stream */
+ if (!nr_pages)
+ break;
+
+ /* announce the expected payload length to the input layer */
+ len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
+ ret = _ckpt_read_buffer(ctx, NULL, len);
+ if (ret < 0)
+ break;
+
+ ret = read_pages_vaddrs(ctx, nr_pages);
+ if (ret < 0)
+ break;
+ ret = read_pages_contents(ctx, inode);
+ if (ret < 0)
+ break;
+ pgarr_reset_all(ctx);
+ }
+
+ return ret;
+}
+
+/**
+ * calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ */
+/**
+ * calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ *
+ * Translates VM_READ/VM_WRITE/VM_EXEC into the corresponding PROT_*
+ * bits for re-creating the mapping with mmap at restart.
+ */
+static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+ unsigned long vm_prot = 0;
+
+ if (orig_vm_flags & VM_READ)
+ vm_prot |= PROT_READ;
+ if (orig_vm_flags & VM_WRITE)
+ vm_prot |= PROT_WRITE;
+ if (orig_vm_flags & VM_EXEC)
+ vm_prot |= PROT_EXEC;
+ /*
+ * NOTE(review): this tests a PROT_* constant against VM_* flags;
+ * whichever VM_ bit shares PROT_SEM's value is what is really
+ * being checked -- verify this mapping is intended.
+ */
+ if (orig_vm_flags & PROT_SEM) /* only (?) with IPC-SHM */
+ vm_prot |= PROT_SEM;
+
+ return vm_prot;
+}
+
+/**
+ * calc_map_flags_bits - convert vm_flags to mmap flags
+ * orig_vm_flags: source vm_flags
+ */
+/**
+ * calc_map_flags_bits - convert vm_flags to mmap flags
+ * orig_vm_flags: source vm_flags
+ *
+ * Builds the MAP_* flag word used to re-create the mapping at the
+ * exact original address (hence always MAP_FIXED), preserving the
+ * grows-down, deny-write, executable and shared/private properties.
+ */
+static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+ unsigned long map_flags;
+
+ /* restart always maps back at the recorded address */
+ map_flags = MAP_FIXED;
+ map_flags |= (orig_vm_flags & VM_MAYSHARE) ? MAP_SHARED : MAP_PRIVATE;
+ if (orig_vm_flags & VM_GROWSDOWN)
+ map_flags |= MAP_GROWSDOWN;
+ if (orig_vm_flags & VM_DENYWRITE)
+ map_flags |= MAP_DENYWRITE;
+ if (orig_vm_flags & VM_EXECUTABLE)
+ map_flags |= MAP_EXECUTABLE;
+
+ return map_flags;
+}
+
+/**
+ * generic_vma_restore - restore a vma
+ * @mm - address space
+ * @file - file to map (NULL for anonymous)
+ * @h - vma header data
+ */
+/**
+ * generic_vma_restore - restore a vma
+ * @mm - address space
+ * @file - file to map (NULL for anonymous)
+ * @h - vma header data
+ *
+ * Re-creates the region described by @h via do_mmap_pgoff() under
+ * mmap_sem held for writing.  Returns the mapped address on success;
+ * callers check failure with IS_ERR() on the returned value.
+ */
+unsigned long generic_vma_restore(struct mm_struct *mm,
+ struct file *file,
+ struct ckpt_hdr_vma *h)
+{
+ unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
+ unsigned long addr;
+
+ /* sanity-check untrusted header values before mapping */
+ if (h->vm_end < h->vm_start)
+ return -EINVAL;
+ if (h->vma_objref < 0)
+ return -EINVAL;
+
+ vm_start = h->vm_start;
+ vm_pgoff = h->vm_pgoff;
+ vm_size = h->vm_end - h->vm_start;
+ vm_prot = calc_map_prot_bits(h->vm_flags);
+ vm_flags = calc_map_flags_bits(h->vm_flags);
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap_pgoff(file, vm_start, vm_size,
+ vm_prot, vm_flags, vm_pgoff);
+ up_write(&mm->mmap_sem);
+ ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
+ vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+
+ return addr;
+}
+
+/**
+ * private_vma_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @file: file to use for mapping
+ * @h - vma header data
+ */
+/**
+ * private_vma_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @file: file to use for mapping
+ * @h - vma header data
+ *
+ * Rejects headers claiming a shared mapping, re-creates the private
+ * vma, then reads its page contents from the image into the current
+ * address space (NULL inode selects the private-page path).
+ */
+int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct file *file, struct ckpt_hdr_vma *h)
+{
+ unsigned long addr;
+
+ /* a private restore must not be asked to build a shared vma */
+ if (h->vm_flags & (VM_SHARED | VM_MAYSHARE))
+ return -EINVAL;
+
+ addr = generic_vma_restore(mm, file, h);
+ if (IS_ERR((void *) addr))
+ return PTR_ERR((void *) addr);
+
+ return restore_memory_contents(ctx, NULL);
+}
+
+/**
+ * anon_private_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @h - vma header data
+ */
+/**
+ * anon_private_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @h - vma header data
+ *
+ * Anonymous-private variant: normalizes the page offset, then defers
+ * to private_vma_restore() with no backing file.
+ */
+static int anon_private_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ /*
+ * vm_pgoff for anonymous mapping is the "global" page
+ * offset (namely from addr 0x0), so we force a zero
+ */
+ h->vm_pgoff = 0;
+
+ return private_vma_restore(ctx, mm, NULL, h);
+}
+
+/*
+ * bad_vma_restore - placeholder for vma types that must never be
+ * restored through this table (their state is rebuilt elsewhere);
+ * always fails with -EINVAL.
+ */
+static int bad_vma_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ return -EINVAL;
+}
+
+/* callbacks to restore vma per its type: */
+struct restore_vma_ops {
+ char *vma_name;	/* human-readable name for debug output */
+ enum vma_type vma_type;	/* must equal the entry's table index */
+ int (*restore) (struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *ptr);	/* NULL: vma is skipped */
+};
+
+/*
+ * Dispatch table indexed by vma_type; restore_vma() BUG()s if an
+ * entry's vma_type disagrees with its position, so keep the order
+ * in sync with the enum.
+ */
+static struct restore_vma_ops restore_vma_ops[] = {
+ /* ignored vma */
+ {
+ .vma_name = "IGNORE",
+ .vma_type = CKPT_VMA_IGNORE,
+ .restore = NULL,
+ },
+ /* special mapping (vdso) */
+ {
+ .vma_name = "VDSO",
+ .vma_type = CKPT_VMA_VDSO,
+ .restore = special_mapping_restore,
+ },
+ /* anonymous private */
+ {
+ .vma_name = "ANON PRIVATE",
+ .vma_type = CKPT_VMA_ANON,
+ .restore = anon_private_restore,
+ },
+ /* file-mapped private */
+ {
+ .vma_name = "FILE PRIVATE",
+ .vma_type = CKPT_VMA_FILE,
+ .restore = filemap_restore,
+ },
+ /* anonymous shared */
+ {
+ .vma_name = "ANON SHARED",
+ .vma_type = CKPT_VMA_SHM_ANON,
+ .restore = shmem_restore,
+ },
+ /* anonymous shared (skipped) */
+ {
+ .vma_name = "ANON SHARED (skip)",
+ .vma_type = CKPT_VMA_SHM_ANON_SKIP,
+ .restore = shmem_restore,
+ },
+ /* file-mapped shared */
+ {
+ .vma_name = "FILE SHARED",
+ .vma_type = CKPT_VMA_SHM_FILE,
+ .restore = filemap_restore,
+ },
+ /* sysvipc shared */
+ {
+ .vma_name = "IPC SHARED",
+ .vma_type = CKPT_VMA_SHM_IPC,
+ /* ipc inode itself is restore by restore_ipc_ns()... */
+ .restore = bad_vma_restore,
+
+ },
+ /* sysvipc shared (skip) */
+ {
+ .vma_name = "IPC SHARED (skip)",
+ .vma_type = CKPT_VMA_SHM_IPC_SKIP,
+ .restore = ipcshm_restore,
+ },
+};
+
+/**
+ * restore_vma - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ */
+/**
+ * restore_vma - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ *
+ * Reads one CKPT_HDR_VMA header, validates it, and dispatches to the
+ * per-type restore callback in restore_vma_ops[].  Returns 0 or a
+ * negative error.
+ */
+static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct ckpt_hdr_vma *h;
+ struct restore_vma_ops *ops;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d inoref %d\n",
+ (unsigned long) h->vm_start, (unsigned long) h->vm_end,
+ (unsigned long) h->vm_flags, (int) h->vma_type,
+ (int) h->vma_objref, (int) h->ino_objref);
+
+ /* sanity-check the untrusted header before using it */
+ ret = -EINVAL;
+ if (h->vm_end < h->vm_start)
+ goto out;
+ if (h->vma_objref < 0 || h->ino_objref < 0)
+ goto out;
+ if (h->vma_type >= CKPT_VMA_MAX)
+ goto out;
+ if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ /*
+ * Was "return -ENOSYS;", which skipped ckpt_hdr_put()
+ * and leaked the header reference; exit via out:.
+ */
+ ret = -ENOSYS;
+ goto out;
+ }
+
+ ops = &restore_vma_ops[h->vma_type];
+
+ /* make sure we don't change this accidentally */
+ BUG_ON(ops->vma_type != h->vma_type);
+
+ if (ops->restore) {
+ ckpt_debug("vma type %s\n", ops->vma_name);
+ ret = ops->restore(ctx, mm, h);
+ } else {
+ ckpt_debug("vma ignored\n");
+ ret = 0;
+ }
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+/*
+ * ckpt_read_auxv - restore the mm's saved auxiliary vector
+ * @ctx: restart context
+ * @mm: target address space
+ *
+ * Reads the checkpointed auxv as 64-bit values, rejects any entry that
+ * does not fit an unsigned long on this architecture, and copies the
+ * sanitized vector into mm->saved_auxv.  Returns 0 or negative error.
+ */
+static int ckpt_read_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ int i, ret;
+ u64 *buf = kmalloc(CKPT_AT_SZ, GFP_KERNEL);
+
+ if (!buf)
+ return -ENOMEM;
+ ret = _ckpt_read_buffer(ctx, buf, CKPT_AT_SZ);
+ if (ret < 0)
+ goto out;
+
+ /* image is 64-bit; reject values that overflow a native long */
+ ret = -E2BIG;
+ for (i = 0; i < AT_VECTOR_SIZE; i++)
+ if (buf[i] > (u64) ULONG_MAX)
+ goto out;
+
+ for (i = 0; i < AT_VECTOR_SIZE - 1; i++)
+ mm->saved_auxv[i] = buf[i];
+ /* sanitize the input: force AT_NULL in last entry */
+ mm->saved_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+
+ ret = 0;
+ out:
+ kfree(buf);
+ return ret;
+}
+
+extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
+
+/*
+ * do_restore_mm - rebuild current's mm from the checkpoint image
+ * @ctx: restart context
+ *
+ * Reads the CKPT_HDR_MM header, validates it, then DESTROYS the current
+ * mm's mappings and repopulates them: flags, segment boundaries,
+ * exe_file, auxv, each vma, and finally the arch mm context.  Past
+ * destroy_mm() there is no way back -- errors leave a broken mm.
+ * Returns the mm (with an extra mm_users reference for restore_obj())
+ * or ERR_PTR on failure.
+ */
+static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_mm *h;
+ struct mm_struct *mm = NULL;
+ struct file *file;
+ unsigned int nr;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
+ if (IS_ERR(h))
+ return (struct mm_struct *) h;
+
+ ckpt_debug("map_count %d\n", h->map_count);
+
+ /* XXX need more sanity checks */
+
+ /* validate untrusted header fields before touching the mm */
+ ret = -EINVAL;
+ if ((h->start_code > h->end_code) ||
+ (h->start_data > h->end_data))
+ goto out;
+ if (h->exe_objref < 0)
+ goto out;
+ if (h->def_flags & ~VM_LOCKED)
+ goto out;
+ if (h->flags & ~(MMF_DUMP_FILTER_MASK |
+ ((1 << MMF_DUMP_FILTER_BITS) - 1)))
+ goto out;
+
+ mm = current->mm;
+
+ /* point of no return -- destruct current mm */
+ down_write(&mm->mmap_sem);
+ ret = destroy_mm(mm);
+ if (ret < 0) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ mm->flags = h->flags;
+ mm->def_flags = h->def_flags;
+
+ mm->start_code = h->start_code;
+ mm->end_code = h->end_code;
+ mm->start_data = h->start_data;
+ mm->end_data = h->end_data;
+ mm->start_brk = h->start_brk;
+ mm->brk = h->brk;
+ mm->start_stack = h->start_stack;
+ mm->arg_start = h->arg_start;
+ mm->arg_end = h->arg_end;
+ mm->env_start = h->env_start;
+ mm->env_end = h->env_end;
+
+ /* restore the ->exe_file */
+ if (h->exe_objref) {
+ file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
+ if (IS_ERR(file)) {
+ up_write(&mm->mmap_sem);
+ ret = PTR_ERR(file);
+ goto out;
+ }
+ set_mm_exe_file(mm, file);
+ }
+ up_write(&mm->mmap_sem);
+
+ ret = ckpt_read_auxv(ctx, mm);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "Error restoring auxv\n");
+ goto out;
+ }
+
+ /* re-create each checkpointed vma, in image order */
+ for (nr = h->map_count; nr; nr--) {
+ ret = restore_vma(ctx, mm);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = restore_mm_context(ctx, mm);
+ out:
+ ckpt_hdr_put(ctx, h);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ /* restore_obj() expect an extra reference */
+ atomic_inc(&mm->mm_users);
+ return mm;
+}
+
+/* type-erasing wrapper: the object-table restore hook wants void *. */
+static void *restore_mm(struct ckpt_ctx *ctx)
+{
+ struct mm_struct *mm = do_restore_mm(ctx);
+
+ return (void *) mm;
+}
+
+/*
+ * restore_obj_mm - attach a restored mm (by objref) to the current task
+ * @ctx: restart context
+ * @mm_objref: object-table reference of the mm
+ *
+ * Fetches the mm from the object table; if it is not already current's
+ * mm, installs it via exec_mmap() and takes an extra mm_users reference.
+ * NOTE(review): the mm == current->mm early return takes no extra
+ * reference -- presumably the table already accounts for it; confirm.
+ */
+int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
+ if (mm == current->mm)
+ return 0;
+
+ ret = exec_mmap(mm);
+ if (ret < 0)
+ return ret;
+
+ atomic_inc(&mm->mm_users);
+ return 0;
+}
static int obj_mm_grab(void *ptr)
{
atomic_inc(&((struct mm_struct *) ptr)->mm_users);
diff --git a/mm/shmem.c b/mm/shmem.c
index e103155..3696342 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
-#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h>
static struct vfsmount *shm_mnt;
--
1.6.3.3
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list