[Devel] [RFC][PATCH 3/4] checkpoint/restart: memory management
Dave Hansen
dave at linux.vnet.ibm.com
Thu Aug 7 15:40:37 PDT 2008
For each vma, there is a 'struct cr_hdr_vma'; if the vma is file-mapped,
it will be followed by the file name. The cr_hdr_vma->npages will tell
how many pages were dumped for this vma. Then it will be followed
by the actual data: first a dump of the addresses of all dumped
pages (npages entries) followed by a dump of the contents of all
dumped pages (npages pages). Then will come the next vma and so on.
I guess I could also separate out the x86-specific bits here, but
they're pretty small, comparatively.
Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
---
linux-2.6.git-dave/arch/x86/kernel/ldt.c | 2
linux-2.6.git-dave/ckpt/Makefile | 2
linux-2.6.git-dave/ckpt/ckpt_arch.h | 2
linux-2.6.git-dave/ckpt/ckpt_hdr.h | 21 +
linux-2.6.git-dave/ckpt/ckpt_mem.c | 388 ++++++++++++++++++++++++++++++
linux-2.6.git-dave/ckpt/ckpt_mem.h | 32 ++
linux-2.6.git-dave/ckpt/rstr_mem.c | 354 +++++++++++++++++++++++++++
linux-2.6.git-dave/ckpt/sys.c | 3
linux-2.6.git-dave/ckpt/x86.c | 83 ++++++
linux-2.6.git-dave/include/asm-x86/ckpt.h | 5
linux-2.6.git-dave/include/asm-x86/desc.h | 3
11 files changed, 892 insertions(+), 3 deletions(-)
diff -puN arch/x86/kernel/ldt.c~memory_part arch/x86/kernel/ldt.c
--- linux-2.6.git/arch/x86/kernel/ldt.c~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/arch/x86/kernel/ldt.c 2008-08-05 08:38:00.000000000 -0700
@@ -183,7 +183,7 @@ static int read_default_ldt(void __user
return bytecount;
}
-static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct desc_struct ldt;
diff -puN ckpt/ckpt_arch.h~memory_part ckpt/ckpt_arch.h
--- linux-2.6.git/ckpt/ckpt_arch.h~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/ckpt/ckpt_arch.h 2008-08-05 08:37:29.000000000 -0700
@@ -4,3 +4,5 @@ int cr_write_thread(struct cr_ctx *ctx,
int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t);
int cr_read_thread(struct cr_ctx *ctx);
int cr_read_cpu(struct cr_ctx *ctx);
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm);
+int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm);
diff -puN ckpt/ckpt_hdr.h~memory_part ckpt/ckpt_hdr.h
--- linux-2.6.git/ckpt/ckpt_hdr.h~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/ckpt/ckpt_hdr.h 2008-08-05 08:37:29.000000000 -0700
@@ -67,3 +67,24 @@ struct cr_hdr_task {
};
+
+/*
+ * image header describing an mm: filled from the task's mm_struct by
+ * cr_write_mm() and consumed by cr_read_mm()
+ */
+struct cr_hdr_mm {
+	__u32 tag;	/* sharing identifier */
+	__u64 start_code, end_code, start_data, end_data;
+	__u64 start_brk, brk, start_stack;
+	__u64 arg_start, arg_end, env_start, env_end;
+	/*
+	 * number of vma records that follow. mm->map_count is an int and
+	 * the default per-process mapping limit is well above 32767, so a
+	 * 16-bit field would wrap negative and make restart reject the image
+	 */
+	__s32 map_count;
+};
+
+/*
+ * image header describing one vma: filled by cr_write_vma() and consumed
+ * by cr_read_vma(). it is followed by the file name (when namelen != 0),
+ * then npages addresses, then npages pages of data
+ */
+struct cr_hdr_vma {
+	__u32 how;	/* vma subtype: CR_VMA_ANON or CR_VMA_FILE */
+
+	__u64 vm_start;
+	__u64 vm_end;
+	__u64 vm_page_prot;
+	__u64 vm_flags;
+	__u64 vm_pgoff;
+
+	/*
+	 * number of dumped pages. a vma with more than 32767 dumped pages
+	 * (128MB with 4K pages) would overflow a 16-bit count, so use 32
+	 * bits; kept signed because restart rejects negative values
+	 */
+	__s32 npages;
+	__s16 namelen;	/* length of the file name that follows, 0 if none */
+};
diff -puN /dev/null ckpt/ckpt_mem.c
--- /dev/null 2007-04-11 11:48:27.000000000 -0700
+++ linux-2.6.git-dave/ckpt/ckpt_mem.c 2008-08-05 08:37:29.000000000 -0700
@@ -0,0 +1,388 @@
+/*
+ * Checkpoint memory contents
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/mm_types.h>
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+#include "ckpt_arch.h"
+#include "ckpt_mem.h"
+
+/*
+ * utilities to alloc, free, and handle 'struct cr_pgarr'
+ * (common to ckpt_mem.c and rstr_mem.c)
+ */
+
+/* allocation order passed to __get_free_pages() for each pgarr buffer */
+#define CR_ORDER_PGARR 0
+/* number of pointer-sized entries that fit in one such buffer */
+#define CR_PGARR_TOTAL ((PAGE_SIZE << CR_ORDER_PGARR) / sizeof(void *))
+
+/*
+ * reset one page-array to the empty state; during checkpoint this also
+ * drops the reference held on every page listed in it
+ */
+void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr)
+{
+	/* only checkpoint keeps references to pages */
+	if (ctx->flags & CR_CTX_CKPT) {
+		int idx = pgarr->nused;
+
+		CR_PRINTK("release pages (nused %d)\n", pgarr->nused);
+		/* walk the used entries from last to first */
+		while (idx > 0) {
+			idx--;
+			page_cache_release(pgarr->pages[idx]);
+		}
+	}
+
+	pgarr->nleft = CR_PGARR_TOTAL;
+	pgarr->nused = 0;
+}
+
+/* reset every page-array on the ctx chain (see _cr_pgarr_release) */
+void cr_pgarr_release(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *cur = ctx->pgarr;
+
+	while (cur) {
+		_cr_pgarr_release(ctx, cur);
+		cur = cur->next;
+	}
+}
+
+/*
+ * free a chain of page-arrays: drop any page references still held, then
+ * free each array's addrs/pages buffers and the descriptor itself
+ */
+void cr_pgarr_free(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *pgarr, *pgnxt;
+
+	for (pgarr = ctx->pgarr; pgarr; pgarr = pgnxt) {
+		/* fetch the link before the descriptor is freed */
+		pgnxt = pgarr->next;
+		_cr_pgarr_release(ctx, pgarr);
+		/* free this array's own buffers, not those of the chain head */
+		free_pages((unsigned long) pgarr->addrs, CR_ORDER_PGARR);
+		free_pages((unsigned long) pgarr->pages, CR_ORDER_PGARR);
+		kfree(pgarr);
+	}
+
+	/* the chain is gone: do not leave dangling pointers in the ctx */
+	ctx->pgarr = NULL;
+	ctx->pgcur = NULL;
+}
+
+/*
+ * advance to the next page-array on the chain, allocating one if needed
+ * @ctx - checkpoint/restart context
+ * @pgnew - where to store the pointer to a newly allocated page-array
+ *
+ * if the current page-array (ctx->pgcur) already has a successor it is
+ * reused; otherwise a new page-array is allocated, stored in *pgnew and
+ * made current. returns the page-array to use, or NULL if out of memory
+ */
+struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew)
+{
+	struct cr_pgarr *pgarr = ctx->pgcur;
+
+	if (pgarr && pgarr->next) {
+		ctx->pgcur = pgarr->next;
+		return pgarr->next;
+	}
+
+	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+	if (!pgarr)
+		return NULL;
+
+	pgarr->nused = 0;
+	pgarr->nleft = CR_PGARR_TOTAL;
+	pgarr->addrs = (unsigned long *)
+		__get_free_pages(GFP_KERNEL, CR_ORDER_PGARR);
+	pgarr->pages = (struct page **)
+		__get_free_pages(GFP_KERNEL, CR_ORDER_PGARR);
+
+	if (unlikely(!pgarr->addrs || !pgarr->pages)) {
+		/*
+		 * either buffer may be the one that failed; free_pages()
+		 * ignores a zero address, so release both to avoid leaking
+		 * the one that was allocated
+		 */
+		free_pages((unsigned long) pgarr->addrs, CR_ORDER_PGARR);
+		free_pages((unsigned long) pgarr->pages, CR_ORDER_PGARR);
+		kfree(pgarr);
+		return NULL;
+	}
+
+	*pgnew = pgarr;
+	ctx->pgcur = pgarr;
+	return pgarr;
+}
+
+/*
+ * return a page-array with room for at least one more entry: the current
+ * one if it is not full, otherwise whatever cr_pgarr_alloc() provides
+ * (NULL if that fails)
+ */
+struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *cur = ctx->pgcur;
+
+	if (likely(cur->nleft))
+		return cur;
+
+	return cr_pgarr_alloc(ctx, &cur->next);
+}
+
+/*
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
+ * @ctx - checkpoint context
+ * @pgarr - page-array to fill
+ * @vma - vma to scan
+ * @start - start address (updated)
+ *
+ * Scans from *start until the end of the vma or until @pgarr has no room
+ * left. The zero page and clean pages of file-backed vmas are skipped;
+ * every other page is recorded with an extra reference taken on it.
+ *
+ * Returns the number of tuples added (the caller adjusts pgarr->nused and
+ * pgarr->nleft), or a negative error. On error the references taken by
+ * this call are dropped and *start is left unchanged.
+ */
+static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
+			     struct vm_area_struct *vma, unsigned long *start)
+{
+	unsigned long end = vma->vm_end;
+	unsigned long addr = *start;
+	struct page **pagep;
+	unsigned long *addrp;
+	int cow, nr, ret = 0;
+
+	nr = pgarr->nleft;
+	pagep = &pgarr->pages[pgarr->nused];
+	addrp = &pgarr->addrs[pgarr->nused];
+	cow = !!vma->vm_file;
+
+	while (addr < end) {
+		struct page *page;
+		int fault;
+
+		/* simplified version of get_user_pages(): already have vma,
+		 * only need FOLL_TOUCH, and (for now) ignore fault stats */
+
+		cond_resched();
+		while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
+			fault = handle_mm_fault(vma->vm_mm, vma, addr, 0);
+			if (fault & VM_FAULT_ERROR) {
+				if (fault & VM_FAULT_OOM)
+					ret = -ENOMEM;
+				else if (fault & VM_FAULT_SIGBUS)
+					ret = -EFAULT;
+				else
+					BUG();
+				break;
+			}
+			cond_resched();
+		}
+
+		/* the fault failed, so 'page' is NULL: stop before using it */
+		if (ret < 0)
+			break;
+
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			break;
+		}
+
+		if (page == ZERO_PAGE(0))
+			page = NULL;	/* zero page: ignore */
+		else if (cow && page_mapping(page) != NULL)
+			page = NULL;	/* clean cow: ignore */
+		else {
+			get_page(page);
+			*(addrp++) = addr;
+			*(pagep++) = page;
+			if (--nr == 0) {
+				/* array is full: resume after this page */
+				addr += PAGE_SIZE;
+				break;
+			}
+		}
+
+		addr += PAGE_SIZE;
+	}
+
+	if (unlikely(ret < 0)) {
+		/* undo the references taken during this call */
+		nr = pgarr->nleft - nr;
+		while (nr--)
+			page_cache_release(*(--pagep));
+		return ret;
+	}
+
+	*start = addr;
+	return (pgarr->nleft - nr);
+}
+
+/**
+ * cr_vma_scan_pages - collect the pages of a vma that need to be dumped
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ *
+ * the addr/page tuples are accumulated on the ctx->pgarr page-array chain.
+ * returns the number of pages collected, or a negative error
+ */
+static int cr_vma_scan_pages(struct cr_ctx *ctx, struct vm_area_struct *vma)
+{
+	const unsigned long end = vma->vm_end;
+	unsigned long pos = vma->vm_start;
+	int total = 0;
+
+	while (pos < end) {
+		struct cr_pgarr *arr = cr_pgarr_prep(ctx);
+		int added;
+
+		if (!arr)
+			return -ENOMEM;
+
+		added = cr_vma_fill_pgarr(ctx, arr, vma, &pos);
+		if (added < 0)
+			return added;
+
+		/* account for the tuples the helper just stored */
+		arr->nleft -= added;
+		arr->nused += added;
+		total += added;
+	}
+
+	CR_PRINTK("total %d\n", total);
+	return total;
+}
+
+/**
+ * cr_vma_dump_pages - write out the pages listed on the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ *
+ * writes the addresses of all listed pages first, then the contents of all
+ * listed pages. returns @total, or a negative error from cr_kwrite()
+ */
+static int cr_vma_dump_pages(struct cr_ctx *ctx, int total)
+{
+	struct cr_pgarr *arr;
+	int err, i;
+
+	if (total == 0)
+		return 0;
+
+	/* pass 1: the address list of every page-array */
+	arr = ctx->pgarr;
+	while (arr) {
+		err = cr_kwrite(ctx, arr->addrs,
+				arr->nused * sizeof(*arr->addrs));
+		if (err < 0)
+			return err;
+		arr = arr->next;
+	}
+
+	/* pass 2: the contents, one page at a time via a kernel mapping */
+	for (arr = ctx->pgarr; arr; arr = arr->next) {
+		for (i = 0; i < arr->nused; i++) {
+			struct page *page = arr->pages[i];
+			void *kaddr = kmap(page);
+
+			err = cr_kwrite(ctx, kaddr, PAGE_SIZE);
+			kunmap(page);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return total;
+}
+
+/*
+ * cr_write_vma - write one vma to the checkpoint image
+ * @ctx - checkpoint context
+ * @vma - vma to write
+ *
+ * writes a cr_hdr_vma header, the backing file name (file-mapped vmas
+ * only), then the addresses and contents of the pages selected by
+ * cr_vma_scan_pages(). shared, IO, hugetlb and nonlinear vmas are refused
+ * with -ETXTBSY. returns a negative error, otherwise the value returned
+ * by cr_vma_dump_pages()
+ *
+ * NOTE(review): both the header (hh) and the file-name buffer passed to
+ * cr_get_fname() point at ctx->tbuf; presumably the name is built away
+ * from the start of the buffer so the two do not collide -- confirm.
+ */
+static int cr_write_vma(struct cr_ctx *ctx, struct vm_area_struct *vma)
+{
+ struct cr_hdr h;
+ struct cr_hdr_vma *hh = ctx->tbuf;
+ char *fname = NULL;
+ int how, nr, ret;
+
+ h.type = CR_HDR_VMA;
+ h.len = sizeof(*hh);
+ h.id = ctx->pid;
+
+ hh->vm_start = vma->vm_start;
+ hh->vm_end = vma->vm_end;
+ hh->vm_page_prot = vma->vm_page_prot.pgprot;
+ hh->vm_flags = vma->vm_flags;
+ hh->vm_pgoff = vma->vm_pgoff;
+
+ /* these kinds of mappings are not handled (yet): refuse them */
+ if (vma->vm_flags & (VM_SHARED | VM_IO | VM_HUGETLB | VM_NONLINEAR)) {
+ printk(KERN_WARNING "CR: unknown VMA %#lx\n", vma->vm_flags);
+ return -ETXTBSY;
+ }
+
+ /* by default assume anon memory */
+ how = CR_VMA_ANON;
+
+ /* if there is a backing file, assume private-mapped */
+ /* (NEED: check if the file is unlinked) */
+ if (vma->vm_file) {
+ nr = PAGE_SIZE;
+ fname = cr_get_fname(&vma->vm_file->f_path,
+ ctx->vfsroot, ctx->tbuf, &nr);
+ if (IS_ERR(fname))
+ return PTR_ERR(fname);
+ hh->namelen = nr;
+ how = CR_VMA_FILE;
+ } else
+ hh->namelen = 0;
+
+ hh->how = how;
+
+ /*
+ * it seems redundant now, but we do it in 3 steps because:
+ * first, the logic is simpler when we know how many pages before
+ * dumping them; second, a future optimization will defer the
+ * writeout (dump, and free) to a later step; in which case all
+ * the pages to be dumped will be aggregated on the checkpoint ctx
+ */
+
+ /* (1) scan: scan through the PTEs of the vma, both to count the
+ * pages to dump, and make those pages COW. keep the list of pages
+ * (and a reference to each page) on the checkpoint ctx */
+ nr = cr_vma_scan_pages(ctx, vma);
+ if (nr < 0) {
+ cr_put_fname(ctx->tbuf, fname, PAGE_SIZE);
+ return nr;
+ }
+
+ /* the header can only be written now that the page count is known */
+ hh->npages = nr;
+ ret = cr_write_obj(ctx, &h, hh);
+
+ if (!ret && hh->namelen)
+ ret = cr_write_str(ctx, fname, hh->namelen);
+
+ cr_put_fname(ctx->tbuf, fname, PAGE_SIZE);
+
+ if (ret < 0)
+ return ret;
+
+ /* (2) dump: write out the addresses of all pages in the list (on
+ * the checkpoint ctx) followed by the contents of all pages */
+ ret = cr_vma_dump_pages(ctx, nr);
+
+ /* (3) free: free the extra references to the pages in the list */
+ cr_pgarr_release(ctx);
+
+ return ret;
+}
+
+/**
+ * cr_write_mm - write the memory layout and contents of a task
+ * @ctx - checkpoint context
+ * @t - task whose mm is checkpointed
+ *
+ * writes a cr_hdr_mm header, then every vma of the mm (with mmap_sem held
+ * for reading), then the arch-specific mm context. returns a negative
+ * error on failure, otherwise the value returned by cr_write_mm_context()
+ */
+int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_mm *hh = ctx->tbuf;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int ret;
+
+	h.type = CR_HDR_MM;
+	h.len = sizeof(*hh);
+	h.id = ctx->pid;
+
+	/* get_task_mm() returns NULL for a task without an mm (e.g. a
+	 * kernel thread or an exiting task): nothing to checkpoint */
+	mm = get_task_mm(t);
+	if (!mm)
+		return -EINVAL;
+
+	hh->tag = 1;	/* non-zero will mean first time encounter */
+
+	hh->start_code = mm->start_code;
+	hh->end_code = mm->end_code;
+	hh->start_data = mm->start_data;
+	hh->end_data = mm->end_data;
+	hh->start_brk = mm->start_brk;
+	hh->brk = mm->brk;
+	hh->start_stack = mm->start_stack;
+	hh->arg_start = mm->arg_start;
+	hh->arg_end = mm->arg_end;
+	hh->env_start = mm->env_start;
+	hh->env_end = mm->env_end;
+
+	hh->map_count = mm->map_count;
+
+	/* FIX: need also mm->flags */
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret < 0)
+		goto out;
+
+	/* write the vma's */
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		ret = cr_write_vma(ctx, vma);
+		if (ret < 0)
+			break;
+	}
+	up_read(&mm->mmap_sem);
+
+	if (ret < 0)
+		goto out;
+
+	ret = cr_write_mm_context(ctx, mm);
+
+ out:
+	/* drop the reference taken by get_task_mm() */
+	mmput(mm);
+	return ret;
+}
diff -puN /dev/null ckpt/ckpt_mem.h
--- /dev/null 2007-04-11 11:48:27.000000000 -0700
+++ linux-2.6.git-dave/ckpt/ckpt_mem.h 2008-08-05 08:37:29.000000000 -0700
@@ -0,0 +1,32 @@
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/mm_types.h>
+
+/* page-array chains: each pgarr holds a list of <addr,page> tuples */
+struct cr_pgarr {
+ /* addresses of the listed pages; entry i pairs with pages[i] */
+ unsigned long *addrs;
+ /* the listed pages (referenced only during checkpoint) */
+ struct page **pages;
+ /* next page-array on the chain, or NULL */
+ struct cr_pgarr *next;
+ /* number of free entries left in addrs[]/pages[] */
+ unsigned short nleft;
+ /* number of entries in use */
+ unsigned short nused;
+};
+
+/* vma subtypes (stored in the 'how' field of struct cr_hdr_vma) */
+enum {
+ /* anonymous private mapping */
+ CR_VMA_ANON = 1,
+ /* private mapping from a file */
+ CR_VMA_FILE
+};
+
+extern void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr);
+extern void cr_pgarr_release(struct cr_ctx *ctx);
+extern void cr_pgarr_free(struct cr_ctx *ctx);
+extern struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew);
+extern struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx);
diff -puN ckpt/Makefile~memory_part ckpt/Makefile
--- linux-2.6.git/ckpt/Makefile~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/ckpt/Makefile 2008-08-05 08:37:29.000000000 -0700
@@ -1,2 +1,2 @@
-obj-y += sys.o checkpoint.o restart.o
+obj-y += sys.o checkpoint.o restart.o ckpt_mem.o rstr_mem.o
obj-$(CONFIG_X86) += x86.o
diff -puN /dev/null ckpt/rstr_mem.c
--- /dev/null 2007-04-11 11:48:27.000000000 -0700
+++ linux-2.6.git-dave/ckpt/rstr_mem.c 2008-08-05 08:37:29.000000000 -0700
@@ -0,0 +1,354 @@
+/*
+ * Restart memory contents
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <asm/unistd.h>
+
+#include <linux/sched.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/mm_types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <asm/cacheflush.h>
+
+#include "ckpt.h"
+#include "ckpt_arch.h"
+#include "ckpt_hdr.h"
+#include "ckpt_mem.h"
+
+/*
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read in directly to the address space of the current process
+ */
+
+/**
+ * cr_vma_read_pages_addr - read page addresses into the page-array chain
+ * @ctx - restart context
+ * @npages - number of pages
+ *
+ * returns 0 on success, -ENOMEM if no page-array is available, or a
+ * negative error from cr_kread()
+ */
+static int cr_vma_read_pages_addr(struct cr_ctx *ctx, int npages)
+{
+	int remaining = npages;
+
+	while (remaining) {
+		struct cr_pgarr *arr = cr_pgarr_prep(ctx);
+		int chunk, err;
+
+		if (!arr)
+			return -ENOMEM;
+
+		/* read as many addresses as this page-array can still take */
+		chunk = min(remaining, (int) arr->nleft);
+		err = cr_kread(ctx, arr->addrs, chunk * sizeof(unsigned long));
+		if (err < 0)
+			return err;
+
+		arr->nused += chunk;
+		arr->nleft -= chunk;
+		remaining -= chunk;
+	}
+
+	return 0;
+}
+
+/**
+ * cr_vma_read_pages_data - read page contents to the addresses listed on
+ * the page-array chain
+ * @ctx - restart context
+ * @npages - number of pages
+ *
+ * returns 0 on success or a negative error from cr_uread()
+ */
+static int cr_vma_read_pages_data(struct cr_ctx *ctx, int npages)
+{
+	struct cr_pgarr *arr = ctx->pgarr;
+	int left = npages;
+	int err, i;
+
+	while (left) {
+		/* one page of data per address recorded in this page-array */
+		for (i = 0; i < arr->nused; i++) {
+			err = cr_uread(ctx, (void *) arr->addrs[i], PAGE_SIZE);
+			if (err < 0)
+				return err;
+		}
+		left -= arr->nused;
+		arr = arr->next;
+	}
+
+	return 0;
+}
+
+/*
+ * set or clear VM_WRITE on the vma found at @start (useful when restoring
+ * the memory of a read-only vma). returns the result of mprotect_fixup(),
+ * or -EINVAL if no suitable vma is found or no flag change is needed
+ */
+static int cr_vma_writable(struct mm_struct *mm, unsigned long start,
+			   unsigned long end, int writable)
+{
+	struct vm_area_struct *vma, *prev;
+	unsigned long newflags = 0;
+	int err = -EINVAL;
+
+	CR_PRINTK("vma %#lx-%#lx writable %d\n", start, end, writable);
+
+	down_write(&mm->mmap_sem);
+
+	vma = find_vma_prev(mm, start, &prev);
+	if (likely(vma && vma->vm_start <= end && vma->vm_end >= start)) {
+		unsigned long has_write = vma->vm_flags & VM_WRITE;
+
+		if (writable && !has_write)
+			newflags = vma->vm_flags | VM_WRITE;
+		else if (!writable && has_write)
+			newflags = vma->vm_flags & ~VM_WRITE;
+
+		CR_PRINTK("flags %#lx\n", newflags);
+
+		/* the fixup is applied to the whole vma that was found */
+		if (newflags)
+			err = mprotect_fixup(vma, &prev, vma->vm_start,
+					     vma->vm_end, newflags);
+	}
+
+	up_write(&mm->mmap_sem);
+	return err;
+}
+
+/**
+ * cr_vma_read_pages - read in the pages needed to restore a vma
+ * @ctx - restart context
+ * @cr_vma - vma descriptor from restart
+ *
+ * a vma without VM_WRITE is made writable while its pages are read in and
+ * is switched back afterwards. returns 0 on success or a negative error
+ */
+static int cr_vma_read_pages(struct cr_ctx *ctx, struct cr_hdr_vma *cr_vma)
+{
+	struct mm_struct *mm = current->mm;
+	int readonly = !(cr_vma->vm_flags & VM_WRITE);
+	int err = 0;
+
+	if (cr_vma->npages == 0)
+		return 0;
+
+	/* in the unlikely case that this vma is read-only */
+	if (readonly)
+		err = cr_vma_writable(mm, cr_vma->vm_start,
+				      cr_vma->vm_end, 1);
+
+	/* addresses first, then the data; each step only if all is well */
+	if (err == 0)
+		err = cr_vma_read_pages_addr(ctx, cr_vma->npages);
+	if (err == 0)
+		err = cr_vma_read_pages_data(ctx, cr_vma->npages);
+	if (err < 0)
+		return err;
+
+	cr_pgarr_release(ctx);	/* reset page-array chain */
+
+	/* restore original protection for this vma */
+	if (readonly)
+		err = cr_vma_writable(mm, cr_vma->vm_start,
+				      cr_vma->vm_end, 0);
+
+	return err;
+}
+
+/**
+ * cr_calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ *
+ * NOTE(review): the last test checks the PROT_SEM bit value against
+ * vm_flags (a VM_* bitmask) -- confirm that this is the intended bit.
+ */
+static unsigned long cr_calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+	unsigned long prot;
+
+	prot  = (orig_vm_flags & VM_READ)  ? PROT_READ  : 0;
+	prot |= (orig_vm_flags & VM_WRITE) ? PROT_WRITE : 0;
+	prot |= (orig_vm_flags & VM_EXEC)  ? PROT_EXEC  : 0;
+	/* only (?) with IPC-SHM */
+	prot |= (orig_vm_flags & PROT_SEM) ? PROT_SEM   : 0;
+
+	return prot;
+}
+
+/**
+ * cr_calc_map_flags_bits - convert vm_flags to mmap flags
+ * orig_vm_flags: source vm_flags
+ *
+ * the result always includes MAP_FIXED and exactly one of MAP_SHARED
+ * (when VM_MAYSHARE is set) or MAP_PRIVATE
+ */
+static unsigned long cr_calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+	unsigned long map_flags = MAP_FIXED;
+
+	map_flags |= (orig_vm_flags & VM_MAYSHARE) ? MAP_SHARED : MAP_PRIVATE;
+
+	if (orig_vm_flags & VM_GROWSDOWN)
+		map_flags |= MAP_GROWSDOWN;
+	if (orig_vm_flags & VM_DENYWRITE)
+		map_flags |= MAP_DENYWRITE;
+	if (orig_vm_flags & VM_EXECUTABLE)
+		map_flags |= MAP_EXECUTABLE;
+
+	return map_flags;
+}
+
+/*
+ * cr_read_vma - restore one vma from the checkpoint image
+ * @ctx - restart context
+ * @mm - mm of the restarting task (the caller passes current->mm)
+ *
+ * reads a cr_hdr_vma (plus the file name for file-mapped vmas), recreates
+ * the mapping at its original address and then reads in the saved pages.
+ * returns 0 on success or a negative error
+ *
+ * NOTE(review): the error paths return without calling cr_hbuf_put();
+ * confirm that this is acceptable for the hbuf accounting.
+ */
+static int cr_read_vma(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr_vma *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	unsigned long vm_size, vm_flags, vm_prot, vm_pgoff;
+	unsigned long addr;
+	unsigned long flags;
+	struct file *file = NULL;
+	char *fname = NULL;
+	int ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_VMA);
+	if (ret < 0)
+		return ret;
+
+	CR_PRINTK("vma %#lx-%#lx npages %d namelen %d\n",
+		  (unsigned long) hh->vm_start, (unsigned long) hh->vm_end,
+		  (int) hh->npages, (int) hh->namelen);
+
+	if (hh->vm_end < hh->vm_start)
+		return -EINVAL;
+	if (hh->npages < 0 || hh->namelen < 0)
+		return -EINVAL;
+
+	vm_size = hh->vm_end - hh->vm_start;
+	vm_prot = cr_calc_map_prot_bits(hh->vm_flags);
+	vm_flags = cr_calc_map_flags_bits(hh->vm_flags);
+	vm_pgoff = hh->vm_pgoff;
+
+	if (hh->namelen) {
+		fname = ctx->tbuf;
+		ret = cr_read_str(ctx, fname, PAGE_SIZE);
+		if (ret < 0)
+			return ret;
+	}
+
+	CR_PRINTK("vma fname '%s' how %d\n", fname, hh->how);
+
+	switch (hh->how) {
+
+	case CR_VMA_ANON:	/* anonymous private mapping */
+		if (hh->namelen)
+			return -EINVAL;
+		/* vm_pgoff for anonymous mapping is the "global" page
+		   offset (namely from addr 0x0), so we force a zero */
+		vm_pgoff = 0;
+		break;
+
+	case CR_VMA_FILE:	/* private mapping from a file */
+		if (!hh->namelen)
+			return -EINVAL;
+		/* O_RDWR only needed if both (VM_WRITE|VM_SHARED) are set */
+		flags = hh->vm_flags & (VM_WRITE | VM_SHARED);
+		flags = (flags == (VM_WRITE | VM_SHARED) ? O_RDWR : O_RDONLY);
+		file = filp_open(fname, flags, 0);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+		break;
+
+	default:
+		return -EINVAL;
+
+	}
+
+	/* do_mmap_pgoff() modifies the vma list, so its caller must hold
+	 * mmap_sem for writing. drop it right away: cr_vma_read_pages()
+	 * below takes the semaphore itself */
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, (unsigned long) hh->vm_start,
+			     vm_size, vm_prot, vm_flags, vm_pgoff);
+	up_write(&mm->mmap_sem);
+	CR_PRINTK("vma size %#lx prot %#lx flags %#lx pgoff %#lx => %#lx\n",
+		  vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+
+	/* the file (if opened) is now referenced by the vma */
+	if (file)
+		filp_close(file, NULL);
+
+	if (IS_ERR((void*) addr))
+		return (PTR_ERR((void *) addr));
+
+	/*
+	 * CR_VMA_ANON: read in memory as is
+	 * CR_VMA_FILE: read in memory as is
+	 * (more to follow ...)
+	 */
+
+	switch (hh->how) {
+	case CR_VMA_ANON:
+	case CR_VMA_FILE:
+		/* standard case: read the data into the memory */
+		ret = cr_vma_read_pages(ctx, hh);
+		break;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	if (vm_prot & PROT_EXEC)
+		flush_icache_range(hh->vm_start, hh->vm_end);
+
+	cr_hbuf_put(ctx, sizeof(*hh));
+	CR_PRINTK("vma retval %d\n", ret);
+	return 0;
+}
+
+/*
+ * unmap every vma of @mm, one at a time. returns 0 on success or the
+ * first negative error returned by do_munmap()
+ */
+static int cr_destroy_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma, *next;
+	int err;
+
+	for (vma = mm->mmap; vma; vma = next) {
+		/* pick up the successor before this vma is unmapped */
+		next = vma->vm_next;
+		err = do_munmap(mm, vma->vm_start,
+				vma->vm_end - vma->vm_start);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * cr_read_mm - restore the memory layout and contents of the current task
+ * @ctx - restart context
+ *
+ * reads a cr_hdr_mm header, unmaps everything in current->mm, restores the
+ * mm layout fields from the header, reads back map_count vmas and finally
+ * the arch-specific mm context. returns a negative error on failure,
+ * otherwise the value returned by cr_read_mm_context()
+ *
+ * NOTE(review): the error paths return without calling cr_hbuf_put();
+ * confirm that this is acceptable for the hbuf accounting.
+ */
+int cr_read_mm(struct cr_ctx *ctx)
+{
+ struct cr_hdr_mm *hh = cr_hbuf_get(ctx, sizeof(*hh));
+ struct mm_struct *mm;
+ int nr, ret;
+
+ ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM);
+ if (ret < 0)
+ return ret;
+
+ CR_PRINTK("map_count %d\n", hh->map_count);
+
+ /* XXX need more sanity checks */
+ if (hh->start_code > hh->end_code ||
+ hh->start_data > hh->end_data || hh->map_count < 0)
+ return -EINVAL;
+
+ mm = current->mm;
+
+ /* point of no return -- destruct current mm */
+ down_write(&mm->mmap_sem);
+ ret = cr_destroy_mm(mm);
+ up_write(&mm->mmap_sem);
+
+ if (ret < 0)
+ return ret;
+
+ /* restore the layout fields saved by cr_write_mm() */
+ mm->start_code = hh->start_code;
+ mm->end_code = hh->end_code;
+ mm->start_data = hh->start_data;
+ mm->end_data = hh->end_data;
+ mm->start_brk = hh->start_brk;
+ mm->brk = hh->brk;
+ mm->start_stack = hh->start_stack;
+ mm->arg_start = hh->arg_start;
+ mm->arg_end = hh->arg_end;
+ mm->env_start = hh->env_start;
+ mm->env_end = hh->env_end;
+
+ /* FIX: need also mm->flags */
+
+ /* one vma record per mapping counted in the header */
+ for (nr = hh->map_count; nr; nr--) {
+ ret = cr_read_vma(ctx, mm);
+ if (ret < 0)
+ return ret;
+ }
+
+ cr_hbuf_put(ctx, sizeof(*hh));
+
+ return cr_read_mm_context(ctx, mm);
+}
diff -puN ckpt/sys.c~memory_part ckpt/sys.c
--- linux-2.6.git/ckpt/sys.c~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/ckpt/sys.c 2008-08-05 08:37:29.000000000 -0700
@@ -15,6 +15,7 @@
#include <linux/capability.h>
#include "ckpt.h"
+#include "ckpt_mem.h"
/*
* helpers to write/read to/from the image file descriptor
@@ -118,6 +119,8 @@ void cr_ctx_free(struct cr_ctx *ctx)
if (ctx->vfsroot)
path_put(ctx->vfsroot);
+ cr_pgarr_free(ctx);
+
free_pages((unsigned long) ctx->tbuf, CR_ORDER_TBUF);
free_pages((unsigned long) ctx->hbuf, CR_ORDER_HBUF);
diff -puN ckpt/x86.c~memory_part ckpt/x86.c
--- linux-2.6.git/ckpt/x86.c~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/ckpt/x86.c 2008-08-05 08:37:29.000000000 -0700
@@ -1,5 +1,6 @@
#include <asm/ckpt.h>
#include <asm/desc.h>
+#include <asm/ldt.h>
#include <asm/i387.h>
#include "ckpt.h"
@@ -267,3 +268,85 @@ int cr_read_cpu(struct cr_ctx *ctx)
return 0;
}
+
+/*
+ * cr_write_mm_context - write the x86-specific mm context (the LDT)
+ * @ctx - checkpoint context
+ * @mm - mm whose context is written
+ *
+ * writes a cr_hdr_mm_context header followed by the raw LDT entries, all
+ * with mm->context.lock held. returns the result of the last write
+ * (negative on error)
+ */
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr h;
+	struct cr_hdr_mm_context *hh = ctx->tbuf;
+	int ret;
+
+	h.type = CR_HDR_MM_CONTEXT;
+	h.len = sizeof(*hh);
+	h.id = ctx->pid;
+
+	mutex_lock(&mm->context.lock);
+
+	hh->ldt_entry_size = LDT_ENTRY_SIZE;
+	hh->nldt = mm->context.size;
+
+	CR_PRINTK("nldt %d\n", hh->nldt);
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret < 0)
+		goto out;	/* must not return with the lock held */
+
+	ret = cr_kwrite(ctx, mm->context.ldt, hh->nldt * LDT_ENTRY_SIZE);
+
+ out:
+	mutex_unlock(&mm->context.lock);
+
+	return ret;
+}
+
+/*
+ * cr_read_mm_context - restore the x86-specific mm context (the LDT)
+ * @ctx - restart context
+ * @mm - mm whose context is restored
+ *
+ * reads a cr_hdr_mm_context header, then installs each saved LDT entry
+ * through write_ldt() and reloads the LDT. returns 0 on success or a
+ * negative error
+ *
+ * NOTE(review): the error paths return without calling cr_hbuf_put();
+ * confirm that this is acceptable for the hbuf accounting.
+ */
+int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr_mm_context *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int n, ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM_CONTEXT);
+	if (ret < 0)
+		return ret;
+
+	CR_PRINTK("nldt %d\n", hh->nldt);
+
+	if (hh->nldt < 0 || hh->ldt_entry_size != LDT_ENTRY_SIZE)
+		return -EINVAL;
+
+	/* to utilize the syscall modify_ldt() we first convert the data
+	 * in the checkpoint image from 'struct desc_struct' to 'struct
+	 * user_desc' with reverse logic of include/asm/desc.h:fill_ldt() */
+
+	for (n = 0; n < hh->nldt; n++) {
+		/* zero-fill so any field not assigned below is defined */
+		struct user_desc info = { 0 };
+		struct desc_struct desc;
+		mm_segment_t old_fs;
+
+		ret = cr_kread(ctx, &desc, LDT_ENTRY_SIZE);
+		if (ret < 0)
+			return ret;
+
+		info.entry_number = n;
+		/* the base is split in three: bits 0-15, 16-23 and 24-31 */
+		info.base_addr = desc.base0 |
+				 ((unsigned int) desc.base1 << 16) |
+				 ((unsigned int) desc.base2 << 24);
+		/* the limit is split in two: bits 0-15 and 16-19 */
+		info.limit = desc.limit0 | ((unsigned int) desc.limit << 16);
+		info.seg_32bit = desc.d;
+		info.contents = (desc.type >> 2) & 3;
+		info.read_exec_only = ((desc.type >> 1) & 1) ^ 1;
+		info.limit_in_pages = desc.g;
+		info.seg_not_present = desc.p ^ 1;
+		info.useable = desc.avl;
+
+		/* write_ldt() expects a user pointer: lift the address
+		 * limit so that it accepts our kernel buffer */
+		old_fs = get_fs();
+		set_fs(get_ds());
+		/* ret = sys_modify_ldt(1, &info, sizeof(info)); */
+		/* modified by daveh */
+		ret = write_ldt(&info, sizeof(info), 1);
+		set_fs(old_fs);
+
+		if (ret < 0)
+			return ret;
+	}
+
+	load_LDT(&mm->context);
+
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return 0;
+}
diff -puN include/asm-x86/ckpt.h~memory_part include/asm-x86/ckpt.h
--- linux-2.6.git/include/asm-x86/ckpt.h~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/include/asm-x86/ckpt.h 2008-08-05 08:37:29.000000000 -0700
@@ -43,4 +43,9 @@ struct cr_hdr_cpu {
union thread_xstate xstate; /* i387 */
};
+/* image header for the x86 mm context; the raw LDT entries follow it */
+struct cr_hdr_mm_context {
+ /* size in bytes of one saved LDT entry (must match LDT_ENTRY_SIZE) */
+ __s16 ldt_entry_size;
+ /* number of LDT entries that follow the header */
+ __s16 nldt;
+};
+
#endif /* __ASM_X86_CKPT_H */
diff -puN include/asm-x86/desc.h~memory_part include/asm-x86/desc.h
--- linux-2.6.git/include/asm-x86/desc.h~memory_part 2008-08-05 08:37:29.000000000 -0700
+++ linux-2.6.git-dave/include/asm-x86/desc.h 2008-08-05 08:40:11.000000000 -0700
@@ -111,6 +111,8 @@ static inline void native_write_ldt_entr
memcpy(&ldt[entry], desc, 8);
}
+int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode);
+
static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
const void *desc, int type)
{
@@ -394,7 +396,6 @@ static inline void set_system_gate_ist(i
shll $16, base; \
movw idx * 8 + 2(gdt), lo_w;
-
#endif /* __ASSEMBLY__ */
#endif
_
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list