[CRIU] [PATCH] restore: Move memory restoring code into mem.c
Pavel Emelyanov
xemul at virtuozzo.com
Fri May 20 10:30:11 PDT 2016
All the dumping code sits in there, so it's time to move the restoring
one there too :)
Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
---
criu/cr-restore.c | 433 +----------------------------------------------------
criu/include/mem.h | 3 +
criu/mem.c | 429 ++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 435 insertions(+), 430 deletions(-)
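The interesting part of the moved code is how COW memory is inherited between restored tasks: a child mremap()s the parent's premapped copy of an identical private VMA, then, while reading pages from the image, writes only those pages that differ from the inherited ones, so matching pages stay COW-shared. Below is a minimal standalone sketch of that compare-and-write step (illustrative only, not part of the patch; the real code also tracks pages in per-vma bitmaps and madvise()s away pages the child turned out not to share):

	#include <string.h>	/* memcmp, memcpy */

	#define PAGE_SIZE 4096	/* assumed here for the sketch */

	/*
	 * 'inherited' points into memory mremap()ed from the parent's premap
	 * area, 'img' holds one page just read from the image. Returns 1 if
	 * the page had to be written (COW broken), 0 if it stayed shared.
	 */
	static int restore_one_page(void *inherited, const void *img)
	{
		if (memcmp(inherited, img, PAGE_SIZE) == 0)
			return 0;	/* contents match -- keep the page COW-shared */

		memcpy(inherited, img, PAGE_SIZE);	/* differs -- break COW */
		return 1;
	}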
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 5513e1c..8e7f18e 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -76,7 +76,6 @@
#include "aio.h"
#include "lsm.h"
#include "seccomp.h"
-#include "bitmap.h"
#include "fault-injection.h"
#include "uffd.h"
#include "sk-queue.h"
@@ -280,432 +279,6 @@ err:
return ret;
}
-/* Map a private vma, if it is not mapped by a parent yet */
-static int map_private_vma(struct vma_area *vma, void **tgt_addr,
- struct vma_area **pvma, struct list_head *pvma_list)
-{
- int ret;
- void *addr, *paddr = NULL;
- unsigned long nr_pages, size;
- struct vma_area *p = *pvma;
-
- if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
- ret = vma->vm_open(current->pid.virt, vma);
- if (ret < 0) {
- pr_err("Can't fixup VMA's fd\n");
- return -1;
- }
-
- vma->vm_open = NULL; /* prevent a 2nd open in open_vmas */
- }
-
- nr_pages = vma_entry_len(vma->e) / PAGE_SIZE;
- vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long));
- if (vma->page_bitmap == NULL)
- return -1;
-
- list_for_each_entry_from(p, pvma_list, list) {
- if (p->e->start > vma->e->start)
- break;
-
- if (!vma_area_is_private(p, kdat.task_size))
- continue;
-
- if (p->e->end != vma->e->end ||
- p->e->start != vma->e->start)
- continue;
-
- /* Check flags, which must be identical for both vma-s */
- if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
- break;
-
- if (!(vma->e->flags & MAP_ANONYMOUS) &&
- vma->e->shmid != p->e->shmid)
- break;
-
- pr_info("COW 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
- vma->e->start, vma->e->end, vma->e->pgoff);
- paddr = decode_pointer(p->premmaped_addr);
-
- break;
- }
-
- /*
- * A grow-down VMA has a guard page, which protects the VMA below it.
- * So one more page is mapped here to restore the content of the first page.
- */
- if (vma->e->flags & MAP_GROWSDOWN) {
- vma->e->start -= PAGE_SIZE;
- if (paddr)
- paddr -= PAGE_SIZE;
- }
-
- size = vma_entry_len(vma->e);
- if (paddr == NULL) {
- int flag = 0;
- /*
- * The respective memory area was NOT found in the parent.
- * Map a new one.
- */
- pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
- vma->e->start, vma->e->end, vma->e->pgoff);
-
- /*
- * Restore AIO ring buffer content to temporary anonymous area.
- * This will be placed in io_setup'ed AIO in restore_aio_ring().
- */
- if (vma_entry_is(vma->e, VMA_AREA_AIORING))
- flag |= MAP_ANONYMOUS;
-
- addr = mmap(*tgt_addr, size,
- vma->e->prot | PROT_WRITE,
- vma->e->flags | MAP_FIXED | flag,
- vma->e->fd, vma->e->pgoff);
-
- if (addr == MAP_FAILED) {
- pr_perror("Unable to map ANON_VMA");
- return -1;
- }
-
- *pvma = p;
- } else {
- /*
- * This region was found in parent -- remap it to inherit physical
- * pages (if any) from it (and COW them later if required).
- */
- vma->ppage_bitmap = p->page_bitmap;
-
- addr = mremap(paddr, size, size,
- MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
- if (addr != *tgt_addr) {
- pr_perror("Unable to remap a private vma");
- return -1;
- }
-
- *pvma = list_entry(p->list.next, struct vma_area, list);
- }
-
- vma->premmaped_addr = (unsigned long) addr;
- pr_debug("\tpremap 0x%016"PRIx64"-0x%016"PRIx64" -> %016lx\n",
- vma->e->start, vma->e->end, (unsigned long)addr);
-
- if (vma->e->flags & MAP_GROWSDOWN) { /* Skip guard page */
- vma->e->start += PAGE_SIZE;
- vma->premmaped_addr += PAGE_SIZE;
- }
-
- if (vma_area_is(vma, VMA_FILE_PRIVATE))
- close(vma->e->fd);
-
- *tgt_addr += size;
- return 0;
-}
-
-static int premap_priv_vmas(struct vm_area_list *vmas, void *at)
-{
- struct list_head *parent_vmas;
- struct vma_area *pvma, *vma;
- unsigned long pstart = 0;
- int ret = 0;
- LIST_HEAD(empty);
-
- /*
- * Keep parent vmas at hand to check whether we can "inherit" them.
- * See comments in map_private_vma.
- */
- if (current->parent)
- parent_vmas = &rsti(current->parent)->vmas.h;
- else
- parent_vmas = &empty;
-
- pvma = list_first_entry(parent_vmas, struct vma_area, list);
-
- list_for_each_entry(vma, &vmas->h, list) {
- if (pstart > vma->e->start) {
- ret = -1;
- pr_err("VMA-s are not sorted in the image file\n");
- break;
- }
- pstart = vma->e->start;
-
- if (!vma_area_is_private(vma, kdat.task_size))
- continue;
-
- ret = map_private_vma(vma, &at, &pvma, parent_vmas);
- if (ret < 0)
- break;
- }
-
- return ret;
-}
-
-static int restore_priv_vma_content(void)
-{
- struct vma_area *vma;
- int ret = 0;
- struct list_head *vmas = &rsti(current)->vmas.h;
-
- unsigned int nr_restored = 0;
- unsigned int nr_shared = 0;
- unsigned int nr_droped = 0;
- unsigned int nr_compared = 0;
- unsigned int nr_lazy = 0;
- unsigned long va;
- struct page_read pr;
-
- vma = list_first_entry(vmas, struct vma_area, list);
-
- ret = open_page_read(current->pid.virt, &pr, PR_TASK);
- if (ret <= 0)
- return -1;
-
- /*
- * Read page contents.
- */
- while (1) {
- unsigned long off, i, nr_pages;
- struct iovec iov;
-
- ret = pr.get_pagemap(&pr, &iov);
- if (ret <= 0)
- break;
-
- va = (unsigned long)iov.iov_base;
- nr_pages = iov.iov_len / PAGE_SIZE;
-
- for (i = 0; i < nr_pages; i++) {
- unsigned char buf[PAGE_SIZE];
- void *p;
-
- /*
- * The lookup is over *all* possible VMAs
- * read from the image file.
- */
- while (va >= vma->e->end) {
- if (vma->list.next == vmas)
- goto err_addr;
- vma = list_entry(vma->list.next, struct vma_area, list);
- }
-
- /*
- * Make sure the page address is inside an existing VMA
- * and that the VMA it refers to is still a private one,
- * since there is no guarantee that the data from the
- * pagemap is valid.
- */
- if (va < vma->e->start)
- goto err_addr;
- else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) {
- pr_err("Trying to restore page for non-private VMA\n");
- goto err_addr;
- }
-
- off = (va - vma->e->start) / PAGE_SIZE;
- p = decode_pointer((off) * PAGE_SIZE +
- vma->premmaped_addr);
-
- /*
- * This means that userfaultfd is used to load the pages
- * on demand.
- */
- if (opts.lazy_pages && vma_entry_can_be_lazy(vma->e)) {
- pr_debug("Lazy restore skips 0x%016"PRIx64"\n", vma->e->start);
- pr.skip_pages(&pr, PAGE_SIZE);
- nr_lazy++;
- continue;
- }
-
- set_bit(off, vma->page_bitmap);
- if (vma->ppage_bitmap) { /* inherited vma */
- clear_bit(off, vma->ppage_bitmap);
-
- ret = pr.read_pages(&pr, va, 1, buf);
- if (ret < 0)
- goto err_read;
-
- va += PAGE_SIZE;
- nr_compared++;
-
- if (memcmp(p, buf, PAGE_SIZE) == 0) {
- nr_shared++; /* the page is COW-ed */
- continue;
- }
-
- nr_restored++;
- memcpy(p, buf, PAGE_SIZE);
- } else {
- int nr;
-
- /*
- * Try to read as many pages as possible at once.
- *
- * Within the current pagemap we still have
- * nr_pages - i pages (not all, as we might have
- * switched VMA above), within the current VMA
- * we have at most (vma->end - current_addr) bytes.
- */
-
- nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE);
-
- ret = pr.read_pages(&pr, va, nr, p);
- if (ret < 0)
- goto err_read;
-
- va += nr * PAGE_SIZE;
- nr_restored += nr;
- i += nr - 1;
-
- bitmap_set(vma->page_bitmap, off + 1, nr - 1);
- }
-
- }
-
- if (pr.put_pagemap)
- pr.put_pagemap(&pr);
- }
-
-err_read:
- pr.close(&pr);
- if (ret < 0)
- return ret;
-
- /* Remove pages, which were not shared with a child */
- list_for_each_entry(vma, vmas, list) {
- unsigned long size, i = 0;
- void *addr = decode_pointer(vma->premmaped_addr);
-
- if (vma->ppage_bitmap == NULL)
- continue;
-
- size = vma_entry_len(vma->e) / PAGE_SIZE;
- while (1) {
- /* Find all pages, which are not shared with this child */
- i = find_next_bit(vma->ppage_bitmap, size, i);
-
- if (i >= size)
- break;
-
- ret = madvise(addr + PAGE_SIZE * i,
- PAGE_SIZE, MADV_DONTNEED);
- if (ret < 0) {
- pr_perror("madvise failed");
- return -1;
- }
- i++;
- nr_droped++;
- }
- }
-
- cnt_add(CNT_PAGES_COMPARED, nr_compared);
- cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared);
- cnt_add(CNT_PAGES_RESTORED, nr_restored);
-
- pr_info("nr_restored_pages: %d\n", nr_restored);
- pr_info("nr_shared_pages: %d\n", nr_shared);
- pr_info("nr_droped_pages: %d\n", nr_droped);
- pr_info("nr_lazy: %d\n", nr_lazy);
-
- return 0;
-
-err_addr:
- pr_err("Page entry address %lx outside of VMA %lx-%lx\n",
- va, (long)vma->e->start, (long)vma->e->end);
- return -1;
-}
-
-static int prepare_mappings(void)
-{
- int ret = 0;
- void *addr;
- struct vm_area_list *vmas;
-
- void *old_premmapped_addr = NULL;
- unsigned long old_premmapped_len;
-
- vmas = &rsti(current)->vmas;
- if (vmas->nr == 0) /* Zombie */
- goto out;
-
- /* Reserve a place for mapping private vma-s one by one */
- addr = mmap(NULL, vmas->priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
- if (addr == MAP_FAILED) {
- ret = -1;
- pr_perror("Unable to reserve memory (%lu bytes)", vmas->priv_size);
- goto out;
- }
-
- old_premmapped_addr = rsti(current)->premmapped_addr;
- old_premmapped_len = rsti(current)->premmapped_len;
- rsti(current)->premmapped_addr = addr;
- rsti(current)->premmapped_len = vmas->priv_size;
-
- ret = premap_priv_vmas(vmas, addr);
- if (ret < 0)
- goto out;
-
- ret = restore_priv_vma_content();
- if (ret < 0)
- goto out;
-
- if (old_premmapped_addr) {
- ret = munmap(old_premmapped_addr, old_premmapped_len);
- if (ret < 0)
- pr_perror("Unable to unmap %p(%lx)",
- old_premmapped_addr, old_premmapped_len);
- }
-
-out:
- return ret;
-}
-
-/*
- * A guard page must be unmapped after restoring content and
- * forking children to restore COW memory.
- */
-static int unmap_guard_pages()
-{
- struct vma_area *vma;
- struct list_head *vmas = &rsti(current)->vmas.h;
-
- list_for_each_entry(vma, vmas, list) {
- if (!vma_area_is_private(vma, kdat.task_size))
- continue;
-
- if (vma->e->flags & MAP_GROWSDOWN) {
- void *addr = decode_pointer(vma->premmaped_addr);
-
- if (munmap(addr - PAGE_SIZE, PAGE_SIZE)) {
- pr_perror("Can't unmap guard page");
- return -1;
- }
- }
- }
-
- return 0;
-}
-
-static int open_vmas(int pid)
-{
- struct vma_area *vma;
- struct list_head *vmas = &rsti(current)->vmas.h;
-
- list_for_each_entry(vma, vmas, list) {
- if (!(vma_area_is(vma, VMA_AREA_REGULAR)))
- continue;
-
- pr_info("Opening 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" (%x) vma\n",
- vma->e->start, vma->e->end,
- vma->e->pgoff, vma->e->status);
-
- if (vma->vm_open && vma->vm_open(pid, vma)) {
- pr_err("`- Can't open vma\n");
- return -1;
- }
- }
-
- return 0;
-}
-
static rt_sigaction_t sigchld_act;
static rt_sigaction_t parent_act[SIGMAX];
@@ -920,7 +493,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
if (prepare_file_locks(pid))
return -1;
- if (open_vmas(pid))
+ if (open_vmas(current))
return -1;
if (fixup_sysv_shmems())
@@ -1683,7 +1256,7 @@ static int restore_task_with_children(void *_arg)
if (restore_task_mnt_ns(current))
goto err;
- if (prepare_mappings())
+ if (prepare_mappings(current))
goto err;
if (prepare_sigactions() < 0)
@@ -1698,7 +1271,7 @@ static int restore_task_with_children(void *_arg)
goto err;
- if (unmap_guard_pages())
+ if (unmap_guard_pages(current))
goto err;
restore_pgid();
diff --git a/criu/include/mem.h b/criu/include/mem.h
index 5269cad..1bf3195 100644
--- a/criu/include/mem.h
+++ b/criu/include/mem.h
@@ -24,4 +24,7 @@ extern int parasite_dump_pages_seized(struct parasite_ctl *ctl,
#define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1)
#define PME_PFRAME(x) ((x) & PME_PFRAME_MASK)
+int open_vmas(struct pstree_item *t);
+int unmap_guard_pages(struct pstree_item *t);
+int prepare_mappings(struct pstree_item *t);
#endif /* __CR_MEM_H__ */
diff --git a/criu/mem.c b/criu/mem.c
index e05dbc1..6ca5da1 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -18,6 +18,7 @@
#include "shmem.h"
#include "pstree.h"
#include "restorer.h"
+#include "bitmap.h"
#include "sk-packet.h"
#include "files-reg.h"
#include "pagemap-cache.h"
@@ -483,3 +484,431 @@ int prepare_mm_pid(struct pstree_item *i)
return ret;
}
+/* Map a private vma, if it is not mapped by a parent yet */
+static int map_private_vma(struct pstree_item *t,
+ struct vma_area *vma, void **tgt_addr,
+ struct vma_area **pvma, struct list_head *pvma_list)
+{
+ int ret;
+ void *addr, *paddr = NULL;
+ unsigned long nr_pages, size;
+ struct vma_area *p = *pvma;
+
+ if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
+ ret = vma->vm_open(t->pid.virt, vma);
+ if (ret < 0) {
+ pr_err("Can't fixup VMA's fd\n");
+ return -1;
+ }
+
+ vma->vm_open = NULL; /* prevent a 2nd open in open_vmas */
+ }
+
+ nr_pages = vma_entry_len(vma->e) / PAGE_SIZE;
+ vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long));
+ if (vma->page_bitmap == NULL)
+ return -1;
+
+ list_for_each_entry_from(p, pvma_list, list) {
+ if (p->e->start > vma->e->start)
+ break;
+
+ if (!vma_area_is_private(p, kdat.task_size))
+ continue;
+
+ if (p->e->end != vma->e->end ||
+ p->e->start != vma->e->start)
+ continue;
+
+ /* Check flags, which must be identical for both vma-s */
+ if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
+ break;
+
+ if (!(vma->e->flags & MAP_ANONYMOUS) &&
+ vma->e->shmid != p->e->shmid)
+ break;
+
+ pr_info("COW 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
+ vma->e->start, vma->e->end, vma->e->pgoff);
+ paddr = decode_pointer(p->premmaped_addr);
+
+ break;
+ }
+
+ /*
+ * A grow-down VMA has a guard page, which protects the VMA below it.
+ * So one more page is mapped here to restore the content of the first page.
+ */
+ if (vma->e->flags & MAP_GROWSDOWN) {
+ vma->e->start -= PAGE_SIZE;
+ if (paddr)
+ paddr -= PAGE_SIZE;
+ }
+
+ size = vma_entry_len(vma->e);
+ if (paddr == NULL) {
+ int flag = 0;
+ /*
+ * The respective memory area was NOT found in the parent.
+ * Map a new one.
+ */
+ pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
+ vma->e->start, vma->e->end, vma->e->pgoff);
+
+ /*
+ * Restore AIO ring buffer content to temporary anonymous area.
+ * This will be placed in io_setup'ed AIO in restore_aio_ring().
+ */
+ if (vma_entry_is(vma->e, VMA_AREA_AIORING))
+ flag |= MAP_ANONYMOUS;
+
+ addr = mmap(*tgt_addr, size,
+ vma->e->prot | PROT_WRITE,
+ vma->e->flags | MAP_FIXED | flag,
+ vma->e->fd, vma->e->pgoff);
+
+ if (addr == MAP_FAILED) {
+ pr_perror("Unable to map ANON_VMA");
+ return -1;
+ }
+
+ *pvma = p;
+ } else {
+ /*
+ * This region was found in parent -- remap it to inherit physical
+ * pages (if any) from it (and COW them later if required).
+ */
+ vma->ppage_bitmap = p->page_bitmap;
+
+ addr = mremap(paddr, size, size,
+ MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
+ if (addr != *tgt_addr) {
+ pr_perror("Unable to remap a private vma");
+ return -1;
+ }
+
+ *pvma = list_entry(p->list.next, struct vma_area, list);
+ }
+
+ vma->premmaped_addr = (unsigned long) addr;
+ pr_debug("\tpremap 0x%016"PRIx64"-0x%016"PRIx64" -> %016lx\n",
+ vma->e->start, vma->e->end, (unsigned long)addr);
+
+ if (vma->e->flags & MAP_GROWSDOWN) { /* Skip guard page */
+ vma->e->start += PAGE_SIZE;
+ vma->premmaped_addr += PAGE_SIZE;
+ }
+
+ if (vma_area_is(vma, VMA_FILE_PRIVATE))
+ close(vma->e->fd);
+
+ *tgt_addr += size;
+ return 0;
+}
+
+static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, void *at)
+{
+ struct list_head *parent_vmas;
+ struct vma_area *pvma, *vma;
+ unsigned long pstart = 0;
+ int ret = 0;
+ LIST_HEAD(empty);
+
+ /*
+ * Keep parent vmas at hand to check whether we can "inherit" them.
+ * See comments in map_private_vma.
+ */
+ if (t->parent)
+ parent_vmas = &rsti(t->parent)->vmas.h;
+ else
+ parent_vmas = &empty;
+
+ pvma = list_first_entry(parent_vmas, struct vma_area, list);
+
+ list_for_each_entry(vma, &vmas->h, list) {
+ if (pstart > vma->e->start) {
+ ret = -1;
+ pr_err("VMA-s are not sorted in the image file\n");
+ break;
+ }
+ pstart = vma->e->start;
+
+ if (!vma_area_is_private(vma, kdat.task_size))
+ continue;
+
+ ret = map_private_vma(t, vma, &at, &pvma, parent_vmas);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
+static int restore_priv_vma_content(struct pstree_item *t)
+{
+ struct vma_area *vma;
+ int ret = 0;
+ struct list_head *vmas = &rsti(t)->vmas.h;
+
+ unsigned int nr_restored = 0;
+ unsigned int nr_shared = 0;
+ unsigned int nr_droped = 0;
+ unsigned int nr_compared = 0;
+ unsigned int nr_lazy = 0;
+ unsigned long va;
+ struct page_read pr;
+
+ vma = list_first_entry(vmas, struct vma_area, list);
+
+ ret = open_page_read(t->pid.virt, &pr, PR_TASK);
+ if (ret <= 0)
+ return -1;
+
+ /*
+ * Read page contents.
+ */
+ while (1) {
+ unsigned long off, i, nr_pages;
+ struct iovec iov;
+
+ ret = pr.get_pagemap(&pr, &iov);
+ if (ret <= 0)
+ break;
+
+ va = (unsigned long)iov.iov_base;
+ nr_pages = iov.iov_len / PAGE_SIZE;
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned char buf[PAGE_SIZE];
+ void *p;
+
+ /*
+ * The lookup is over *all* possible VMAs
+ * read from the image file.
+ */
+ while (va >= vma->e->end) {
+ if (vma->list.next == vmas)
+ goto err_addr;
+ vma = list_entry(vma->list.next, struct vma_area, list);
+ }
+
+ /*
+ * Make sure the page address is inside an existing VMA
+ * and that the VMA it refers to is still a private one,
+ * since there is no guarantee that the data from the
+ * pagemap is valid.
+ */
+ if (va < vma->e->start)
+ goto err_addr;
+ else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) {
+ pr_err("Trying to restore page for non-private VMA\n");
+ goto err_addr;
+ }
+
+ off = (va - vma->e->start) / PAGE_SIZE;
+ p = decode_pointer((off) * PAGE_SIZE +
+ vma->premmaped_addr);
+
+ /*
+ * This means that userfaultfd is used to load the pages
+ * on demand.
+ */
+ if (opts.lazy_pages && vma_entry_can_be_lazy(vma->e)) {
+ pr_debug("Lazy restore skips 0x%016"PRIx64"\n", vma->e->start);
+ pr.skip_pages(&pr, PAGE_SIZE);
+ nr_lazy++;
+ continue;
+ }
+
+ set_bit(off, vma->page_bitmap);
+ if (vma->ppage_bitmap) { /* inherited vma */
+ clear_bit(off, vma->ppage_bitmap);
+
+ ret = pr.read_pages(&pr, va, 1, buf);
+ if (ret < 0)
+ goto err_read;
+
+ va += PAGE_SIZE;
+ nr_compared++;
+
+ if (memcmp(p, buf, PAGE_SIZE) == 0) {
+ nr_shared++; /* the page is COW-ed */
+ continue;
+ }
+
+ nr_restored++;
+ memcpy(p, buf, PAGE_SIZE);
+ } else {
+ int nr;
+
+ /*
+ * Try to read as many pages as possible at once.
+ *
+ * Within the current pagemap we still have
+ * nr_pages - i pages (not all, as we might have
+ * switched VMA above), within the current VMA
+ * we have at most (vma->end - current_addr) bytes.
+ */
+
+ nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE);
+
+ ret = pr.read_pages(&pr, va, nr, p);
+ if (ret < 0)
+ goto err_read;
+
+ va += nr * PAGE_SIZE;
+ nr_restored += nr;
+ i += nr - 1;
+
+ bitmap_set(vma->page_bitmap, off + 1, nr - 1);
+ }
+
+ }
+
+ if (pr.put_pagemap)
+ pr.put_pagemap(&pr);
+ }
+
+err_read:
+ pr.close(&pr);
+ if (ret < 0)
+ return ret;
+
+ /* Remove pages, which were not shared with a child */
+ list_for_each_entry(vma, vmas, list) {
+ unsigned long size, i = 0;
+ void *addr = decode_pointer(vma->premmaped_addr);
+
+ if (vma->ppage_bitmap == NULL)
+ continue;
+
+ size = vma_entry_len(vma->e) / PAGE_SIZE;
+ while (1) {
+ /* Find all pages, which are not shared with this child */
+ i = find_next_bit(vma->ppage_bitmap, size, i);
+
+ if (i >= size)
+ break;
+
+ ret = madvise(addr + PAGE_SIZE * i,
+ PAGE_SIZE, MADV_DONTNEED);
+ if (ret < 0) {
+ pr_perror("madvise failed");
+ return -1;
+ }
+ i++;
+ nr_droped++;
+ }
+ }
+
+ cnt_add(CNT_PAGES_COMPARED, nr_compared);
+ cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared);
+ cnt_add(CNT_PAGES_RESTORED, nr_restored);
+
+ pr_info("nr_restored_pages: %d\n", nr_restored);
+ pr_info("nr_shared_pages: %d\n", nr_shared);
+ pr_info("nr_droped_pages: %d\n", nr_droped);
+ pr_info("nr_lazy: %d\n", nr_lazy);
+
+ return 0;
+
+err_addr:
+ pr_err("Page entry address %lx outside of VMA %lx-%lx\n",
+ va, (long)vma->e->start, (long)vma->e->end);
+ return -1;
+}
+
+int prepare_mappings(struct pstree_item *t)
+{
+ int ret = 0;
+ void *addr;
+ struct vm_area_list *vmas;
+
+ void *old_premmapped_addr = NULL;
+ unsigned long old_premmapped_len;
+
+ vmas = &rsti(t)->vmas;
+ if (vmas->nr == 0) /* Zombie */
+ goto out;
+
+ /* Reserve a place for mapping private vma-s one by one */
+ addr = mmap(NULL, vmas->priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+ if (addr == MAP_FAILED) {
+ ret = -1;
+ pr_perror("Unable to reserve memory (%lu bytes)", vmas->priv_size);
+ goto out;
+ }
+
+ old_premmapped_addr = rsti(t)->premmapped_addr;
+ old_premmapped_len = rsti(t)->premmapped_len;
+ rsti(t)->premmapped_addr = addr;
+ rsti(t)->premmapped_len = vmas->priv_size;
+
+ ret = premap_priv_vmas(t, vmas, addr);
+ if (ret < 0)
+ goto out;
+
+ ret = restore_priv_vma_content(t);
+ if (ret < 0)
+ goto out;
+
+ if (old_premmapped_addr) {
+ ret = munmap(old_premmapped_addr, old_premmapped_len);
+ if (ret < 0)
+ pr_perror("Unable to unmap %p(%lx)",
+ old_premmapped_addr, old_premmapped_len);
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * A guard page must be unmapped after restoring content and
+ * forking children to restore COW memory.
+ */
+int unmap_guard_pages(struct pstree_item *t)
+{
+ struct vma_area *vma;
+ struct list_head *vmas = &rsti(t)->vmas.h;
+
+ list_for_each_entry(vma, vmas, list) {
+ if (!vma_area_is_private(vma, kdat.task_size))
+ continue;
+
+ if (vma->e->flags & MAP_GROWSDOWN) {
+ void *addr = decode_pointer(vma->premmaped_addr);
+
+ if (munmap(addr - PAGE_SIZE, PAGE_SIZE)) {
+ pr_perror("Can't unmap guard page");
+ return -1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int open_vmas(struct pstree_item *t)
+{
+ int pid = t->pid.virt;
+ struct vma_area *vma;
+ struct list_head *vmas = &rsti(t)->vmas.h;
+
+ list_for_each_entry(vma, vmas, list) {
+ if (!(vma_area_is(vma, VMA_AREA_REGULAR)))
+ continue;
+
+ pr_info("Opening 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" (%x) vma\n",
+ vma->e->start, vma->e->end,
+ vma->e->pgoff, vma->e->status);
+
+ if (vma->vm_open && vma->vm_open(pid, vma)) {
+ pr_err("`- Can't open vma\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
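For completeness, here is the calling order of the relocated helpers on the restore path, condensed from the call sites patched above (error handling trimmed; current is CRIU's pointer to the task being restored):

	/* in restore_task_with_children() */
	if (prepare_mappings(current))	/* premap private vmas, fill their contents */
		goto err;
	/* ... children are forked in between, so matching pages stay COW-shared ... */
	if (unmap_guard_pages(current))	/* drop the extra page below grow-down vmas */
		goto err;

	/* in restore_one_alive_task() */
	if (open_vmas(current))		/* open files backing VMA_AREA_REGULAR areas */
		return -1;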
--
2.5.0