[CRIU] [PATCH 11/11] mem: Delayed vma/pr restore (v2)
Pavel Emelyanov
xemul at virtuozzo.com
Thu May 11 02:13:52 PDT 2017
Performance experiments show that we spend (relatively) a lot of time
mremap-ing areas from the premap area into their proper places. This time
depends on the task being restored, but for tasks with many vmas it
can be up to 20%.
The thing is that premapping is only needed to restore COW pages, since
we don't have any API in the kernel to share a page between two or more
anonymous vmas. Non-COW areas we can mmap() directly in place. But for
such cases we'll also need to restore the pages' contents from the PIE
code.
Doing the whole page-read code from PIE is way too complex (for now), so
the proposal is to optimize the case when we have a single local pagemap
layer. This is what the pr.pieok boolean stands for.
v2:
* Fixed ARM compiling (vma addresses formatting)
* Unused tail of premapped area was left in task after restore
* Preadv-ing pages in restorer context worked on corrupted iovs
due to mistakes in pointer arithmetic
* AIO mapping skipped at premap wasn't mapped in pie
* Growsdown VMAs should sometimes (when they are "guarded" by a
previous VMA and the guard page's contents cannot be restored in
place) be premapped
* Always premmap for lazy-pages restore
Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
---
criu/cr-restore.c | 1 +
criu/include/pagemap.h | 6 ++++
criu/include/restorer.h | 12 +++++++
criu/include/rst_info.h | 2 ++
criu/mem.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++---
criu/pagemap.c | 35 ++++++++++++++++--
criu/pie/restorer.c | 52 +++++++++++++++++++++++++--
criu/pstree.c | 1 +
8 files changed, 195 insertions(+), 8 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 66a5e91..63c86e8 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -3477,6 +3477,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
RST_MEM_FIXUP_PPTR(task_args->helpers);
RST_MEM_FIXUP_PPTR(task_args->zombies);
RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
+ RST_MEM_FIXUP_PPTR(task_args->vma_ios);
if (core->tc->has_seccomp_mode)
task_args->seccomp_mode = core->tc->seccomp_mode;
diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
index aa3c4aa..08633ef 100644
--- a/criu/include/pagemap.h
+++ b/criu/include/pagemap.h
@@ -58,6 +58,9 @@ struct page_read {
int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr,
int nr, void *buf, unsigned flags);
+ /* Whether or not pages can be read in PIE code */
+ bool pieok;
+
/* Private data of reader */
struct cr_img *pmi;
struct cr_img *pi;
@@ -104,8 +107,11 @@ extern int open_page_read(int pid, struct page_read *, int pr_flags);
extern int open_page_read_at(int dfd, int pid, struct page_read *pr,
int pr_flags);
+struct task_restore_args;
+
int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
unsigned long len, struct list_head *to);
+int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta);
/*
* Create a shallow copy of page_read object.
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 87de026..1866bbd 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -102,6 +102,14 @@ struct thread_restore_args {
typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
+struct restore_vma_io {
+ int nr_iovs;
+ loff_t off;
+ struct iovec iovs[0];
+};
+
+#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))
+
struct task_restore_args {
struct thread_restore_args *t; /* thread group leader */
@@ -124,6 +132,10 @@ struct task_restore_args {
VmaEntry *vmas;
unsigned int vmas_n;
+ int vma_ios_fd;
+ struct restore_vma_io *vma_ios;
+ unsigned int vma_ios_n;
+
struct restore_posix_timer *posix_timers;
unsigned int posix_timers_n;
diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
index 92dfc9d..c3dbe2d 100644
--- a/criu/include/rst_info.h
+++ b/criu/include/rst_info.h
@@ -39,6 +39,8 @@ struct rst_info {
struct vm_area_list vmas;
struct _MmEntry *mm;
+ struct list_head vma_io;
+ unsigned int pages_img_id;
u32 cg_set;
diff --git a/criu/mem.c b/criu/mem.c
index b4f9990..942d5d9 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -741,8 +741,34 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void
return 0;
}
+static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head)
+{
+ /*
+ * Growsdown VMAs always have one guard page at the
+ * beginning and sometimes this page contains data.
+ * In case the VMA is premmaped, we premmap one page
+ * larger VMA. In case of in place restore we can only
+ * do this if the VMA in question is not "guarded" by
+ * some other VMA.
+ */
+ if (vma->e->flags & MAP_GROWSDOWN) {
+ if (vma->list.prev != head) {
+ struct vma_area *prev;
+
+ prev = list_entry(vma->list.prev, struct vma_area, list);
+ if (prev->e->end == vma->e->start) {
+ pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n",
+ vma->e->start, vma->e->end);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
- void *at, struct page_read *pr)
+ void **at, struct page_read *pr)
{
struct vma_area *vma;
unsigned long pstart = 0;
@@ -760,7 +786,14 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
if (!vma_area_is_private(vma, kdat.task_size))
continue;
- ret = premap_private_vma(t, vma, &at);
+ if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h))
+ /*
+ * VMA in question is not shared with anyone. We'll
+ * restore it with its contents in restorer.
+ */
+ continue;
+
+ ret = premap_private_vma(t, vma, at);
if (ret < 0)
break;
}
@@ -773,6 +806,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
struct vma_area *vma;
int ret = 0;
struct list_head *vmas = &rsti(t)->vmas.h;
+ struct list_head *vma_io = &rsti(t)->vma_io;
unsigned int nr_restored = 0;
unsigned int nr_shared = 0;
@@ -787,6 +821,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
}
vma = list_first_entry(vmas, struct vma_area, list);
+ rsti(t)->pages_img_id = pr->pages_img_id;
/*
* Read page contents.
@@ -839,6 +874,28 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
goto err_addr;
}
+ if (!vma_area_is(vma, VMA_PREMMAPED)) {
+ unsigned long len = min_t(unsigned long,
+ (nr_pages - i) * PAGE_SIZE,
+ vma->e->end - va);
+
+ if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io))
+ return -1;
+
+ pr->skip_pages(pr, len);
+
+ va += len;
+ len >>= PAGE_SHIFT;
+ nr_restored += len;
+ i += len - 1;
+ pr_debug("Enqueue page-read\n");
+ continue;
+ }
+
+ /*
+ * Otherwise to the COW restore
+ */
+
off = (va - vma->e->start) / PAGE_SIZE;
p = decode_pointer((off) * PAGE_SIZE +
vma->premmaped_addr);
@@ -974,7 +1031,7 @@ int prepare_mappings(struct pstree_item *t)
pr.advance(&pr); /* shift to the 1st iovec */
- ret = premap_priv_vmas(t, vmas, addr, &pr);
+ ret = premap_priv_vmas(t, vmas, &addr, &pr);
if (ret < 0)
goto out;
@@ -991,6 +1048,23 @@ int prepare_mappings(struct pstree_item *t)
old_premmapped_addr, old_premmapped_len);
}
+ /*
+ * Not all VMAs were premmaped. Find out the unused tail of the
+ * premapped area and unmap it.
+ */
+ old_premmapped_len = addr - rsti(t)->premmapped_addr;
+ if (old_premmapped_len < rsti(t)->premmapped_len) {
+ unsigned long tail;
+
+ tail = rsti(t)->premmapped_len - old_premmapped_len;
+ ret = munmap(addr, tail);
+ if (ret < 0)
+ pr_perror("Unable to unmap %p(%lx)", addr, tail);
+ rsti(t)->premmapped_len = old_premmapped_len;
+ pr_info("Shrunk premap area to %p(%lx)\n",
+ rsti(t)->premmapped_addr, rsti(t)->premmapped_len);
+ }
+
out:
return ret;
}
@@ -1044,6 +1118,18 @@ int open_vmas(struct pstree_item *t)
return 0;
}
+static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
+{
+ struct cr_img *pages;
+
+ pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
+ if (!pages)
+ return -1;
+
+ ta->vma_ios_fd = img_raw_fd(pages);
+ return pagemap_render_iovec(&rsti(t)->vma_io, ta);
+}
+
int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
{
struct vma_area *vma;
@@ -1069,6 +1155,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
vma_premmaped_start(vme) = vma->premmaped_addr;
}
- return 0;
+ return prepare_vma_ios(t, ta);
}
diff --git a/criu/pagemap.c b/criu/pagemap.c
index dcc1332..d770e81 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -11,7 +11,8 @@
#include "servicefd.h"
#include "pagemap.h"
#include "page-xfer.h"
-
+#include "restorer.h"
+#include "rst-malloc.h"
#include "fault-injection.h"
#include "xmalloc.h"
#include "protobuf.h"
@@ -309,6 +310,32 @@ static int enqueue_async_iov(struct page_read *pr, void *buf,
return 0;
}
+int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta)
+{
+ struct page_read_iov *piov;
+
+ ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE);
+ ta->vma_ios_n = 0;
+
+ list_for_each_entry(piov, from, l) {
+ struct restore_vma_io *rio;
+
+ pr_info("`- render %d iovs (%p:%zd...)\n", piov->nr,
+ piov->to[0].iov_base, piov->to[0].iov_len);
+ rio = rst_mem_alloc(RIO_SIZE(piov->nr), RM_PRIVATE);
+ if (!rio)
+ return -1;
+
+ rio->nr_iovs = piov->nr;
+ rio->off = piov->from;
+ memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec));
+
+ ta->vma_ios_n++;
+ }
+
+ return 0;
+}
+
int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
unsigned long len, struct list_head *to)
{
@@ -795,6 +822,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
pr->bunch.iov_len = 0;
pr->bunch.iov_base = NULL;
pr->pmes = NULL;
+ pr->pieok = false;
pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
if (!pr->pmi)
@@ -836,8 +864,11 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
pr->maybe_read_page = maybe_read_page_img_cache;
else if (remote)
pr->maybe_read_page = maybe_read_page_remote;
- else
+ else {
pr->maybe_read_page = maybe_read_page_local;
+ if (!pr->parent && !opts.lazy_pages)
+ pr->pieok = true;
+ }
pr_debug("Opened %s page read %u (parent %u)\n",
remote ? "remote" : "local", pr->id,
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 9852d77..98c81f3 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -616,6 +616,10 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
flags &= ~MAP_ANONYMOUS;
+ /* See comment in premap_private_vma() for this flag change */
+ if (vma_entry_is(vma_entry, VMA_AREA_AIORING))
+ flags |= MAP_ANONYMOUS;
+
/* A mapping of file with MAP_SHARED is up to date */
if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
prot |= PROT_WRITE;
@@ -1156,7 +1160,7 @@ long __export_restore_task(struct task_restore_args *args)
int i, k;
VmaEntry *vma_entry;
unsigned long va;
-
+ struct restore_vma_io *rio;
struct rt_sigframe *rt_sigframe;
struct prctl_mm_map prctl_map;
unsigned long new_sp;
@@ -1265,7 +1269,8 @@ long __export_restore_task(struct task_restore_args *args)
for (i = 0; i < args->vmas_n; i++) {
vma_entry = args->vmas + i;
- if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
+ if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) &&
+ !vma_entry_is(vma_entry, VMA_AREA_AIORING))
continue;
if (vma_entry_is(vma_entry, VMA_PREMMAPED))
@@ -1279,6 +1284,49 @@ long __export_restore_task(struct task_restore_args *args)
}
}
+ /*
+ * Now read the contents (if any)
+ */
+
+ rio = args->vma_ios;
+ for (i = 0; i < args->vma_ios_n; i++) {
+ struct iovec *iovs = rio->iovs;
+ int nr = rio->nr_iovs;
+ ssize_t r;
+
+ while (nr) {
+ pr_debug("Preadv %lx:%d... (%d iovs)\n",
+ (unsigned long)iovs->iov_base,
+ (int)iovs->iov_len, nr);
+ r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
+ if (r < 0) {
+ pr_err("Can't read pages data (%d)\n", (int)r);
+ goto core_restore_end;
+ }
+
+ pr_debug("`- returned %ld\n", (long)r);
+ rio->off += r;
+ /* Advance the iovecs */
+ do {
+ if (iovs->iov_len <= r) {
+ pr_debug(" `- skip pagemap\n");
+ r -= iovs->iov_len;
+ iovs++;
+ nr--;
+ continue;
+ }
+
+ iovs->iov_base += r;
+ iovs->iov_len -= r;
+ break;
+ } while (nr > 0);
+ }
+
+ rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs);
+ }
+
+ sys_close(args->vma_ios_fd);
+
#ifdef CONFIG_VDSO
/*
* Proxify vDSO.
diff --git a/criu/pstree.c b/criu/pstree.c
index 2cc0844..bbad67d 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -228,6 +228,7 @@ struct pstree_item *__alloc_pstree_item(bool rst, int level)
return NULL;
memset(item, 0, sz);
vm_area_list_init(&rsti(item)->vmas);
+ INIT_LIST_HEAD(&rsti(item)->vma_io);
/*
* On restore we never expand pid level,
* so allocate them all at once.
--
2.1.4
More information about the CRIU
mailing list