[CRIU] [PATCH 11/11] mem: Delayed vma/pr restore (v2)
Kirill Tkhai
ktkhai at virtuozzo.com
Mon May 15 09:16:57 PDT 2017
After this commit (previous is OK) I got the following:
root@pro:/home/kirill/criu# ./test/zdtm.py run -t zdtm/static/maps06
=== Run 1/1 ================ zdtm/static/maps06
========================= Run zdtm/static/maps06 in h ==========================
Start test
./maps06 --pidfile=maps06.pid --outfile=maps06.out --filename=maps06.test
Run criu dump
Run criu restore
=[log]=> dump/zdtm/static/maps06/27/1/restore.log
------------------------ grep Error ------------------------
(00.006707) 27: Opening 0x007fcd1c7f4000-0x007fcd1c7f5000 0000000000000000 (41) vma
(00.006709) 27: Opening 0x007fcd1c7f6000-0x007fcd1c7f7000 0000000000000000 (41) vma
(00.006711) 27: Opening 0x007fcd1c7f8000-0x007fcd1c7f9000 0000000000000000 (41) vma
(00.006714) 27: Opening 0x007fcd1c7fa000-0x007fcd1c7fb000 0000000000000000 (41) vma
(00.006718) 27: Error (criu/files-reg.c:1567): Can't open file home/kirill/criu/test/zdtm/static/maps06.test on restore: Too many open files
(00.006721) 27: Error (criu/files-reg.c:1506): Can't open file home/kirill/criu/test/zdtm/static/maps06.test: Too many open files
(00.006722) 27: Error (criu/mem.c:1113): `- Can't open vma
(00.006761) Error (criu/cr-restore.c:2366): Restoring FAILED.
------------------------ ERROR OVER ------------------------
################# Test zdtm/static/maps06 FAIL at CRIU restore #################
##################################### FAIL #####################################
Also, the open files list looks like:
root@pro:/home/kirill/github/criu# ls /proc/5637/fd -l
итого 0
lrwx------ 1 root root 64 май 15 18:56 0 -> /dev/null
l-wx------ 1 root root 64 май 15 18:56 1 -> /home/kirill/criu/test/zdtm/static/maps06.out.inprogress
lrwx------ 1 root root 64 май 15 18:56 10 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 100 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1000 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1001 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1002 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1003 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1004 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1005 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1006 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1007 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1008 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 1009 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 101 -> /home/kirill/criu/test/zdtm/static/maps06.test
...
lrwx------ 1 root root 64 май 15 18:56 99 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 990 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 991 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 992 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 993 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 994 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 995 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 996 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 997 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 998 -> /home/kirill/criu/test/zdtm/static/maps06.test
lrwx------ 1 root root 64 май 15 18:56 999 -> /home/kirill/criu/test/zdtm/static/maps06.test
On 11.05.2017 12:13, Pavel Emelyanov wrote:
> Performance experiments show, that we spend (relatively) a lot of time
> mremap-ing areas from premap area into their proper places. This time
> depends on the task being restored, but for those with many vmas this
> can be up to 20%.
>
> The thing is that premapping is only needed to restore cow pages since
> we don't have any API in the kernel to share a page between two or more
> anonymous vmas. For non-cowing areas we can mmap() them directly in
> place. But for such cases we'll also need to restore the page's
> contents from the pie code.
>
> Doing the whole page-read code from PIE is way too complex (for now), so
> the proposal is to optimize the case when we have a single local pagemap
> layer. This is what pr.pieok boolean stands for.
>
> v2:
> * Fixed ARM compiling (vma addresses formatting)
> * Unused tail of premapped area was left in task after restore
> * Preadv-ing pages in restorer context worked on corrupted iovs
> due to mistakes in pointer arithmetic
> * AIO mapping skipped at premap wasn't mapped in pie
> * Growsdown VMAs should sometimes (when they are "guarded" by
> previous VMA and guard page's contents cannot be restored in
> place) be premapped
> * Always premap for lazy-pages restore
>
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
> criu/cr-restore.c | 1 +
> criu/include/pagemap.h | 6 ++++
> criu/include/restorer.h | 12 +++++++
> criu/include/rst_info.h | 2 ++
> criu/mem.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++---
> criu/pagemap.c | 35 ++++++++++++++++--
> criu/pie/restorer.c | 52 +++++++++++++++++++++++++--
> criu/pstree.c | 1 +
> 8 files changed, 195 insertions(+), 8 deletions(-)
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 66a5e91..63c86e8 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -3477,6 +3477,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> RST_MEM_FIXUP_PPTR(task_args->helpers);
> RST_MEM_FIXUP_PPTR(task_args->zombies);
> RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
> + RST_MEM_FIXUP_PPTR(task_args->vma_ios);
>
> if (core->tc->has_seccomp_mode)
> task_args->seccomp_mode = core->tc->seccomp_mode;
> diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
> index aa3c4aa..08633ef 100644
> --- a/criu/include/pagemap.h
> +++ b/criu/include/pagemap.h
> @@ -58,6 +58,9 @@ struct page_read {
> int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr,
> int nr, void *buf, unsigned flags);
>
> + /* Whether or not pages can be read in PIE code */
> + bool pieok;
> +
> /* Private data of reader */
> struct cr_img *pmi;
> struct cr_img *pi;
> @@ -104,8 +107,11 @@ extern int open_page_read(int pid, struct page_read *, int pr_flags);
> extern int open_page_read_at(int dfd, int pid, struct page_read *pr,
> int pr_flags);
>
> +struct task_restore_args;
> +
> int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
> unsigned long len, struct list_head *to);
> +int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta);
>
> /*
> * Create a shallow copy of page_read object.
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 87de026..1866bbd 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -102,6 +102,14 @@ struct thread_restore_args {
>
> typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
>
> +struct restore_vma_io {
> + int nr_iovs;
> + loff_t off;
> + struct iovec iovs[0];
> +};
> +
> +#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))
> +
> struct task_restore_args {
> struct thread_restore_args *t; /* thread group leader */
>
> @@ -124,6 +132,10 @@ struct task_restore_args {
> VmaEntry *vmas;
> unsigned int vmas_n;
>
> + int vma_ios_fd;
> + struct restore_vma_io *vma_ios;
> + unsigned int vma_ios_n;
> +
> struct restore_posix_timer *posix_timers;
> unsigned int posix_timers_n;
>
> diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
> index 92dfc9d..c3dbe2d 100644
> --- a/criu/include/rst_info.h
> +++ b/criu/include/rst_info.h
> @@ -39,6 +39,8 @@ struct rst_info {
>
> struct vm_area_list vmas;
> struct _MmEntry *mm;
> + struct list_head vma_io;
> + unsigned int pages_img_id;
>
> u32 cg_set;
>
> diff --git a/criu/mem.c b/criu/mem.c
> index b4f9990..942d5d9 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -741,8 +741,34 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void
> return 0;
> }
>
> +static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head)
> +{
> + /*
> + * Growsdown VMAs always have one guard page at the
> + * beginning and sometimes this page contains data.
> + * In case the VMA is premmaped, we premmap one page
> + * larger VMA. In case of in place restore we can only
> + * do this if the VMA in question is not "guarded" by
> + * some other VMA.
> + */
> + if (vma->e->flags & MAP_GROWSDOWN) {
> + if (vma->list.prev != head) {
> + struct vma_area *prev;
> +
> + prev = list_entry(vma->list.prev, struct vma_area, list);
> + if (prev->e->end == vma->e->start) {
> + pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n",
> + vma->e->start, vma->e->end);
> + return true;
> + }
> + }
> + }
> +
> + return false;
> +}
> +
> static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
> - void *at, struct page_read *pr)
> + void **at, struct page_read *pr)
> {
> struct vma_area *vma;
> unsigned long pstart = 0;
> @@ -760,7 +786,14 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
> if (!vma_area_is_private(vma, kdat.task_size))
> continue;
>
> - ret = premap_private_vma(t, vma, &at);
> + if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h))
> + /*
> + * VMA in question is not shared with anyone. We'll
> + * restore it with its contents in restorer.
> + */
> + continue;
> +
> + ret = premap_private_vma(t, vma, at);
> if (ret < 0)
> break;
> }
> @@ -773,6 +806,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
> struct vma_area *vma;
> int ret = 0;
> struct list_head *vmas = &rsti(t)->vmas.h;
> + struct list_head *vma_io = &rsti(t)->vma_io;
>
> unsigned int nr_restored = 0;
> unsigned int nr_shared = 0;
> @@ -787,6 +821,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
> }
>
> vma = list_first_entry(vmas, struct vma_area, list);
> + rsti(t)->pages_img_id = pr->pages_img_id;
>
> /*
> * Read page contents.
> @@ -839,6 +874,28 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
> goto err_addr;
> }
>
> + if (!vma_area_is(vma, VMA_PREMMAPED)) {
> + unsigned long len = min_t(unsigned long,
> + (nr_pages - i) * PAGE_SIZE,
> + vma->e->end - va);
> +
> + if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io))
> + return -1;
> +
> + pr->skip_pages(pr, len);
> +
> + va += len;
> + len >>= PAGE_SHIFT;
> + nr_restored += len;
> + i += len - 1;
> + pr_debug("Enqueue page-read\n");
> + continue;
> + }
> +
> + /*
> + * Otherwise to the COW restore
> + */
> +
> off = (va - vma->e->start) / PAGE_SIZE;
> p = decode_pointer((off) * PAGE_SIZE +
> vma->premmaped_addr);
> @@ -974,7 +1031,7 @@ int prepare_mappings(struct pstree_item *t)
>
> pr.advance(&pr); /* shift to the 1st iovec */
>
> - ret = premap_priv_vmas(t, vmas, addr, &pr);
> + ret = premap_priv_vmas(t, vmas, &addr, &pr);
> if (ret < 0)
> goto out;
>
> @@ -991,6 +1048,23 @@ int prepare_mappings(struct pstree_item *t)
> old_premmapped_addr, old_premmapped_len);
> }
>
> + /*
> + * Not all VMAs were premmaped. Find out the unused tail of the
> + * premapped area and unmap it.
> + */
> + old_premmapped_len = addr - rsti(t)->premmapped_addr;
> + if (old_premmapped_len < rsti(t)->premmapped_len) {
> + unsigned long tail;
> +
> + tail = rsti(t)->premmapped_len - old_premmapped_len;
> + ret = munmap(addr, tail);
> + if (ret < 0)
> + pr_perror("Unable to unmap %p(%lx)", addr, tail);
> + rsti(t)->premmapped_len = old_premmapped_len;
> + pr_info("Shrunk premap area to %p(%lx)\n",
> + rsti(t)->premmapped_addr, rsti(t)->premmapped_len);
> + }
> +
> out:
> return ret;
> }
> @@ -1044,6 +1118,18 @@ int open_vmas(struct pstree_item *t)
> return 0;
> }
>
> +static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
> +{
> + struct cr_img *pages;
> +
> + pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> + if (!pages)
> + return -1;
> +
> + ta->vma_ios_fd = img_raw_fd(pages);
> + return pagemap_render_iovec(&rsti(t)->vma_io, ta);
> +}
> +
> int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
> {
> struct vma_area *vma;
> @@ -1069,6 +1155,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
> vma_premmaped_start(vme) = vma->premmaped_addr;
> }
>
> - return 0;
> + return prepare_vma_ios(t, ta);
> }
>
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index dcc1332..d770e81 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -11,7 +11,8 @@
> #include "servicefd.h"
> #include "pagemap.h"
> #include "page-xfer.h"
> -
> +#include "restorer.h"
> +#include "rst-malloc.h"
> #include "fault-injection.h"
> #include "xmalloc.h"
> #include "protobuf.h"
> @@ -309,6 +310,32 @@ static int enqueue_async_iov(struct page_read *pr, void *buf,
> return 0;
> }
>
> +int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta)
> +{
> + struct page_read_iov *piov;
> +
> + ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE);
> + ta->vma_ios_n = 0;
> +
> + list_for_each_entry(piov, from, l) {
> + struct restore_vma_io *rio;
> +
> + pr_info("`- render %d iovs (%p:%zd...)\n", piov->nr,
> + piov->to[0].iov_base, piov->to[0].iov_len);
> + rio = rst_mem_alloc(RIO_SIZE(piov->nr), RM_PRIVATE);
> + if (!rio)
> + return -1;
> +
> + rio->nr_iovs = piov->nr;
> + rio->off = piov->from;
> + memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec));
> +
> + ta->vma_ios_n++;
> + }
> +
> + return 0;
> +}
> +
> int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
> unsigned long len, struct list_head *to)
> {
> @@ -795,6 +822,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
> pr->bunch.iov_len = 0;
> pr->bunch.iov_base = NULL;
> pr->pmes = NULL;
> + pr->pieok = false;
>
> pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
> if (!pr->pmi)
> @@ -836,8 +864,11 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
> pr->maybe_read_page = maybe_read_page_img_cache;
> else if (remote)
> pr->maybe_read_page = maybe_read_page_remote;
> - else
> + else {
> pr->maybe_read_page = maybe_read_page_local;
> + if (!pr->parent && !opts.lazy_pages)
> + pr->pieok = true;
> + }
>
> pr_debug("Opened %s page read %u (parent %u)\n",
> remote ? "remote" : "local", pr->id,
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 9852d77..98c81f3 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -616,6 +616,10 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
> if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
> flags &= ~MAP_ANONYMOUS;
>
> + /* See comment in premap_private_vma() for this flag change */
> + if (vma_entry_is(vma_entry, VMA_AREA_AIORING))
> + flags |= MAP_ANONYMOUS;
> +
> /* A mapping of file with MAP_SHARED is up to date */
> if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
> prot |= PROT_WRITE;
> @@ -1156,7 +1160,7 @@ long __export_restore_task(struct task_restore_args *args)
> int i, k;
> VmaEntry *vma_entry;
> unsigned long va;
> -
> + struct restore_vma_io *rio;
> struct rt_sigframe *rt_sigframe;
> struct prctl_mm_map prctl_map;
> unsigned long new_sp;
> @@ -1265,7 +1269,8 @@ long __export_restore_task(struct task_restore_args *args)
> for (i = 0; i < args->vmas_n; i++) {
> vma_entry = args->vmas + i;
>
> - if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
> + if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) &&
> + !vma_entry_is(vma_entry, VMA_AREA_AIORING))
> continue;
>
> if (vma_entry_is(vma_entry, VMA_PREMMAPED))
> @@ -1279,6 +1284,49 @@ long __export_restore_task(struct task_restore_args *args)
> }
> }
>
> + /*
> + * Now read the contents (if any)
> + */
> +
> + rio = args->vma_ios;
> + for (i = 0; i < args->vma_ios_n; i++) {
> + struct iovec *iovs = rio->iovs;
> + int nr = rio->nr_iovs;
> + ssize_t r;
> +
> + while (nr) {
> + pr_debug("Preadv %lx:%d... (%d iovs)\n",
> + (unsigned long)iovs->iov_base,
> + (int)iovs->iov_len, nr);
> + r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
> + if (r < 0) {
> + pr_err("Can't read pages data (%d)\n", (int)r);
> + goto core_restore_end;
> + }
> +
> + pr_debug("`- returned %ld\n", (long)r);
> + rio->off += r;
> + /* Advance the iovecs */
> + do {
> + if (iovs->iov_len <= r) {
> + pr_debug(" `- skip pagemap\n");
> + r -= iovs->iov_len;
> + iovs++;
> + nr--;
> + continue;
> + }
> +
> + iovs->iov_base += r;
> + iovs->iov_len -= r;
> + break;
> + } while (nr > 0);
> + }
> +
> + rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs);
> + }
> +
> + sys_close(args->vma_ios_fd);
> +
> #ifdef CONFIG_VDSO
> /*
> * Proxify vDSO.
> diff --git a/criu/pstree.c b/criu/pstree.c
> index 2cc0844..bbad67d 100644
> --- a/criu/pstree.c
> +++ b/criu/pstree.c
> @@ -228,6 +228,7 @@ struct pstree_item *__alloc_pstree_item(bool rst, int level)
> return NULL;
> memset(item, 0, sz);
> vm_area_list_init(&rsti(item)->vmas);
> + INIT_LIST_HEAD(&rsti(item)->vma_io);
> /*
> * On restore we never expand pid level,
> * so allocate them all at once.
>
More information about the CRIU
mailing list