[CRIU] [PATCH 2/2] lazy-pages: add support to combine pre-copy and post-copy
Mike Rapoport
rppt at linux.vnet.ibm.com
Thu Sep 8 00:16:37 PDT 2016
Hi Adrian,
On Wed, Sep 07, 2016 at 06:24:32PM +0200, Adrian Reber wrote:
> From: Adrian Reber <areber at redhat.com>
>
> To combine pre-copy (pre-dump) and post-copy (lazy-pages) mode the
> lazy-page mode must be made aware of pages which are only in the parent
> image and not in the current checkpoint image.
>
> As the restorer only works on VmaEntry-s and knows nothing about PageMap
> entries the VmaEntry-s need to be adapted to match the PageMap entries.
>
> This changes the lazy-page detection to not only rely on
> vma_entry_can_be_lazy() but to also check if the page is available in
> the parent. If the page is available in a parent checkpoint the page is
> not marked as lazy via a new VmaEntry field (optional bool lazy = 11).
>
> If the VmaEntry does not have the same size as the PageMap entry the
> VmaEntry needs to be adapted to match the PageMap entry and then the new
> lazy flag can be set in the VmaEntry.
>
> The restorer then additionally has to check if the VmaEntry has the lazy
> flag. If the lazy flag is not set, then the page is available in a
> parent checkpoint.
>
> This code additionally adds a 'return 0;' to unmap_guard_pages() as the
> VmaEntry splitting can create multiple VmaEntry-s with MAP_GROWSDOWN and
> only the first entry needs the guard page to be unmapped.
I believe that the restorer and the lazy-pages daemon should switch to using
the pagemap to detect memory areas that can be lazily restored. The
pagemap entry should contain enough information about the actual location
of the pages it describes. Then both the restorer and the lazy-pages daemon
will not need VMA data to determine how to restore the pages.
When we discussed it earlier (the thread starting at [1]), I missed the fact
that pagemap does not mark potentially lazy pages when dumping to disk. I'm
going to send patches that aim to fix it and make pagemap accurately
describe different kinds of entries.
[1] https://lists.openvz.org/pipermail/criu/2016-July/030561.html
> Following steps to migrate a process are now possible:
>
> Source system:
>
> * criu pre-dump -D /tmp/cp/1 -t <PID>
> * rsync -a /tmp/cp <destination>:/tmp
> * criu dump -D /tmp/cp/2 -t <PID> --port 27 --lazy-pages \
> --prev-images-dir ../1/ --track-mem
>
> Destination system:
>
> * rsync -a <source>:/tmp/cp /tmp/
> * criu lazy-pages --page-server --address <source> --port 27 \
> -D /tmp/cp/2 &
> * criu restore --lazy-pages -D /tmp/cp/2
>
> This will now restore all pages from the parent checkpoint if they
> are not marked as lazy in the second checkpoint.
>
> Signed-off-by: Adrian Reber <areber at redhat.com>
> ---
> criu/mem.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++----
> criu/pie/restorer.c | 2 +-
> criu/uffd.c | 3 +-
> images/vma.proto | 1 +
> 4 files changed, 113 insertions(+), 10 deletions(-)
>
> diff --git a/criu/mem.c b/criu/mem.c
> index a94a378..e155d9e 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -684,6 +684,35 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo
> return ret;
> }
>
> +static int split_priv_vma(unsigned long addr, struct vma_area *vma)
> +{
> + struct vma_area *new_vma;
> + VmaEntry *e;
> +
> + /* create new VMA Area */
> + new_vma = alloc_vma_area();
> + /* Store address of new VMA Entry */
> + e = new_vma->e;
> + /* Copy all old values */
> + memcpy(new_vma, vma, sizeof(struct vma_area));
> + /* Fill new VMA Entry with old values */
> + memcpy(e, vma->e, sizeof(VmaEntry));
> + /* overwrite start address with current */
> + e->start = addr;
> + /* Overwrite old end address */
> + vma->e->end = addr;
> + e->has_lazy = true;
> + e->lazy = false;
> + new_vma->e = e;
> + new_vma->page_bitmap = xzalloc(BITS_TO_LONGS(vma_entry_len(new_vma->e) / PAGE_SIZE) * sizeof(long));
> + if (new_vma->page_bitmap == NULL)
> + return -1;
> +
> + new_vma->premmaped_addr += vma_entry_len(vma->e);
> + list_add(&new_vma->list, &vma->list);
> + return 0;
> +}
> +
> static int restore_priv_vma_content(struct pstree_item *t)
> {
> struct vma_area *vma;
> @@ -745,20 +774,84 @@ static int restore_priv_vma_content(struct pstree_item *t)
> goto err_addr;
> }
>
> - off = (va - vma->e->start) / PAGE_SIZE;
> - p = decode_pointer((off) * PAGE_SIZE +
> - vma->premmaped_addr);
> -
> /*
> * This means that userfaultfd is used to load the pages
> * on demand.
> */
> if (opts.lazy_pages && vma_entry_can_be_lazy(vma->e)) {
> - pr_debug("Lazy restore skips %#016"PRIx64"\n", vma->e->start);
> - pr.skip_pages(&pr, PAGE_SIZE);
> - nr_lazy++;
> - continue;
> + pr_debug("Lazy restore skips %#016"PRIx64"\n", va);
> + if (!pr.pe->in_parent) {
> + pr_debug("%#016"PRIx64" not in parent\n", va);
> + pr.skip_pages(&pr, PAGE_SIZE);
> + nr_lazy++;
> + vma->e->has_lazy = true;
> + vma->e->lazy = true;
> + continue;
> + } else {
> + unsigned long new_addr;
> + /*
> + * First check if the PageMap Entry and the
> + * VMA Entry are the same size. That is the easy
> + * case where the whole VMA Entry can be marked
> + * as non-lazy as it present in the parent.
> + */
> + if (pr.pe->vaddr == vma->e->start &&
> + pr.pe->vaddr + (pr.pe->nr_pages * PAGE_SIZE) == vma->e->end) {
> + pr_debug("VMA Entry and PageMap Entry matches\n");
> + /*
> + * lazy defaults to false; explicitly set it for
> + * better readability.
> + */
> + vma->e->has_lazy = true;
> + vma->e->lazy = false;
> + goto read_pages;
> + }
> + /*
> + * Only those pages in the VMA Entry which
> + * are not available in the parent, should be
> + * marked as lazy.
> + * As only the PageMap Entry knows if the pages
> + * are available in the parent, the VMA Entry needs
> + * to be split into pages which actually should
> + * be loaded lazily and pages which are in the
> + * parent. This is necessary as the restore only
> + * knows about VMAs and not PageMap Entries.
> + */
> +
> + /* Check if this is the last page of the VMA Entry */
> + if (vma->e->end == va + PAGE_SIZE) {
> + pr_debug("VMA Entry end has already been reached\n");
> + goto read_pages;
> + }
> +
> + /*
> + * Check if the current address is the same
> + * as the current VMA Entries start address.
> + * If not a VMA Entry at the beginning has to be
> + * split off.
> + */
> + if (va != vma->e->start) {
> + pr_debug("Replacing VMA start address\n");
> + new_addr = va;
> + } else {
> + new_addr = pr.pe->vaddr + (pr.pe->nr_pages * PAGE_SIZE);
> + if (new_addr > vma->e->end) {
> + pr_debug("VMA Entry smaller than PageMap Entry\n");
> + new_addr = va;
> + }
> + }
> +
> + ret = split_priv_vma(new_addr, vma);
> + if (ret)
> + return -1;
> + rsti(t)->vmas.nr++;
> + }
> }
> +read_pages:
> +
> + off = (va - vma->e->start) / PAGE_SIZE;
> + p = decode_pointer((off) * PAGE_SIZE +
> + vma->premmaped_addr);
>
> set_bit(off, vma->page_bitmap);
> if (vma->ppage_bitmap) { /* inherited vma */
> @@ -923,6 +1016,14 @@ int unmap_guard_pages(struct pstree_item *t)
> pr_perror("Can't unmap guard page");
> return -1;
> }
> +
> + /*
> + * The code to combine pre-copy and post-copy
> + * can split existing MAP_GROWSDOWN VMA areas
> + * into two. Therefore returning once a guard
> + * page has been unmapped.
> + */
> + return 0;
> }
> }
>
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index d84d316..39719d3 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -841,7 +841,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd)
> * pages, so that the processes will hang until the memory is
> * injected via userfaultfd.
> */
> - if (vma_entry_can_be_lazy(vma_entry))
> + if (vma_entry_can_be_lazy(vma_entry) && vma_entry->lazy)
> if (enable_uffd(uffd, dst, len) != 0)
> return -1;
>
> diff --git a/criu/uffd.c b/criu/uffd.c
> index cf03c31..71dbed7 100644
> --- a/criu/uffd.c
> +++ b/criu/uffd.c
> @@ -508,7 +508,8 @@ static int collect_uffd_pages(struct page_read *pr, struct lazy_pages_info *lpi)
> */
> if (base >= vma->e->start && base < vma->e->end) {
> if (vma_entry_can_be_lazy(vma->e)) {
> - uffd_page = true;
> + if(!pr->pe->in_parent)
> + uffd_page = true;
> break;
> }
> }
> diff --git a/images/vma.proto b/images/vma.proto
> index 7085f42..843ba2b 100644
> --- a/images/vma.proto
> +++ b/images/vma.proto
> @@ -22,4 +22,5 @@ message vma_entry {
>
> /* file status flags */
> optional uint32 fdflags = 10 [(criu).hex = true];
> + optional bool lazy = 11 [(criu).hex = false];
> }
> --
> 2.7.4
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>
More information about the CRIU
mailing list