[CRIU] [PATCH v3 2/2] lazy-pages: add support to combine pre-copy and post-copy

Mike Rapoport rppt at linux.vnet.ibm.com
Tue Sep 20 23:44:49 PDT 2016


Hi Adrian,

On Tue, Sep 20, 2016 at 06:54:11PM +0200, Adrian Reber wrote:
> From: Adrian Reber <areber at redhat.com>
 
[snip]
 
> v2:
>  - changed parent detection to use pagemap_in_parent()
> 
> v3:
>  - unfortunately this reverts
>    c11cf95afbe023a2816a3afaecb65cc4fee670d7
>    "criu: mem: skip lazy pages during restore based on pagemap info"
>    To be able to split the VMA-s in the right chunks for the restorer
>    it is necessary to make the decision lazy or not on the VmaEntry
>    level.

I've thought a little bit more about it and I'm not sure it is necessary to
split VMAs at all. The restorer can register the entire VMA with
userfaultfd, even if the VMA contains pages that are already restored. We
just need to make sure uffd.c can properly handle -EEXITS case and it seems
we are good.

Consider the following scenario:
There is a VMA that spawns from 0x10000 to 0x20000 (16 pages). Let's say
that the range from 0x10000 to 0x1a000 is dumped during pre-dump and there
were no changes in that memory, so during dump the range 0x10000 - 0x1a0000
will be marked with PE_PARENT, and the range 0x1a000 - 0x20000 will be
marked PE_LAZY.
During restore, the range marked as PE_PARENT will be filled with the
content from the disk image and the range marked PE_LAZY will remain
unpopulated.
restorer will register the entire VMA (0x10000 - 0x20000) with userfaultfd
and lazy-pages daemon will consider the entire range as lazy.
However, the pages at 0x10000 - 0x1a000 are already present, therefore
access to these pages won't cause a page fault.
When, at last lazy-pages daemon will try to restore the range 0x10000 - 0x1a000
during handle_remaining_pages stage, it'll get -EEXISTS from userfaultfd.

Sorry I've come really late with this and created some hassle for you...

> Signed-off-by: Adrian Reber <areber at redhat.com>
> ---
>  criu/mem.c          | 123 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  criu/pie/restorer.c |   2 +-
>  criu/uffd.c         |   3 +-
>  images/vma.proto    |   1 +
>  4 files changed, 116 insertions(+), 13 deletions(-)
> 
> diff --git a/criu/mem.c b/criu/mem.c
> index 9043254..597f7e4 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -684,6 +684,35 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo
>  	return ret;
>  }
> 
> +static int split_priv_vma(unsigned long addr, struct vma_area *vma)
> +{
> +	struct vma_area *new_vma;
> +	VmaEntry *e;
> +
> +	/* create new VMA Area */
> +	new_vma = alloc_vma_area();
> +	/* Store address of new VMA Entry */
> +	e = new_vma->e;
> +	/* Copy all old values */
> +	memcpy(new_vma, vma, sizeof(struct vma_area));
> +	/* Fill new VMA Entry with old values */
> +	memcpy(e, vma->e, sizeof(VmaEntry));
> +	/* overwrite start address with current */
> +	e->start = addr;
> +	/* Overwrite old end address */
> +	vma->e->end = addr;
> +	e->has_lazy = true;
> +	e->lazy = false;
> +	new_vma->e = e;
> +	new_vma->page_bitmap = xzalloc(BITS_TO_LONGS(vma_entry_len(new_vma->e) / PAGE_SIZE) * sizeof(long));
> +	if (new_vma->page_bitmap == NULL)
> +		return -1;
> +
> +	new_vma->premmaped_addr += vma_entry_len(vma->e);
> +	list_add(&new_vma->list, &vma->list);
> +	return 0;
> +}
> +
>  static int restore_priv_vma_content(struct pstree_item *t)
>  {
>  	struct vma_area *vma;
> @@ -718,17 +747,6 @@ static int restore_priv_vma_content(struct pstree_item *t)
>  		va = (unsigned long)iov.iov_base;
>  		nr_pages = iov.iov_len / PAGE_SIZE;
> 
> -		/*
> -		 * This means that userfaultfd is used to load the pages
> -		 * on demand.
> -		 */
> -		if (opts.lazy_pages && pagemap_lazy(pr.pe)) {
> -			pr_debug("Lazy restore skips %ld pages at %p\n", nr_pages, iov.iov_base);
> -			pr.skip_pages(&pr, iov.iov_len);
> -			nr_lazy += nr_pages;
> -			continue;
> -		}
> -
>  		for (i = 0; i < nr_pages; i++) {
>  			unsigned char buf[PAGE_SIZE];
>  			void *p;
> @@ -756,6 +774,81 @@ static int restore_priv_vma_content(struct pstree_item *t)
>  				goto err_addr;
>  			}
> 
> +			/*
> +			 * This means that userfaultfd is used to load the pages
> +			 * on demand.
> +			 */
> +			if (opts.lazy_pages && vma_entry_can_be_lazy(vma->e)) {
> +				pr_debug("Lazy restore skips %#016"PRIx64"\n", va);
> +				if (!pagemap_in_parent(pr.pe)) {
> +					pr_debug("%#016"PRIx64" not in parent\n", va);
> +					pr.skip_pages(&pr, PAGE_SIZE);
> +					nr_lazy++;
> +					vma->e->has_lazy = true;
> +					vma->e->lazy = true;
> +					continue;
> +				} else {
> +					unsigned long new_addr;
> +					/*
> +					 * First check if the PageMap Entry and the
> +					 * VMA Entry are the same size. That is the easy
> +					 * case where the whole VMA Entry can be marked
> +					 * as non-lazy as it present in the parent.
> +					 */
> +					if (pr.pe->vaddr == vma->e->start &&
> +							pr.pe->vaddr + (pr.pe->nr_pages * PAGE_SIZE) == vma->e->end) {
> +						pr_debug("VMA Entry and PageMap Entry matches\n");
> +						/*
> +						 * lazy defaults to false; explicitly set it for
> +						 * better readability.
> +						 */
> +						vma->e->has_lazy = true;
> +						vma->e->lazy = false;
> +						goto read_pages;
> +					}
> +					/*
> +					 * Only those pages in the VMA Entry which
> +					 * are not available in the parent, should be
> +					 * marked as lazy.
> +					 * As only the PageMap Entry knows if the pages
> +					 * are available in the parent, the VMA Entry needs
> +					 * to be split into pages which actually should
> +					 * be loaded lazily and pages which are in the
> +					 * parent. This is necessary as the restore only
> +					 * knows about VMAs and not PageMap Entries.
> +					 */
> +
> +					/* Check if this is the last page of the VMA Entry */
> +					if (vma->e->end == va + PAGE_SIZE) {
> +						pr_debug("VMA Entry end has already been reached\n");
> +						goto read_pages;
> +					}
> +
> +					/*
> +					 * Check if the current address is the same
> +					 * as the current VMA Entries start address.
> +					 * If not a VMA Entry at the beginning has to be
> +					 * split off.
> +					 */
> +					if (va != vma->e->start) {
> +						pr_debug("Replacing VMA start address\n");
> +						new_addr = va;
> +					} else {
> +						new_addr = pr.pe->vaddr + (pr.pe->nr_pages * PAGE_SIZE);
> +						if (new_addr > vma->e->end) {
> +							pr_debug("VMA Entry smaller than PageMap Entry\n");
> +							new_addr = va;
> +						}
> +					}
> +
> +					ret = split_priv_vma(new_addr, vma);
> +					if (ret)
> +						return -1;
> +					rsti(t)->vmas.nr++;
> +				}
> +			}
> +read_pages:
> +
>  			off = (va - vma->e->start) / PAGE_SIZE;
>  			p = decode_pointer((off) * PAGE_SIZE +
>  					vma->premmaped_addr);
> @@ -920,6 +1013,14 @@ int unmap_guard_pages(struct pstree_item *t)
>  				pr_perror("Can't unmap guard page");
>  				return -1;
>  			}
> +
> +			/*
> +			 * The code to combine pre-copy and post-copy
> +			 * can split existing MAP_GROWSDOWN VMA areas
> +			 * into two. Therefore returning once a guard
> +			 * page has been unmapped.
> +			 */
> +			return 0;
>  		}
>  	}
> 
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 6790118..304f91a 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -842,7 +842,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd)
>  	 * pages, so that the processes will hang until the memory is
>  	 * injected via userfaultfd.
>  	 */
> -	if (vma_entry_can_be_lazy(vma_entry))
> +	if (vma_entry_can_be_lazy(vma_entry) && vma_entry->lazy)
>  		if (enable_uffd(uffd, dst, len) != 0)
>  			return -1;
> 
> diff --git a/criu/uffd.c b/criu/uffd.c
> index 81dc7ae..fbf1cb6 100644
> --- a/criu/uffd.c
> +++ b/criu/uffd.c
> @@ -508,7 +508,8 @@ static int collect_uffd_pages(struct page_read *pr, struct lazy_pages_info *lpi)
>  			 */
>  			if (base >= vma->e->start && base < vma->e->end) {
>  				if (vma_entry_can_be_lazy(vma->e)) {
> -					uffd_page = true;
> +					if(!pagemap_in_parent(pr->pe))
> +						uffd_page = true;
>  					break;
>  				}
>  			}
> diff --git a/images/vma.proto b/images/vma.proto
> index 7085f42..843ba2b 100644
> --- a/images/vma.proto
> +++ b/images/vma.proto
> @@ -22,4 +22,5 @@ message vma_entry {
> 
>  	/* file status flags */
>  	optional uint32		fdflags	= 10 [(criu).hex = true];
> +	optional bool		lazy 	= 11 [(criu).hex = false];
>  }
> -- 
> 2.7.4
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
> 



More information about the CRIU mailing list