[CRIU] [PATCH 04/11 v3] criu: lazy-pages: replace page list with IOVs list

Pavel Emelyanov xemul at virtuozzo.com
Tue Nov 15 07:03:34 PST 2016


On 11/15/2016 03:07 PM, Mike Rapoport wrote:
> Instead of tracking memory handled by userfaultfd on the page basis we can
> use IOVs for continious chunks.
> 
> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> ---
> v3: s/min(/min_t(uint64_t should make arm compilation happy
> 
>  criu/uffd.c | 261 +++++++++++++++++++++++++++++++++---------------------------
>  1 file changed, 143 insertions(+), 118 deletions(-)
> 
> diff --git a/criu/uffd.c b/criu/uffd.c
> index 6250bf1..eae42fd 100644
> --- a/criu/uffd.c
> +++ b/criu/uffd.c
> @@ -46,11 +46,17 @@
>  
>  static mutex_t *lazy_sock_mutex;
>  
> +struct lazy_iovec {
> +	struct list_head l;
> +	unsigned long base;
> +	unsigned long len;
> +};
> +
>  struct lazy_pages_info {
>  	int pid;
>  	int uffd;
>  
> -	struct list_head pages;
> +	struct list_head iovs;
>  
>  	struct page_read pr;
>  
> @@ -72,7 +78,7 @@ static struct lazy_pages_info *lpi_init(void)
>  		return NULL;
>  
>  	memset(lpi, 0, sizeof(*lpi));
> -	INIT_LIST_HEAD(&lpi->pages);
> +	INIT_LIST_HEAD(&lpi->iovs);
>  	INIT_HLIST_NODE(&lpi->hash);
>  
>  	return lpi;
> @@ -80,8 +86,12 @@ static struct lazy_pages_info *lpi_init(void)
>  
>  static void lpi_fini(struct lazy_pages_info *lpi)
>  {
> +	struct lazy_iovec *p, *n;
> +
>  	if (!lpi)
>  		return;
> +	list_for_each_entry_safe(p, n, &lpi->iovs, l)
> +		xfree(p);
>  	if (lpi->uffd > 0)
>  		close(lpi->uffd);
>  	if (lpi->pr.close)
> @@ -320,118 +330,135 @@ out:
>  	return -1;
>  }
>  
> -#define UFFD_FLAG_SENT	0x1
> +static MmEntry *init_mm_entry(struct lazy_pages_info *lpi)
> +{
> +	struct cr_img *img;
> +	MmEntry *mm;
> +	int ret;
>  
> -struct uffd_pages_struct {
> -	struct list_head list;
> -	unsigned long addr;
> -	int flags;
> -};
> +	img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
> +	if (!img)
> +		return NULL;
> +
> +	ret = pb_read_one_eof(img, &mm, PB_MM);
> +	close_image(img);
> +	if (ret == -1)
> +		return NULL;
> +	pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
> +
> +	return mm;
> +}
>  
> -static int collect_uffd_pages(struct lazy_pages_info *lpi, MmEntry *mm)
> +static int update_lazy_iovecs(struct lazy_pages_info *lpi, unsigned long addr,
> +			      int len)
>  {
> -	unsigned long base;
> -	int i, j;
> -	struct iovec iov;
> -	unsigned long nr_pages;
> -	unsigned long ps;
> -	int rc;
> -	struct uffd_pages_struct *uffd_pages;
> -	struct page_read *pr = &lpi->pr;
> +	struct lazy_iovec *lazy_iov, *n;
>  
> -	rc = pr->get_pagemap(pr, &iov);
> -	if (rc <= 0)
> -		return 0;
> +	list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l) {
> +		unsigned long start = lazy_iov->base;
> +		unsigned long end = start + lazy_iov->len;
>  
> -	ps = page_size();
> -	nr_pages = iov.iov_len / ps;
> -	base = (unsigned long) iov.iov_base;
> -	pr_debug("iov.iov_base 0x%lx (%ld pages)\n", base, nr_pages);
> +		if (len <= 0)
> +			break;
>  
> -	for (i = 0; i < nr_pages; i++) {
> -		bool uffd_page = false;
> -		base = (unsigned long) iov.iov_base + (i * ps);
> -		/*
> -		 * Only pages which are MAP_ANONYMOUS and MAP_PRIVATE
> -		 * are relevant for userfaultfd handling.
> -		 * Loop over all VMAs to see if the flags matching.
> -		 */
> -		for (j = 0; j < mm->n_vmas; j++) {
> -			VmaEntry *vma = mm->vmas[j];
> -			/*
> -			 * This loop assumes that base can actually be found
> -			 * in the VMA list.
> -			 */
> -			if (base >= vma->start && base < vma->end) {
> -				if (vma_entry_can_be_lazy(vma)) {
> -					if(!pagemap_in_parent(pr->pe))
> -						uffd_page = true;
> -					break;
> -				}
> +		if (addr < start || addr >= end)
> +			continue;
> +
> +		if (addr + len < end) {
> +			if (addr == start) {
> +				lazy_iov->base += len;
> +				lazy_iov->len -= len;
> +			} else {
> +				struct lazy_iovec *new_iov;
> +
> +				lazy_iov->len -= (end - addr);
> +
> +				new_iov = xzalloc(sizeof(*new_iov));
> +				if (!new_iov)
> +					return -1;
> +
> +				new_iov->base = addr + len;
> +				new_iov->len = end - (addr + len);
> +
> +				list_add(&new_iov->l, &lazy_iov->l);
>  			}
> +			break;
>  		}
>  
> -		/* This is not a page we are looking for. Move along */
> -		if (!uffd_page)
> -			continue;
> -
> -		pr_debug("Adding 0x%lx to our list\n", base);
> +		if (addr == start) {
> +			list_del(&lazy_iov->l);
> +			xfree(lazy_iov);
> +		} else {
> +			lazy_iov->len -= (end - addr);
> +		}
>  
> -		uffd_pages = xzalloc(sizeof(struct uffd_pages_struct));
> -		if (!uffd_pages)
> -			return -1;
> -		uffd_pages->addr = base;
> -		list_add(&uffd_pages->list, &lpi->pages);
> +		len -= (end - addr);
> +		addr = end;
>  	}
>  
> -	return 1;
> +	return 0;
>  }
>  
>  /*
> - *  Setting up criu infrastructure and scan for VMAs.
> + * Create a list of IOVs that can be handled using userfaultfd. The
> + * IOVs generally correspond to lazy pagemap entries, except the cases
> + * when a single pagemap entry covers several VMAs. In those cases
> + * IOVs are split at VMA boundaries because UFFDIO_COPY may be done
> + * only inside a single VMA.
> + * We assume here that pagemaps and VMAs are sorted.
>   */
> -static int find_vmas(struct lazy_pages_info *lpi)
> +static int collect_lazy_iovecs(struct lazy_pages_info *lpi)
>  {
> -	struct cr_img *img;
> -	int ret;
> +	struct page_read *pr = &lpi->pr;
> +	struct lazy_iovec *lazy_iov, *n;
>  	MmEntry *mm;
> -	struct uffd_pages_struct *uffd_pages;
> +	int nr_pages = 0, n_vma = 0;
> +	int ret = -1;
> +	unsigned long start, end, len;
>  
> -	img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
> -	if (!img)
> +	mm = init_mm_entry(lpi);
> +	if (!mm)
>  		return -1;
>  
> -	ret = pb_read_one_eof(img, &mm, PB_MM);
> -	close_image(img);
> -	if (ret == -1)
> -		return -1;
> -	pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
> +	while (pr->advance(pr)) {
> +		if (!pagemap_lazy(pr->pe))
> +			continue;
>  
> -	ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
> -	if (ret <= 0) {
> -		ret = -1;
> -		goto out;
> -	}
> -	/*
> -	 * This puts all pages which should be handled by userfaultfd
> -	 * in the list uffd_list. This list is later used to detect if
> -	 * a page has already been transferred or if it needs to be
> -	 * pushed into the process using userfaultfd.
> -	 */
> -	do {
> -		ret = collect_uffd_pages(lpi, mm);
> -		if (ret == -1) {
> -			goto out;
> +		start = pr->pe->vaddr;
> +		end = start + pr->pe->nr_pages * page_size();
> +		nr_pages += pr->pe->nr_pages;
> +
> +		for (; n_vma < mm->n_vmas; n_vma++) {
> +			VmaEntry *vma = mm->vmas[n_vma];
> +
> +			if (start >= vma->end)
> +				continue;
> +
> +			lazy_iov = xzalloc(sizeof(*lazy_iov));
> +			if (!lazy_iov)
> +				goto free_iovs;
> +
> +			len = min_t(uint64_t, end, vma->end) - start;
> +			lazy_iov->base = start;
> +			lazy_iov->len = len;
> +			list_add_tail(&lazy_iov->l, &lpi->iovs);
> +
> +			if (end <= vma->end)
> +				break;
> +
> +			start = vma->end;
>  		}
> -	} while (ret);
> +	}
>  
> -	/* Count detected pages */
> -	list_for_each_entry(uffd_pages, &lpi->pages, list)
> -	    ret++;
> +	ret = nr_pages;
> +	goto free_mm;
>  
> -	pr_debug("Found %d pages to be handled by UFFD\n", ret);
> +free_iovs:
> +	list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l)
> +		xfree(lazy_iov);
> +free_mm:
> +	mm_entry__free_unpacked(mm, NULL);
>  
> -out:
>  	return ret;
>  }
>  
> @@ -473,13 +500,22 @@ static int ud_open(int client, struct lazy_pages_info **_lpi)
>  	uffd_flags = fcntl(lpi->uffd, F_GETFD, NULL);
>  	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
>  
> +	ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
> +	if (ret <= 0) {
> +		ret = -1;
> +		goto out;
> +	}
> +
>  	/*
>  	 * Find the memory pages belonging to the restored process
>  	 * so that it is trackable when all pages have been transferred.
>  	 */
> -	if ((lpi->total_pages = find_vmas(lpi)) == -1)
> +	lpi->total_pages = collect_lazy_iovecs(lpi);
> +	if (lpi->total_pages < 0)
>  		goto out;
>  
> +	pr_debug("Found %ld pages to be handled by UFFD\n", lpi->total_pages);
> +
>  	hlist_add_head(&lpi->hash, &lpi_hash[lpi->uffd % LPI_HASH_SIZE]);
>  	*_lpi = lpi;
>  
> @@ -591,33 +627,33 @@ static int uffd_handle_page(struct lazy_pages_info *lpi, __u64 address,
>  
>  static int handle_remaining_pages(struct lazy_pages_info *lpi, void *dest)
>  {
> -	struct uffd_pages_struct *uffd_pages;
> -	int rc;
> +	struct lazy_iovec *lazy_iov;
> +	int nr_pages, i, err;
> +	unsigned long addr;
>  
> -	list_for_each_entry(uffd_pages, &lpi->pages, list) {
> -		pr_debug("Checking remaining pages 0x%lx (flags 0x%x)\n",
> -			 uffd_pages->addr, uffd_pages->flags);
> -		if (uffd_pages->flags & UFFD_FLAG_SENT)
> -			continue;
> +	lpi->pr.reset(&lpi->pr);
>  
> -		rc = uffd_handle_page(lpi, uffd_pages->addr, dest);
> -		if (rc < 0) {
> -			pr_err("Error during UFFD copy\n");
> -			return -1;
> -		}
> +	list_for_each_entry(lazy_iov, &lpi->iovs, l) {
> +		nr_pages = lazy_iov->len / PAGE_SIZE;
> +
> +		for (i = 0; i < nr_pages; i++) {
> +			addr = lazy_iov->base + i * PAGE_SIZE;
>  
> -		uffd_pages->flags |= UFFD_FLAG_SENT;
> +			err = uffd_handle_page(lpi, addr, dest);

This place looks dangerous. The iovs can be quite large and requesting the whole
IOV during single #PF handling may slow the restored tree down significantly.

Can we either keep the per-page API or make IOV-based work on IOV fractions if
needed?

> +			if (err < 0) {
> +				pr_err("Error during UFFD copy\n");
> +				return -1;
> +			}
> +		}
>  	}
>  
>  	return 0;
>  }
>  
> -
>  static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
>  				__u64 address)
>  {
>  	int rc;
> -	struct uffd_pages_struct *uffd_pages;
>  
>  	rc = uffd_handle_page(lpi, address, dest);
>  	if (rc < 0) {
> @@ -625,14 +661,9 @@ static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
>  		return -1;
>  	}
>  
> -	/*
> -	 * Mark this page as having been already transferred, so
> -	 * that it has not to be copied again later.
> -	 */
> -	list_for_each_entry(uffd_pages, &lpi->pages, list) {
> -		if (uffd_pages->addr == address)
> -			uffd_pages->flags |= UFFD_FLAG_SENT;
> -	}
> +	rc = update_lazy_iovecs(lpi, address, PAGE_SIZE);
> +	if (rc < 0)
> +		return -1;
>  
>  	return 0;
>  }
> @@ -642,7 +673,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
>  	struct uffd_msg msg;
>  	__u64 flags;
>  	__u64 address;
> -	struct uffd_pages_struct *uffd_pages;
>  	int ret;
>  
>  	ret = read(lpi->uffd, &msg, sizeof(msg));
> @@ -662,11 +692,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
>  	address = msg.arg.pagefault.address & ~(page_size() - 1);
>  	pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
>  
> -	/* Make sure to not transfer a page twice */
> -	list_for_each_entry(uffd_pages, &lpi->pages, list)
> -		if ((uffd_pages->addr == address) && (uffd_pages->flags & UFFD_FLAG_SENT))
> -			return 0;
> -
>  	/* Now handle the pages actually requested. */
>  	flags = msg.arg.pagefault.flags;
>  	pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
> 



More information about the CRIU mailing list