[CRIU] [PATCH 3/3] page-read: Implement asynchronous mode of reading pages (v2)

Mike Rapoport rppt at linux.vnet.ibm.com
Wed Nov 9 11:19:52 PST 2016


On Mon, Nov 07, 2016 at 09:05:58PM +0300, Pavel Emelyanov wrote:
> Currently each page read request results in pread() call. But
> we can do better -- if there's no urgent need in data (which is
> likely the case for non-cow pagemaps) we can save where the
> date should go to and then read it in in one preadv syscall.
 ^data                               ^one 'in too much
> 
> For lazy pages we get an ability to do async read_pages call
> that would send the request to server and return w/o data in
> the buffer.

For lazy pages it might be slightly more complex that just firing
->read_pages request and then calling ->sync or some similar method when
the data is actually available. We still need to refine interfaces between
pagemap, uffd and page-xfer to make all the lazy pages remoting work
properly, but having ASYNC page read will surely come in handy then.

> The explicit sync() method is added since .close is void and
> nobody checks its return code.
> 
> v2: Lost limits.h in pagemap.c for IOV_MAX (compilation).
> 
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
>  criu/include/pagemap.h |  16 ++++++
>  criu/mem.c             |   3 ++
>  criu/pagemap.c         | 136 +++++++++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 145 insertions(+), 10 deletions(-)
> 
> diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
> index b445d82..00143c7 100644
> --- a/criu/include/pagemap.h
> +++ b/criu/include/pagemap.h
> @@ -1,6 +1,7 @@
>  #ifndef __CR_PAGE_READ_H__
>  #define __CR_PAGE_READ_H__
> 
> +#include "common/list.h"
>  #include "images/pagemap.pb-c.h"
> 
>  /*
> @@ -40,6 +41,18 @@
>   * All this is implemented in read_pagemap_page.
>   */
> 
> +/*
> + * One "job" for the preadv() syscall in pagemap.c
> + */
> +struct page_read_iov {
> +	off_t from;		/* offset in pi file where to start reading from */
> +	off_t end;		/* the end of the read == sum to.iov_len -s */
> +	struct iovec *to;	/* destination iovs */
> +	unsigned int nr;	/* their number */
> +
> +	struct list_head l;
> +};
> +

The 'struct page_read_iov' is pretty much private to pagemap.c, so I'd
suggest moving it there.

>  struct page_read {
>  	/*
>  	 * gets next vaddr:len pair to work on.
> @@ -52,6 +65,7 @@ struct page_read {
>  	void (*skip_pages)(struct page_read *, unsigned long len);
>  	int (*seek_page)(struct page_read *pr, unsigned long vaddr, bool warn);
>  	void (*reset)(struct page_read *pr);
> +	int (*sync)(struct page_read *pr);
> 
>  	/* Private data of reader */
>  	struct cr_img *pmi;
> @@ -72,6 +86,8 @@ struct page_read {
>  	PagemapEntry **pmes;
>  	int nr_pmes;
>  	int curr_pme;
> +
> +	struct list_head	async;
>  };
> 
>  /* flags for ->read_pages */
> diff --git a/criu/mem.c b/criu/mem.c
> index 7da64f5..d910207 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -833,6 +833,9 @@ static int restore_priv_vma_content(struct pstree_item *t)
>  	}
> 
>  err_read:
> +	if (pr.sync(&pr))
> +		return -1;
> +
>  	pr.close(&pr);
>  	if (ret < 0)
>  		return ret;
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index 0a072c8..ca14c36 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -2,6 +2,8 @@
>  #include <stdio.h>
>  #include <unistd.h>
>  #include <linux/falloc.h>
> +#include <sys/uio.h>
> +#include <limits.h>
> 
>  #include "types.h"
>  #include "image.h"
> @@ -260,20 +262,101 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
>  		/* zero mappings should be skipped by get_pagemap */
>  		BUG();
>  	} else {

It would be nice to split read_pagemap_page to smaller chunks, e.g
read_page_from_parent and read_page_from_self even now. With the changes
below it is screaming :)

> -		int fd = img_raw_fd(pr->pi);
> +		/*
> +		 * There's no such read in the kernel that can
> +		 * read asynchronously from the page-cache, so
> +		 * in case of urgent async read, just do the
> +		 * regular syncronous read.
> +		 */
> +		if ((flags & (PR_ASYNC | PR_UNPLUG)) == PR_ASYNC) {

We can just have (flags & PR_ASYNC), can't we?
And, the entire ASYNC clause seem pretty much like a function, maybe even
several. 
 
> +			struct page_readeiov *cur_async = NULL;
> +			struct iovec *iov;
> +
> +			if (!list_empty(&pr->async))
> +				cur_async = list_entry(pr->async.prev, struct page_read_iov, l);
> +
> +			if (cur_async && pr->pi_off == cur_async->end) {
> +				/*
> +				 * This read is pure continuation of the
> +				 * previous one. Let's just add another
> +				 * IOV (or extend one of the existing).
> +				 */
> +
> +				iov = &cur_async->to[cur_async->nr - 1];
> +				if (iov->iov_base + iov->iov_len == buf)
> +					/* Extendable */
> +					iov->iov_len += len;

I know we ain't exactly kernel, but having braces would be nice here.

> +				else {
> +					/* Need one more target iovec */
> +					unsigned int n_iovs = cur_async->nr + 1;
> +
> +					if (n_iovs >= IOV_MAX)
> +						goto new_piov;
> +
> +					iov = xrealloc(cur_async->to, n_iovs * sizeof(*iov));
> +					if (!iov)
> +						return -1;
> +
> +					cur_async->to = iov;
> +
> +					iov += cur_async->nr;
> +					iov->iov_base = buf;
> +					iov->iov_len = len;
> +
> +					cur_async->nr = n_iovs;
> +				}
> +
> +				cur_async->end += len;
> +			} else {
> +				/*
> +				 * We have new read request that should happen at
> +				 * pos _after_ some hole from the previous one.
> +				 * Start the new preadv request here.
> +				 */
> +new_piov:
> +				cur_async = xmalloc(sizeof(*cur_async));
> +				if (!cur_async)
> +					return -1;
> +
> +				cur_async->from = pr->pi_off;
> +				cur_async->end = pr->pi_off + len;
> +
> +				iov = xmalloc(sizeof(*iov));
> +				if (!iov)
> +					return -1;
> +
> +				iov->iov_base = buf;
> +				iov->iov_len = len;
> +
> +				cur_async->to = iov;
> +				cur_async->nr = 1;
> +
> +				list_add_tail(&cur_async->l, &pr->async);
> +			}
> +		} else {
> +			int fd = img_raw_fd(pr->pi);
> 
> -		pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, pr->pi_off);
> -		ret = pread(fd, buf, len, pr->pi_off);
> -		if (ret != len) {
> -			pr_perror("Can't read mapping page %d", ret);
> -			return -1;
> -		}
> +			/*
> +			 * Flush any pending async requests if any
> +			 * not to break the linear reading from the
> +			 * pages.img file.
> +			 */
> +			if (pr->sync(pr))
> +				return -1;
> 
> -		if (opts.auto_dedup) {
> -			ret = punch_hole(pr, pr->pi_off, len, false);
> -			if (ret == -1) {

And, the fallback to sync read could be yet another function :)

> +			pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, pr->pi_off);
> +			ret = pread(fd, buf, len, pr->pi_off);
> +			if (ret != len) {
> +				pr_perror("Can't read mapping page %d", ret);
>  				return -1;
>  			}
> +
> +			if (opts.auto_dedup) {
> +				ret = punch_hole(pr, pr->pi_off, len, false);
> +				if (ret == -1) {
> +					return -1;
> +				}
> +			}
>  		}
> 
>  		pr->pi_off += len;
> @@ -294,10 +377,41 @@ static void free_pagemaps(struct page_read *pr)
>  	xfree(pr->pmes);
>  }
> 
> +static int process_async_reads(struct page_read *pr)
> +{
> +	int fd, ret = 0;
> +	struct page_read_iov *piov, *n;
> +
> +	fd = img_raw_fd(pr->pi);
> +	list_for_each_entry_safe(piov, n, &pr->async, l) {
> +		int ret;
> +
> +		ret = preadv(fd, piov->to, piov->nr, piov->from);
> +		if (ret != piov->end - piov->from) {
> +			pr_err("Can't read async pr bytes\n");
> +			return -1;
> +		}
> +
> +		if (opts.auto_dedup && punch_hole(pr, piov->from, ret, false))
> +			return -1;
> +
> +		list_del(&piov->l);
> +		xfree(piov->to);
> +		xfree(piov);
> +	}
> +
> +	if (pr->parent)
> +		ret = process_async_reads(pr->parent);
> +
> +	return ret;
> +}
> +
>  static void close_page_read(struct page_read *pr)
>  {
>  	int ret;
> 
> +	BUG_ON(!list_empty(&pr->async));
> +
>  	if (pr->bunch.iov_len > 0) {
>  		ret = punch_hole(pr, 0, 0, true);
>  		if (ret == -1)
> @@ -462,6 +576,7 @@ int open_page_read_at(int dfd, int id, struct page_read *pr, int pr_flags)
>  		return -1;
>  	}
> 
> +	INIT_LIST_HEAD(&pr->async);
>  	pr->pe = NULL;
>  	pr->parent = NULL;
>  	pr->cvaddr = 0;
> @@ -501,6 +616,7 @@ int open_page_read_at(int dfd, int id, struct page_read *pr, int pr_flags)
>  	pr->skip_pages = skip_pagemap_pages;
>  	pr->seek_page = seek_pagemap_page;
>  	pr->reset = reset_pagemap;
> +	pr->sync = process_async_reads;
>  	pr->id = ids++;
> 
>  	pr_debug("Opened page read %u (parent %u)\n",
> -- 
> 2.1.4
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
> 



More information about the CRIU mailing list