[CRIU] [PATCH 3/3] page-read: Implement asynchronous mode of reading pages (v2)
Mike Rapoport
rppt at linux.vnet.ibm.com
Wed Nov 9 11:19:52 PST 2016
On Mon, Nov 07, 2016 at 09:05:58PM +0300, Pavel Emelyanov wrote:
> Currently each page read request results in pread() call. But
> we can do better -- if there's no urgent need in data (which is
> likely the case for non-cow pagemaps) we can save where the
> date should go to and then read it in in one preadv syscall.
^data ^one 'in too much
>
> For lazy pages we get an ability to do async read_pages call
> that would send the request to server and return w/o data in
> the buffer.
For lazy pages it might be slightly more complex that just firing
->read_pages request and then calling ->sync or some similar method when
the data is actually available. We still need to refine interfaces between
pagemap, uffd and page-xfer to make all the lazy pages remoting work
properly, but having ASYNC page read will surely come in handy then.
> The explicit sync() method is added since .close is void and
> nobody checks its return code.
>
> v2: Lost limits.h in pagemap.c for IOV_MAX (compilation).
>
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
> criu/include/pagemap.h | 16 ++++++
> criu/mem.c | 3 ++
> criu/pagemap.c | 136 +++++++++++++++++++++++++++++++++++++++++++++----
> 3 files changed, 145 insertions(+), 10 deletions(-)
>
> diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
> index b445d82..00143c7 100644
> --- a/criu/include/pagemap.h
> +++ b/criu/include/pagemap.h
> @@ -1,6 +1,7 @@
> #ifndef __CR_PAGE_READ_H__
> #define __CR_PAGE_READ_H__
>
> +#include "common/list.h"
> #include "images/pagemap.pb-c.h"
>
> /*
> @@ -40,6 +41,18 @@
> * All this is implemented in read_pagemap_page.
> */
>
> +/*
> + * One "job" for the preadv() syscall in pagemap.c
> + */
> +struct page_read_iov {
> + off_t from; /* offset in pi file where to start reading from */
> + off_t end; /* the end of the read == sum to.iov_len -s */
> + struct iovec *to; /* destination iovs */
> + unsigned int nr; /* their number */
> +
> + struct list_head l;
> +};
> +
The 'struct page_read_iov' is pretty much private to pagemap.c, so I'd
suggest moving it there.
> struct page_read {
> /*
> * gets next vaddr:len pair to work on.
> @@ -52,6 +65,7 @@ struct page_read {
> void (*skip_pages)(struct page_read *, unsigned long len);
> int (*seek_page)(struct page_read *pr, unsigned long vaddr, bool warn);
> void (*reset)(struct page_read *pr);
> + int (*sync)(struct page_read *pr);
>
> /* Private data of reader */
> struct cr_img *pmi;
> @@ -72,6 +86,8 @@ struct page_read {
> PagemapEntry **pmes;
> int nr_pmes;
> int curr_pme;
> +
> + struct list_head async;
> };
>
> /* flags for ->read_pages */
> diff --git a/criu/mem.c b/criu/mem.c
> index 7da64f5..d910207 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -833,6 +833,9 @@ static int restore_priv_vma_content(struct pstree_item *t)
> }
>
> err_read:
> + if (pr.sync(&pr))
> + return -1;
> +
> pr.close(&pr);
> if (ret < 0)
> return ret;
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index 0a072c8..ca14c36 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -2,6 +2,8 @@
> #include <stdio.h>
> #include <unistd.h>
> #include <linux/falloc.h>
> +#include <sys/uio.h>
> +#include <limits.h>
>
> #include "types.h"
> #include "image.h"
> @@ -260,20 +262,101 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
> /* zero mappings should be skipped by get_pagemap */
> BUG();
> } else {
It would be nice to split read_pagemap_page to smaller chunks, e.g
read_page_from_parent and read_page_from_self even now. With the changes
below it is screaming :)
> - int fd = img_raw_fd(pr->pi);
> + /*
> + * There's no such read in the kernel that can
> + * read asynchronously from the page-cache, so
> + * in case of urgent async read, just do the
> + * regular syncronous read.
> + */
> + if ((flags & (PR_ASYNC | PR_UNPLUG)) == PR_ASYNC) {
We can just have (flags & PR_ASYNC), can't we?
And, the entire ASYNC clause seem pretty much like a function, maybe even
several.
> + struct page_readeiov *cur_async = NULL;
> + struct iovec *iov;
> +
> + if (!list_empty(&pr->async))
> + cur_async = list_entry(pr->async.prev, struct page_read_iov, l);
> +
> + if (cur_async && pr->pi_off == cur_async->end) {
> + /*
> + * This read is pure continuation of the
> + * previous one. Let's just add another
> + * IOV (or extend one of the existing).
> + */
> +
> + iov = &cur_async->to[cur_async->nr - 1];
> + if (iov->iov_base + iov->iov_len == buf)
> + /* Extendable */
> + iov->iov_len += len;
I know we ain't exactly kernel, but having braces would be nice here.
> + else {
> + /* Need one more target iovec */
> + unsigned int n_iovs = cur_async->nr + 1;
> +
> + if (n_iovs >= IOV_MAX)
> + goto new_piov;
> +
> + iov = xrealloc(cur_async->to, n_iovs * sizeof(*iov));
> + if (!iov)
> + return -1;
> +
> + cur_async->to = iov;
> +
> + iov += cur_async->nr;
> + iov->iov_base = buf;
> + iov->iov_len = len;
> +
> + cur_async->nr = n_iovs;
> + }
> +
> + cur_async->end += len;
> + } else {
> + /*
> + * We have new read request that should happen at
> + * pos _after_ some hole from the previous one.
> + * Start the new preadv request here.
> + */
> +new_piov:
> + cur_async = xmalloc(sizeof(*cur_async));
> + if (!cur_async)
> + return -1;
> +
> + cur_async->from = pr->pi_off;
> + cur_async->end = pr->pi_off + len;
> +
> + iov = xmalloc(sizeof(*iov));
> + if (!iov)
> + return -1;
> +
> + iov->iov_base = buf;
> + iov->iov_len = len;
> +
> + cur_async->to = iov;
> + cur_async->nr = 1;
> +
> + list_add_tail(&cur_async->l, &pr->async);
> + }
> + } else {
> + int fd = img_raw_fd(pr->pi);
>
> - pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, pr->pi_off);
> - ret = pread(fd, buf, len, pr->pi_off);
> - if (ret != len) {
> - pr_perror("Can't read mapping page %d", ret);
> - return -1;
> - }
> + /*
> + * Flush any pending async requests if any
> + * not to break the linear reading from the
> + * pages.img file.
> + */
> + if (pr->sync(pr))
> + return -1;
>
> - if (opts.auto_dedup) {
> - ret = punch_hole(pr, pr->pi_off, len, false);
> - if (ret == -1) {
And, the fallback to sync read could be yet another function :)
> + pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, pr->pi_off);
> + ret = pread(fd, buf, len, pr->pi_off);
> + if (ret != len) {
> + pr_perror("Can't read mapping page %d", ret);
> return -1;
> }
> +
> + if (opts.auto_dedup) {
> + ret = punch_hole(pr, pr->pi_off, len, false);
> + if (ret == -1) {
> + return -1;
> + }
> + }
> }
>
> pr->pi_off += len;
> @@ -294,10 +377,41 @@ static void free_pagemaps(struct page_read *pr)
> xfree(pr->pmes);
> }
>
> +static int process_async_reads(struct page_read *pr)
> +{
> + int fd, ret = 0;
> + struct page_read_iov *piov, *n;
> +
> + fd = img_raw_fd(pr->pi);
> + list_for_each_entry_safe(piov, n, &pr->async, l) {
> + int ret;
> +
> + ret = preadv(fd, piov->to, piov->nr, piov->from);
> + if (ret != piov->end - piov->from) {
> + pr_err("Can't read async pr bytes\n");
> + return -1;
> + }
> +
> + if (opts.auto_dedup && punch_hole(pr, piov->from, ret, false))
> + return -1;
> +
> + list_del(&piov->l);
> + xfree(piov->to);
> + xfree(piov);
> + }
> +
> + if (pr->parent)
> + ret = process_async_reads(pr->parent);
> +
> + return ret;
> +}
> +
> static void close_page_read(struct page_read *pr)
> {
> int ret;
>
> + BUG_ON(!list_empty(&pr->async));
> +
> if (pr->bunch.iov_len > 0) {
> ret = punch_hole(pr, 0, 0, true);
> if (ret == -1)
> @@ -462,6 +576,7 @@ int open_page_read_at(int dfd, int id, struct page_read *pr, int pr_flags)
> return -1;
> }
>
> + INIT_LIST_HEAD(&pr->async);
> pr->pe = NULL;
> pr->parent = NULL;
> pr->cvaddr = 0;
> @@ -501,6 +616,7 @@ int open_page_read_at(int dfd, int id, struct page_read *pr, int pr_flags)
> pr->skip_pages = skip_pagemap_pages;
> pr->seek_page = seek_pagemap_page;
> pr->reset = reset_pagemap;
> + pr->sync = process_async_reads;
> pr->id = ids++;
>
> pr_debug("Opened page read %u (parent %u)\n",
> --
> 2.1.4
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>
More information about the CRIU
mailing list