[CRIU] [PATCH 3/3] page-read: Implement asynchronous mode of reading pages (v2)

Pavel Emelyanov xemul at virtuozzo.com
Mon Nov 7 10:05:58 PST 2016


Currently each page read request results in a pread() call. But
we can do better -- if there's no urgent need for the data (which
is likely the case for non-cow pagemaps) we can remember where the
data should go and then read it all in with one preadv() syscall.
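
To illustrate the batching idea (a minimal sketch, not the patch
code -- struct pending, queue_read() and flush_reads() are made-up
names, and the fixed batch of 8 iovecs stands in for the IOV_MAX
handling done below):

#include <sys/types.h>
#include <sys/uio.h>

struct pending {
	off_t from;		/* file offset of the first queued byte */
	struct iovec to[8];	/* destination buffers, in file order */
	int nr;			/* number of queued iovecs */
};

/* Remember where the data should land instead of reading it now. */
static int queue_read(struct pending *p, void *buf, size_t len)
{
	if (p->nr == 8)
		return -1;	/* batch is full, flush first */
	p->to[p->nr].iov_base = buf;
	p->to[p->nr].iov_len = len;
	p->nr++;
	return 0;
}

/* Pick up all queued chunks with a single syscall. */
static ssize_t flush_reads(int fd, struct pending *p)
{
	ssize_t ret = preadv(fd, p->to, p->nr, p->from);
	p->nr = 0;
	return ret;
}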

For lazy pages we get the ability to do an async read_pages call
that sends the request to the server and returns without the data
in the buffer.
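
From the caller's side that looks roughly like this (a sketch:
the read_pages signature is assumed to match read_pagemap_page()
in the hunk below, and vaddr/nr_pages/buf are placeholders):

	/* PR_ASYNC only queues the request */
	if (pr.read_pages(&pr, vaddr, nr_pages, buf, PR_ASYNC) < 0)
		return -1;
	/* buf contents are NOT valid at this point */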

An explicit sync() method is added since .close is void and
nobody checks its return code.
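
So a reader is expected to finish the way the mem.c hunk below
does (sketch):

	if (pr.sync(&pr))	/* one preadv() per contiguous run */
		return -1;	/* sync reports errors, close can't */
	pr.close(&pr);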

v2: Added the missing limits.h include in pagemap.c for IOV_MAX
(fixes compilation).

Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
---
 criu/include/pagemap.h |  16 ++++++
 criu/mem.c             |   3 ++
 criu/pagemap.c         | 136 +++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
index b445d82..00143c7 100644
--- a/criu/include/pagemap.h
+++ b/criu/include/pagemap.h
@@ -1,6 +1,7 @@
 #ifndef __CR_PAGE_READ_H__
 #define __CR_PAGE_READ_H__
 
+#include "common/list.h"
 #include "images/pagemap.pb-c.h"
 
 /*
@@ -40,6 +41,18 @@
  * All this is implemented in read_pagemap_page.
  */
 
+/*
+ * One "job" for the preadv() syscall in pagemap.c
+ */
+struct page_read_iov {
+	off_t from;		/* offset in the pi file to start reading from */
+	off_t end;		/* end of the read == from + sum of to[i].iov_len */
+	struct iovec *to;	/* destination iovs */
+	unsigned int nr;	/* their number */
+
+	struct list_head l;
+};
+
 struct page_read {
 	/*
 	 * gets next vaddr:len pair to work on.
@@ -52,6 +65,7 @@ struct page_read {
 	void (*skip_pages)(struct page_read *, unsigned long len);
 	int (*seek_page)(struct page_read *pr, unsigned long vaddr, bool warn);
 	void (*reset)(struct page_read *pr);
+	int (*sync)(struct page_read *pr);
 
 	/* Private data of reader */
 	struct cr_img *pmi;
@@ -72,6 +86,8 @@ struct page_read {
 	PagemapEntry **pmes;
 	int nr_pmes;
 	int curr_pme;
+
+	struct list_head	async;
 };
 
 /* flags for ->read_pages */
diff --git a/criu/mem.c b/criu/mem.c
index 7da64f5..d910207 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -833,6 +833,9 @@ static int restore_priv_vma_content(struct pstree_item *t)
 	}
 
 err_read:
+	if (pr.sync(&pr))
+		return -1;
+
 	pr.close(&pr);
 	if (ret < 0)
 		return ret;
diff --git a/criu/pagemap.c b/criu/pagemap.c
index 0a072c8..ca14c36 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -2,6 +2,8 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <linux/falloc.h>
+#include <sys/uio.h>
+#include <limits.h>
 
 #include "types.h"
 #include "image.h"
@@ -260,20 +262,101 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
 		/* zero mappings should be skipped by get_pagemap */
 		BUG();
 	} else {
-		int fd = img_raw_fd(pr->pi);
+		/*
+		 * The kernel has no read call that works
+		 * asynchronously against the page cache, so
+		 * for an urgent async read just fall back
+		 * to the regular synchronous read.
+		 */
+		if ((flags & (PR_ASYNC | PR_UNPLUG)) == PR_ASYNC) {
+			struct page_read_iov *cur_async = NULL;
+			struct iovec *iov;
+
+			if (!list_empty(&pr->async))
+				cur_async = list_entry(pr->async.prev, struct page_read_iov, l);
+
+			if (cur_async && pr->pi_off == cur_async->end) {
+				/*
+				 * This read is a pure continuation of the
+				 * previous one. Just add another IOV
+				 * (or extend the last one).
+				 */
+
+				iov = &cur_async->to[cur_async->nr - 1];
+				if (iov->iov_base + iov->iov_len == buf)
+					/* Extendable */
+					iov->iov_len += len;
+				else {
+					/* Need one more target iovec */
+					unsigned int n_iovs = cur_async->nr + 1;
+
+					if (n_iovs >= IOV_MAX)
+						goto new_piov;
+
+					iov = xrealloc(cur_async->to, n_iovs * sizeof(*iov));
+					if (!iov)
+						return -1;
+
+					cur_async->to = iov;
+
+					iov += cur_async->nr;
+					iov->iov_base = buf;
+					iov->iov_len = len;
+
+					cur_async->nr = n_iovs;
+				}
+
+				cur_async->end += len;
+			} else {
+				/*
+				 * We have a new read request that starts at a
+				 * position _after_ a hole left by the previous
+				 * one. Start a new preadv request here.
+				 */
+new_piov:
+				cur_async = xmalloc(sizeof(*cur_async));
+				if (!cur_async)
+					return -1;
+
+				cur_async->from = pr->pi_off;
+				cur_async->end = pr->pi_off + len;
+
+				iov = xmalloc(sizeof(*iov));
+				if (!iov)
+					return -1;
+
+				iov->iov_base = buf;
+				iov->iov_len = len;
+
+				cur_async->to = iov;
+				cur_async->nr = 1;
+
+				list_add_tail(&cur_async->l, &pr->async);
+			}
+		} else {
+			int fd = img_raw_fd(pr->pi);
 
-		pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, pr->pi_off);
-		ret = pread(fd, buf, len, pr->pi_off);
-		if (ret != len) {
-			pr_perror("Can't read mapping page %d", ret);
-			return -1;
-		}
+			/*
+			 * Flush pending async requests, if any, so
+			 * as not to break the linear reading of the
+			 * pages.img file.
+			 */
+			if (pr->sync(pr))
+				return -1;
 
-		if (opts.auto_dedup) {
-			ret = punch_hole(pr, pr->pi_off, len, false);
-			if (ret == -1) {
+			pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, pr->pi_off);
+			ret = pread(fd, buf, len, pr->pi_off);
+			if (ret != len) {
+				pr_perror("Can't read mapping page %d", ret);
 				return -1;
 			}
+
+			if (opts.auto_dedup) {
+				ret = punch_hole(pr, pr->pi_off, len, false);
+				if (ret == -1) {
+					return -1;
+				}
+			}
 		}
 
 		pr->pi_off += len;
@@ -294,10 +377,41 @@ static void free_pagemaps(struct page_read *pr)
 	xfree(pr->pmes);
 }
 
+static int process_async_reads(struct page_read *pr)
+{
+	int fd, ret = 0;
+	struct page_read_iov *piov, *n;
+
+	fd = img_raw_fd(pr->pi);
+	list_for_each_entry_safe(piov, n, &pr->async, l) {
+		int ret;
+
+		ret = preadv(fd, piov->to, piov->nr, piov->from);
+		if (ret != piov->end - piov->from) {
+			pr_err("Can't read async pr bytes\n");
+			return -1;
+		}
+
+		if (opts.auto_dedup && punch_hole(pr, piov->from, ret, false))
+			return -1;
+
+		list_del(&piov->l);
+		xfree(piov->to);
+		xfree(piov);
+	}
+
+	if (pr->parent)
+		ret = process_async_reads(pr->parent);
+
+	return ret;
+}
+
 static void close_page_read(struct page_read *pr)
 {
 	int ret;
 
+	BUG_ON(!list_empty(&pr->async));
+
 	if (pr->bunch.iov_len > 0) {
 		ret = punch_hole(pr, 0, 0, true);
 		if (ret == -1)
@@ -462,6 +576,7 @@ int open_page_read_at(int dfd, int id, struct page_read *pr, int pr_flags)
 		return -1;
 	}
 
+	INIT_LIST_HEAD(&pr->async);
 	pr->pe = NULL;
 	pr->parent = NULL;
 	pr->cvaddr = 0;
@@ -501,6 +616,7 @@ int open_page_read_at(int dfd, int id, struct page_read *pr, int pr_flags)
 	pr->skip_pages = skip_pagemap_pages;
 	pr->seek_page = seek_pagemap_page;
 	pr->reset = reset_pagemap;
+	pr->sync = process_async_reads;
 	pr->id = ids++;
 
 	pr_debug("Opened page read %u (parent %u)\n",
-- 
2.1.4


