[CRIU] [PATCH] lazy-pages: use random read from page-pipe instead of splitting it

Mike Rapoport rppt at linux.vnet.ibm.com
Wed Jun 7 11:53:56 MSK 2017


For the remote lazy pages case, to access pages in the middle of a pipe we
split the page_pipe_buffers and iovecs and use splice() to move the data
between the underlying pipes. After the splits we get a page_pipe_buffer
with a single iovec that can be used to splice() the data further into the
socket.
This patch replaces the splitting and splicing with the use of a helper
pipe and tee(). We tee() the pages from the beginning of the pipe up to the
last requested page into the helper pipe, sink the unneeded head part into
/dev/null, and are left with the requested pages ready for splice() into
the socket.
This allows the lazy-pages daemon to request the same page several times,
which is required to properly support fork() after the restore.
As an added bonus, we simplify the code and reduce the number of pipes that
live in the system.

Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
---
 criu/include/page-pipe.h | 10 +++++++
 criu/page-pipe.c         | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 criu/page-xfer.c         | 59 +++++++++++++-------------------------
 3 files changed, 102 insertions(+), 40 deletions(-)

diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
index 10ae873..76ec1fd 100644
--- a/criu/include/page-pipe.h
+++ b/criu/include/page-pipe.h
@@ -138,4 +138,14 @@ extern int page_pipe_split(struct page_pipe *pp, unsigned long addr,
 
 extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb);
 
+struct pipe_read_dest {
+	int p[2];
+	int sink_fd;
+};
+
+extern int pipe_read_dest_init(struct pipe_read_dest *prd);
+extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
+			  unsigned long addr, unsigned int *nr_pages,
+			  unsigned int ppb_flags);
+
 #endif /* __CR_PAGE_PIPE_H__ */
diff --git a/criu/page-pipe.c b/criu/page-pipe.c
index 4ebd0cb..4b4b3fc 100644
--- a/criu/page-pipe.c
+++ b/criu/page-pipe.c
@@ -474,6 +474,79 @@ int page_pipe_split(struct page_pipe *pp, unsigned long addr,
 	return 0;
 }
 
+int pipe_read_dest_init(struct pipe_read_dest *prd)
+{
+	int ret;
+
+	if (pipe(prd->p)) {
+		pr_perror("Cannot create pipe for reading from page-pipe");
+		return -1;
+	}
+
+	ret = fcntl(prd->p[0], F_SETPIPE_SZ, PIPE_MAX_SIZE * PAGE_SIZE);
+	if (ret < 0)
+		return -1;
+
+	prd->sink_fd = open("/dev/null", O_WRONLY);
+	if (prd->sink_fd < 0) {
+		pr_perror("Cannot open sink for reading from page-pipe");
+		return -1;
+	}
+
+	ret = fcntl(prd->p[0], F_GETPIPE_SZ, 0);
+	pr_debug("Created tee pipe size %d\n", ret);
+
+	return 0;
+}
+
+int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
+		   unsigned long addr, unsigned int *nr_pages,
+		   unsigned int ppb_flags)
+{
+	struct page_pipe_buf *ppb;
+	struct iovec *iov = NULL;
+	unsigned long skip = 0, len;
+	int ret;
+
+	/*
+	 * Get ppb that contains addr and count length of data between
+	 * the beginning of the pipe and addr. If no ppb is found, the
+	 * requested page is mapped to zero pfn
+	 */
+	ppb = get_ppb(pp, addr, &iov, &skip);
+	if (!ppb) {
+		*nr_pages = 0;
+		return 0;
+	}
+
+	if (!(ppb->flags & ppb_flags)) {
+		pr_err("PPB flags mismatch: %x %x\n", ppb_flags, ppb->flags);
+		return false;
+	}
+
+	/* clamp the request if it passes the end of iovec */
+	len = min((unsigned long)iov->iov_base + iov->iov_len - addr,
+		  (unsigned long)(*nr_pages) * PAGE_SIZE);
+	*nr_pages = len / PAGE_SIZE;
+
+	/* we should tee() the requested length + the beginning of the pipe */
+	len += skip;
+
+	ret = tee(ppb->p[0], prd->p[1], len, 0);
+	if (ret != len) {
+		pr_perror("tee: %d", ret);
+		return -1;
+	}
+
+	ret = splice(prd->p[0], NULL, prd->sink_fd, NULL, skip, 0);
+	if (ret != skip) {
+		pr_perror("splice: %d", ret);
+		return -1;
+	}
+
+	return 0;
+}
+
 void page_pipe_destroy_ppb(struct page_pipe_buf *ppb)
 {
 	list_del(&ppb->l);
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index c557407..49693bb 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -554,10 +554,19 @@ static struct page_xfer_job cxfer = {
 	.dst_id = ~0,
 };
 
+static struct pipe_read_dest pipe_read_dest = {
+	.sink_fd = -1,
+};
+
 static void page_server_close(void)
 {
 	if (cxfer.dst_id != ~0)
 		cxfer.loc_xfer.close(&cxfer.loc_xfer);
+	if (pipe_read_dest.sink_fd != -1) {
+		close(pipe_read_dest.sink_fd);
+		close(pipe_read_dest.p[0]);
+		close(pipe_read_dest.p[1]);
+	}
 }
 
 static int page_server_open(int sk, struct page_server_iov *pi)
@@ -653,43 +662,18 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags)
 	return 0;
 }
 
-static bool can_send_pages(struct page_pipe_buf *ppb, struct iovec *iov,
-			   struct page_server_iov *pi)
-{
-	unsigned long len = pi->nr_pages * PAGE_SIZE;
-
-	if (!(ppb->flags & PPB_LAZY)) {
-		pr_err("Requested pages are not lazy\n");
-		return false;
-	}
-
-	if (iov->iov_len != len) {
-		pr_err("IOV len %zu does not match requested %lu\n",
-		       iov->iov_len, len);
-		return false;
-	}
-
-	if(pi->vaddr != encode_pointer(iov->iov_base)) {
-		pr_err("IOV start %p does not match requested addr %"PRIx64"\n",
-		       iov->iov_base, pi->vaddr);
-		return false;
-	}
-
-	return true;
-}
-
 static int page_server_get_pages(int sk, struct page_server_iov *pi)
 {
 	struct pstree_item *item;
 	struct page_pipe *pp;
-	struct page_pipe_buf *ppb;
-	struct iovec *iov;
+	unsigned long len;
 	int ret;
 
 	item = pstree_item_by_virt(pi->dst_id);
 	pp = dmpi(item)->mem_pp;
 
-	ret = page_pipe_split(pp, pi->vaddr, &pi->nr_pages);
+	ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr,
+			     &pi->nr_pages, PPB_LAZY);
 	if (ret)
 		return ret;
 
@@ -699,23 +683,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
 		return send_psi(sk, PS_IOV_ZERO, 0, 0, 0);
 	}
 
-	ppb = list_first_entry(&pp->bufs, struct page_pipe_buf, l);
-	iov = &ppb->iov[0];
-
-	if (!can_send_pages(ppb, iov, pi))
-		return -1;
+	len = pi->nr_pages * PAGE_SIZE;
 
 	if (send_psi(sk, PS_IOV_ADD, pi->nr_pages, pi->vaddr, pi->dst_id))
 		return -1;
 
-	ret = splice(ppb->p[0], NULL, sk, NULL, iov->iov_len, SPLICE_F_MOVE);
-	if (ret != iov->iov_len)
+	ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE);
+	if (ret != len)
 		return -1;
 
 	tcp_nodelay(sk, true);
 
-	page_pipe_destroy_ppb(ppb);
-
 	return 0;
 }
 
@@ -723,8 +701,9 @@ static int page_server_serve(int sk)
 {
 	int ret = -1;
 	bool flushed = false;
+	bool receiving_pages = !opts.lazy_pages;
 
-	if (!opts.lazy_pages) {
+	if (receiving_pages) {
 		/*
 		 * This socket only accepts data except one thing -- it
 		 * writes back the has_parent bit from time to time, so
@@ -741,6 +720,7 @@ static int page_server_serve(int sk)
 		cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
 		pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
 	} else {
+		pipe_read_dest_init(&pipe_read_dest);
 		tcp_cork(sk, true);
 	}
 
@@ -800,7 +780,6 @@ static int page_server_serve(int sk)
 			break;
 		}
 		case PS_IOV_GET:
-			flushed = true;
 			ret = page_server_get_pages(sk, &pi);
 			break;
 		default:
@@ -813,7 +792,7 @@ static int page_server_serve(int sk)
 			break;
 	}
 
-	if (!ret && !flushed) {
+	if (receiving_pages && !ret && !flushed) {
 		pr_err("The data were not flushed\n");
 		ret = -1;
 	}
-- 
2.7.4



More information about the CRIU mailing list