[CRIU] [PATCH 1/4] criu: mv page-read.[ch] pagemap.[ch]

Mike Rapoport rppt at linux.vnet.ibm.com
Thu Jun 23 09:46:50 PDT 2016


Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
---
 criu/Makefile.crtools    |   2 +-
 criu/cr-dedup.c          |   2 +-
 criu/cr-restore.c        |   1 -
 criu/include/page-read.h |  95 ------------
 criu/include/page-xfer.h |   2 +-
 criu/include/pagemap.h   |  95 ++++++++++++
 criu/page-read.c         | 384 -----------------------------------------------
 criu/pagemap.c           | 384 +++++++++++++++++++++++++++++++++++++++++++++++
 criu/uffd.c              |   2 +-
 9 files changed, 483 insertions(+), 484 deletions(-)
 delete mode 100644 criu/include/page-read.h
 create mode 100644 criu/include/pagemap.h
 delete mode 100644 criu/page-read.c
 create mode 100644 criu/pagemap.c

diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
index 9e4ec7a..b241bf2 100644
--- a/criu/Makefile.crtools
+++ b/criu/Makefile.crtools
@@ -40,7 +40,7 @@ obj-y			+= netfilter.o
 obj-y			+= net.o
 obj-y			+= pagemap-cache.o
 obj-y			+= page-pipe.o
-obj-y			+= page-read.o
+obj-y			+= pagemap.o
 obj-y			+= page-xfer.o
 obj-y			+= parasite-syscall.o
 obj-y			+= pie/pie-relocs.o
diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c
index 39609a7..726cec9 100644
--- a/criu/cr-dedup.c
+++ b/criu/cr-dedup.c
@@ -4,7 +4,7 @@
 #include <unistd.h>
 
 #include "crtools.h"
-#include "page-read.h"
+#include "pagemap.h"
 #include "restorer.h"
 
 #define MAX_BUNCH_SIZE 256
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index e0b19fe..c8418b1 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -60,7 +60,6 @@
 #include "tty.h"
 #include "cpu.h"
 #include "file-lock.h"
-#include "page-read.h"
 #include "vdso.h"
 #include "stats.h"
 #include "tun.h"
diff --git a/criu/include/page-read.h b/criu/include/page-read.h
deleted file mode 100644
index 4c6d21e..0000000
--- a/criu/include/page-read.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef __CR_PAGE_READ_H__
-#define __CR_PAGE_READ_H__
-
-#include "images/pagemap.pb-c.h"
-
-/*
- * page_read -- engine, that reads pages from image file(s)
- *
- * Several page-read's can be arranged in a chain to read
- * pages from a series of snapshot.
- *
- * A task's address space vs pagemaps+page image pairs can
- * look like this (taken from comment in page-pipe.h):
- *
- * task:
- *
- *       0  0  0    0      1    1    1
- *       0  3  6    B      2    7    C
- *       ---+++-----+++++++-----+++++----
- * pm1:  ---+++-----++++++-------++++----
- * pm2:  ---==+-----====+++-----++===----
- *
- * Here + is present page, - is non prsent, = is present,
- * but is not modified from last snapshot.
- *
- * Thus pagemap.img and pages.img entries are
- *
- * pm1:  03:3,0B:6,18:4
- * pm2:  03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P
- *
- * where P means "page is in parent pagemap".
- *
- * pg1:  03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B
- * pg2:  05,0F,10,11,17,18
- *
- * When trying to restore from these 4 files we'd have
- * to carefull scan pagemap.img's one by one and read or
- * skip pages from pages.img where appropriate.
- *
- * All this is implemented in read_pagemap_page.
- */
-
-struct page_read {
-	/*
-	 * gets next vaddr:len pair to work on.
-	 * Pagemap entries should be returned in sorted order.
-	 */
-	int (*get_pagemap)(struct page_read *, struct iovec *iov);
-	/* reads page from current pagemap */
-	int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *);
-	/* stop working on current pagemap */
-	void (*put_pagemap)(struct page_read *);
-	void (*close)(struct page_read *);
-	void (*skip_pages)(struct page_read *, unsigned long len);
-	int (*seek_page)(struct page_read *pr, unsigned long vaddr, bool warn);
-
-	/* Private data of reader */
-	struct cr_img *pmi;
-	struct cr_img *pi;
-
-	PagemapEntry *pe;		/* current pagemap we are on */
-	struct page_read *parent;	/* parent pagemap (if ->in_parent
-					   pagemap is met in image, then
-					   go to this guy for page, see
-					   read_pagemap_page */
-	unsigned long cvaddr;		/* vaddr we are on */
-
-	struct iovec bunch;		/* record consequent neighbour
-					   iovecs to punch together */
-	unsigned id; /* for logging */
-
-	PagemapEntry **pmes;
-	int nr_pmes;
-	int curr_pme;
-};
-
-#define PR_SHMEM	0x1
-#define PR_TASK		0x2
-
-#define PR_TYPE_MASK	0x3
-#define PR_MOD		0x4	/* Will need to modify */
-
-/*
- * -1 -- error
- *  0 -- no images
- *  1 -- opened
- */
-extern int open_page_read(int pid, struct page_read *, int pr_flags);
-extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags);
-extern void pagemap2iovec(PagemapEntry *pe, struct iovec *iov);
-extern void iovec2pagemap(struct iovec *iov, PagemapEntry *pe);
-
-extern int dedup_one_iovec(struct page_read *pr, struct iovec *iov);
-extern int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup);
-#endif /* __CR_PAGE_READ_H__ */
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index 8492daa..d19671b 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -1,6 +1,6 @@
 #ifndef __CR_PAGE_XFER__H__
 #define __CR_PAGE_XFER__H__
-#include "page-read.h"
+#include "pagemap.h"
 
 extern int cr_page_server(bool daemon_mode, int cfd);
 
diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
new file mode 100644
index 0000000..4c6d21e
--- /dev/null
+++ b/criu/include/pagemap.h
@@ -0,0 +1,95 @@
+#ifndef __CR_PAGE_READ_H__
+#define __CR_PAGE_READ_H__
+
+#include "images/pagemap.pb-c.h"
+
+/*
+ * page_read -- engine, that reads pages from image file(s)
+ *
+ * Several page-read's can be arranged in a chain to read
+ * pages from a series of snapshot.
+ *
+ * A task's address space vs pagemaps+page image pairs can
+ * look like this (taken from comment in page-pipe.h):
+ *
+ * task:
+ *
+ *       0  0  0    0      1    1    1
+ *       0  3  6    B      2    7    C
+ *       ---+++-----+++++++-----+++++----
+ * pm1:  ---+++-----++++++-------++++----
+ * pm2:  ---==+-----====+++-----++===----
+ *
+ * Here + is a present page, - is a non-present one, = is present,
+ * but not modified since the last snapshot.
+ *
+ * Thus pagemap.img and pages.img entries are
+ *
+ * pm1:  03:3,0B:6,18:4
+ * pm2:  03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P
+ *
+ * where P means "page is in parent pagemap".
+ *
+ * pg1:  03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B
+ * pg2:  05,0F,10,11,17,18
+ *
+ * When trying to restore from these 4 files we'd have
+ * to carefully scan the pagemap.img's one by one and read
+ * or skip pages from pages.img where appropriate.
+ *
+ * All this is implemented in read_pagemap_page.
+ */
+
+struct page_read {
+	/*
+	 * Gets next vaddr:len pair to work on (1 -- got, 0 -- none left,
+	 * -1 -- error). Entries should be returned in sorted order.
+	 */
+	int (*get_pagemap)(struct page_read *, struct iovec *iov);
+	/* reads nr pages starting at vaddr from the current pagemap */
+	int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *);
+	/* stop working on the current pagemap and step to the next one */
+	void (*put_pagemap)(struct page_read *);
+	void (*close)(struct page_read *);
+	void (*skip_pages)(struct page_read *, unsigned long len);
+	int (*seek_page)(struct page_read *pr, unsigned long vaddr, bool warn);
+
+	/* Private data of reader */
+	struct cr_img *pmi;		/* pagemap image */
+	struct cr_img *pi;		/* pages image */
+
+	PagemapEntry *pe;		/* current pagemap we are on */
+	struct page_read *parent;	/* parent pagemap (if ->in_parent
+					   pagemap is met in image, then
+					   go to this guy for page, see
+					   read_pagemap_page) */
+	unsigned long cvaddr;		/* vaddr we are on */
+
+	struct iovec bunch;		/* record consecutive neighbouring
+					   iovecs to punch together */
+	unsigned id; /* for logging */
+
+	PagemapEntry **pmes;		/* all entries read from pmi */
+	int nr_pmes;			/* number of entries in pmes */
+	int curr_pme;			/* index in pmes of the current entry */
+};
+
+#define PR_SHMEM	0x1
+#define PR_TASK		0x2
+
+#define PR_TYPE_MASK	0x3
+#define PR_MOD		0x4	/* Will need to modify */
+
+/*
+ * -1 -- error
+ *  0 -- no images
+ *  1 -- opened
+ */
+extern int open_page_read(int pid, struct page_read *, int pr_flags);
+extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags);
+extern void pagemap2iovec(PagemapEntry *pe, struct iovec *iov);
+extern void iovec2pagemap(struct iovec *iov, PagemapEntry *pe);
+
+extern int dedup_one_iovec(struct page_read *pr, struct iovec *iov);
+extern int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup);
+#endif /* __CR_PAGE_READ_H__ */
diff --git a/criu/page-read.c b/criu/page-read.c
deleted file mode 100644
index 9a02a5d..0000000
--- a/criu/page-read.c
+++ /dev/null
@@ -1,384 +0,0 @@
-#include <fcntl.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include "image.h"
-#include "cr_options.h"
-#include "servicefd.h"
-#include "page-read.h"
-
-#include "protobuf.h"
-#include "images/pagemap.pb-c.h"
-
-#ifndef SEEK_DATA
-#define SEEK_DATA	3
-#define SEEK_HOLE	4
-#endif
-
-void pagemap2iovec(PagemapEntry *pe, struct iovec *iov)
-{
-	iov->iov_base = decode_pointer(pe->vaddr);
-	iov->iov_len = pe->nr_pages * PAGE_SIZE;
-}
-
-void iovec2pagemap(struct iovec *iov, PagemapEntry *pe)
-{
-	pe->vaddr = encode_pointer(iov->iov_base);
-	pe->nr_pages = iov->iov_len / PAGE_SIZE;
-}
-
-static int get_pagemap(struct page_read *pr, struct iovec *iov)
-{
-	PagemapEntry *pe;
-
-	if (pr->curr_pme >= pr->nr_pmes)
-		return 0;
-
-	pe = pr->pmes[pr->curr_pme];
-
-	pagemap2iovec(pe, iov);
-
-	pr->pe = pe;
-	pr->cvaddr = (unsigned long)iov->iov_base;
-
-	if (pe->in_parent && !pr->parent) {
-		pr_err("No parent for snapshot pagemap\n");
-		return -1;
-	}
-
-	return 1;
-}
-
-static void put_pagemap(struct page_read *pr)
-{
-	pr->curr_pme++;
-}
-
-static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
-{
-	if (!len)
-		return;
-
-	pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
-	if (!pr->pe->in_parent)
-		lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
-	pr->cvaddr += len;
-}
-
-static int seek_pagemap_page(struct page_read *pr, unsigned long vaddr,
-			     bool warn)
-{
-	int ret;
-	struct iovec iov;
-
-	if (pr->pe)
-		pagemap2iovec(pr->pe, &iov);
-	else
-		goto new_pagemap;
-
-	while (1) {
-		unsigned long iov_end;
-
-		if (vaddr < pr->cvaddr) {
-			if (warn)
-				pr_err("Missing %lx in parent pagemap, current iov: base=%lx,len=%zu\n",
-					vaddr, (unsigned long)iov.iov_base, iov.iov_len);
-			return 0;
-		}
-		iov_end = (unsigned long)iov.iov_base + iov.iov_len;
-
-		if (iov_end <= vaddr) {
-			skip_pagemap_pages(pr, iov_end - pr->cvaddr);
-			put_pagemap(pr);
-new_pagemap:
-			ret = get_pagemap(pr, &iov);
-			if (ret <= 0)
-				return ret;
-
-			continue;
-		}
-
-		skip_pagemap_pages(pr, vaddr - pr->cvaddr);
-		return 1;
-	}
-}
-
-static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr)
-{
-	if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) {
-		pr_err("Page read err %"PRIx64":%u vs %lx:%u\n",
-				pe->vaddr, pe->nr_pages, vaddr, nr);
-		BUG();
-	}
-}
-
-static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf)
-{
-	int ret;
-	unsigned long len = nr * PAGE_SIZE;
-
-	pr_info("pr%u Read %lx %u pages\n", pr->id, vaddr, nr);
-	pagemap_bound_check(pr->pe, vaddr, nr);
-
-	if (pr->pe->in_parent) {
-		struct page_read *ppr = pr->parent;
-
-		/*
-		 * Parent pagemap at this point entry may be shorter
-		 * than the current vaddr:nr needs, so we have to
-		 * carefully 'split' the vaddr:nr into pieces and go
-		 * to parent page-read with the longest requests it
-		 * can handle.
-		 */
-
-		do {
-			int p_nr;
-
-			pr_debug("\tpr%u Read from parent\n", pr->id);
-			ret = seek_pagemap_page(ppr, vaddr, true);
-			if (ret <= 0)
-				return -1;
-
-			/*
-			 * This is how many pages we have in the parent
-			 * page_read starting from vaddr. Go ahead and
-			 * read as much as we can.
-			 */
-			p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE;
-			pr_info("\tparent has %u pages in\n", p_nr);
-			if (p_nr > nr)
-				p_nr = nr;
-
-			ret = read_pagemap_page(ppr, vaddr, p_nr, buf);
-			if (ret == -1)
-				return ret;
-
-			/*
-			 * OK, let's see how much data we have left and go
-			 * to parent page-read again for the next pagemap
-			 * entry.
-			 */
-			nr -= p_nr;
-			vaddr += p_nr * PAGE_SIZE;
-			buf += p_nr * PAGE_SIZE;
-		} while (nr);
-	} else {
-		int fd = img_raw_fd(pr->pi);
-		off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
-
-		pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, current_vaddr);
-		ret = read(fd, buf, len);
-		if (ret != len) {
-			pr_perror("Can't read mapping page %d", ret);
-			return -1;
-		}
-
-		if (opts.auto_dedup) {
-			ret = punch_hole(pr, current_vaddr, len, false);
-			if (ret == -1) {
-				return -1;
-			}
-		}
-	}
-
-	pr->cvaddr += len;
-
-	return 1;
-}
-
-static void free_pagemaps(struct page_read *pr)
-{
-	int i;
-
-	for (i = 0; i < pr->nr_pmes; i++)
-		pagemap_entry__free_unpacked(pr->pmes[i], NULL);
-
-	xfree(pr->pmes);
-}
-
-static void close_page_read(struct page_read *pr)
-{
-	int ret;
-
-	if (pr->bunch.iov_len > 0) {
-		ret = punch_hole(pr, 0, 0, true);
-		if (ret == -1)
-			return;
-
-		pr->bunch.iov_len = 0;
-	}
-
-	if (pr->parent) {
-		close_page_read(pr->parent);
-		xfree(pr->parent);
-	}
-
-	if (pr->pmi)
-		close_image(pr->pmi);
-	if (pr->pi)
-		close_image(pr->pi);
-
-	if (pr->pmes)
-		free_pagemaps(pr);
-}
-
-static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
-{
-	int pfd, ret;
-	struct page_read *parent = NULL;
-
-	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
-	if (pfd < 0 && errno == ENOENT)
-		goto out;
-
-	parent = xmalloc(sizeof(*parent));
-	if (!parent)
-		goto err_cl;
-
-	ret = open_page_read_at(pfd, pid, parent, pr_flags);
-	if (ret < 0)
-		goto err_free;
-
-	if (!ret) {
-		xfree(parent);
-		parent = NULL;
-	}
-
-	close(pfd);
-out:
-	pr->parent = parent;
-	return 0;
-
-err_free:
-	xfree(parent);
-err_cl:
-	close(pfd);
-	return -1;
-}
-
-/*
- * The pagemap entry size is at least 8 bytes for small mappings with
- * low address and may get to 18 bytes or even more for large mappings
- * with high address and in_parent flag set. 16 seems to be nice round
- * number to minimize {over,under}-allocations
- */
-#define PAGEMAP_ENTRY_SIZE_ESTIMATE 16
-
-static int init_pagemaps(struct page_read *pr)
-{
-	off_t fsize;
-	int nr_pmes, nr_realloc;
-
-	fsize = img_raw_size(pr->pmi);
-	if (fsize < 0)
-		return -1;
-
-	nr_pmes = fsize / PAGEMAP_ENTRY_SIZE_ESTIMATE + 1;
-	nr_realloc = nr_pmes / 2;
-
-	pr->pmes = xzalloc(nr_pmes * sizeof(*pr->pmes));
-	if (!pr->pmes)
-		return -1;
-
-	pr->nr_pmes = pr->curr_pme = 0;
-
-	while (1) {
-		int ret = pb_read_one_eof(pr->pmi, &pr->pmes[pr->nr_pmes],
-					  PB_PAGEMAP);
-		if (ret < 0)
-			goto free_pagemaps;
-		if (ret == 0)
-			break;
-
-		pr->nr_pmes++;
-		if (pr->nr_pmes >= nr_pmes) {
-			nr_pmes += nr_realloc;
-			pr->pmes = xrealloc(pr->pmes,
-					    nr_pmes * sizeof(*pr->pmes));
-			if (!pr->pmes)
-				goto free_pagemaps;
-		}
-	}
-
-	return 0;
-
-	close_image(pr->pmi);
-	pr->pmi = NULL;
-
-free_pagemaps:
-	free_pagemaps(pr);
-	return -1;
-}
-
-int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
-{
-	int flags, i_typ;
-	static unsigned ids = 1;
-
-	if (opts.auto_dedup)
-		pr_flags |= PR_MOD;
-	if (pr_flags & PR_MOD)
-		flags = O_RDWR;
-	else
-		flags = O_RSTR;
-
-	switch (pr_flags & PR_TYPE_MASK) {
-	case PR_TASK:
-		i_typ = CR_FD_PAGEMAP;
-		break;
-	case PR_SHMEM:
-		i_typ = CR_FD_SHMEM_PAGEMAP;
-		break;
-	default:
-		BUG();
-		return -1;
-	}
-
-	pr->pe = NULL;
-	pr->parent = NULL;
-	pr->bunch.iov_len = 0;
-	pr->bunch.iov_base = NULL;
-
-	pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
-	if (!pr->pmi)
-		return -1;
-
-	if (empty_image(pr->pmi)) {
-		close_image(pr->pmi);
-		return 0;
-	}
-
-	if ((i_typ != CR_FD_SHMEM_PAGEMAP) && try_open_parent(dfd, pid, pr, pr_flags)) {
-		close_image(pr->pmi);
-		return -1;
-	}
-
-	pr->pi = open_pages_image_at(dfd, flags, pr->pmi);
-	if (!pr->pi) {
-		close_page_read(pr);
-		return -1;
-	}
-
-	if (init_pagemaps(pr)) {
-		close_page_read(pr);
-		return -1;
-	}
-
-	pr->get_pagemap = get_pagemap;
-	pr->put_pagemap = put_pagemap;
-	pr->read_pages = read_pagemap_page;
-	pr->close = close_page_read;
-	pr->skip_pages = skip_pagemap_pages;
-	pr->seek_page = seek_pagemap_page;
-	pr->id = ids++;
-
-	pr_debug("Opened page read %u (parent %u)\n",
-			pr->id, pr->parent ? pr->parent->id : 0);
-
-	return 1;
-}
-
-int open_page_read(int pid, struct page_read *pr, int pr_flags)
-{
-	return open_page_read_at(get_service_fd(IMG_FD_OFF), pid, pr, pr_flags);
-}
diff --git a/criu/pagemap.c b/criu/pagemap.c
new file mode 100644
index 0000000..278eb77
--- /dev/null
+++ b/criu/pagemap.c
@@ -0,0 +1,384 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "image.h"
+#include "cr_options.h"
+#include "servicefd.h"
+#include "pagemap.h"
+
+#include "protobuf.h"
+#include "images/pagemap.pb-c.h"
+
+#ifndef SEEK_DATA
+#define SEEK_DATA	3
+#define SEEK_HOLE	4
+#endif
+
+void pagemap2iovec(PagemapEntry *pe, struct iovec *iov)
+{
+	iov->iov_base = decode_pointer(pe->vaddr);
+	iov->iov_len = pe->nr_pages * PAGE_SIZE;
+}
+
+void iovec2pagemap(struct iovec *iov, PagemapEntry *pe)
+{
+	pe->vaddr = encode_pointer(iov->iov_base);
+	pe->nr_pages = iov->iov_len / PAGE_SIZE;
+}
+
+static int get_pagemap(struct page_read *pr, struct iovec *iov)
+{
+	PagemapEntry *pe;
+
+	if (pr->curr_pme >= pr->nr_pmes)
+		return 0;
+
+	pe = pr->pmes[pr->curr_pme];
+
+	pagemap2iovec(pe, iov);
+
+	pr->pe = pe;
+	pr->cvaddr = (unsigned long)iov->iov_base;
+
+	if (pe->in_parent && !pr->parent) {
+		pr_err("No parent for snapshot pagemap\n");
+		return -1;
+	}
+
+	return 1;
+}
+
+static void put_pagemap(struct page_read *pr)
+{
+	pr->curr_pme++;
+}
+
+static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
+{
+	if (!len)
+		return;
+
+	pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
+	if (!pr->pe->in_parent)
+		lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
+	pr->cvaddr += len;
+}
+
+static int seek_pagemap_page(struct page_read *pr, unsigned long vaddr,
+			     bool warn)
+{
+	int ret;
+	struct iovec iov;
+
+	if (pr->pe)
+		pagemap2iovec(pr->pe, &iov);
+	else
+		goto new_pagemap;
+
+	while (1) {
+		unsigned long iov_end;
+
+		if (vaddr < pr->cvaddr) {
+			if (warn)
+				pr_err("Missing %lx in parent pagemap, current iov: base=%lx,len=%zu\n",
+					vaddr, (unsigned long)iov.iov_base, iov.iov_len);
+			return 0;
+		}
+		iov_end = (unsigned long)iov.iov_base + iov.iov_len;
+
+		if (iov_end <= vaddr) {
+			skip_pagemap_pages(pr, iov_end - pr->cvaddr);
+			put_pagemap(pr);
+new_pagemap:
+			ret = get_pagemap(pr, &iov);
+			if (ret <= 0)
+				return ret;
+
+			continue;
+		}
+
+		skip_pagemap_pages(pr, vaddr - pr->cvaddr);
+		return 1;
+	}
+}
+
+static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr)
+{
+	if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) {
+		pr_err("Page read err %"PRIx64":%u vs %lx:%u\n",
+				pe->vaddr, pe->nr_pages, vaddr, nr);
+		BUG();
+	}
+}
+
+static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf)
+{
+	int ret;
+	unsigned long len = nr * PAGE_SIZE;
+
+	pr_info("pr%u Read %lx %u pages\n", pr->id, vaddr, nr);
+	pagemap_bound_check(pr->pe, vaddr, nr);
+
+	if (pr->pe->in_parent) {
+		struct page_read *ppr = pr->parent;
+
+		/*
+		 * Parent pagemap at this point entry may be shorter
+		 * than the current vaddr:nr needs, so we have to
+		 * carefully 'split' the vaddr:nr into pieces and go
+		 * to parent page-read with the longest requests it
+		 * can handle.
+		 */
+
+		do {
+			int p_nr;
+
+			pr_debug("\tpr%u Read from parent\n", pr->id);
+			ret = seek_pagemap_page(ppr, vaddr, true);
+			if (ret <= 0)
+				return -1;
+
+			/*
+			 * This is how many pages we have in the parent
+			 * page_read starting from vaddr. Go ahead and
+			 * read as much as we can.
+			 */
+			p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE;
+			pr_info("\tparent has %u pages in\n", p_nr);
+			if (p_nr > nr)
+				p_nr = nr;
+
+			ret = read_pagemap_page(ppr, vaddr, p_nr, buf);
+			if (ret == -1)
+				return ret;
+
+			/*
+			 * OK, let's see how much data we have left and go
+			 * to parent page-read again for the next pagemap
+			 * entry.
+			 */
+			nr -= p_nr;
+			vaddr += p_nr * PAGE_SIZE;
+			buf += p_nr * PAGE_SIZE;
+		} while (nr);
+	} else {
+		int fd = img_raw_fd(pr->pi);
+		off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
+
+		pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, current_vaddr);
+		ret = read(fd, buf, len);
+		if (ret != len) {
+			pr_perror("Can't read mapping page %d", ret);
+			return -1;
+		}
+
+		if (opts.auto_dedup) {
+			ret = punch_hole(pr, current_vaddr, len, false);
+			if (ret == -1) {
+				return -1;
+			}
+		}
+	}
+
+	pr->cvaddr += len;
+
+	return 1;
+}
+
+static void free_pagemaps(struct page_read *pr)
+{
+	int i;
+
+	for (i = 0; i < pr->nr_pmes; i++)
+		pagemap_entry__free_unpacked(pr->pmes[i], NULL);
+
+	xfree(pr->pmes);
+}
+
+static void close_page_read(struct page_read *pr)
+{
+	int ret;
+
+	if (pr->bunch.iov_len > 0) {
+		ret = punch_hole(pr, 0, 0, true);
+		if (ret == -1)
+			return;
+
+		pr->bunch.iov_len = 0;
+	}
+
+	if (pr->parent) {
+		close_page_read(pr->parent);
+		xfree(pr->parent);
+	}
+
+	if (pr->pmi)
+		close_image(pr->pmi);
+	if (pr->pi)
+		close_image(pr->pi);
+
+	if (pr->pmes)
+		free_pagemaps(pr);
+}
+
+static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
+{
+	int pfd, ret;
+	struct page_read *parent = NULL;
+
+	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
+	if (pfd < 0 && errno == ENOENT)
+		goto out;
+
+	parent = xmalloc(sizeof(*parent));
+	if (!parent)
+		goto err_cl;
+
+	ret = open_page_read_at(pfd, pid, parent, pr_flags);
+	if (ret < 0)
+		goto err_free;
+
+	if (!ret) {
+		xfree(parent);
+		parent = NULL;
+	}
+
+	close(pfd);
+out:
+	pr->parent = parent;
+	return 0;
+
+err_free:
+	xfree(parent);
+err_cl:
+	close(pfd);
+	return -1;
+}
+
+/*
+ * The pagemap entry size is at least 8 bytes for small mappings with
+ * low address and may get to 18 bytes or even more for large mappings
+ * with high address and in_parent flag set. 16 seems to be nice round
+ * number to minimize {over,under}-allocations
+ */
+#define PAGEMAP_ENTRY_SIZE_ESTIMATE 16
+
+static int init_pagemaps(struct page_read *pr)
+{
+	/* Reads every entry of pr->pmi into pr->pmes; 0 on success, -1 on error */
+	off_t fsize = img_raw_size(pr->pmi);
+	int nr_pmes, nr_realloc;
+
+	if (fsize < 0)
+		return -1;
+
+	nr_pmes = fsize / PAGEMAP_ENTRY_SIZE_ESTIMATE + 1;
+	nr_realloc = nr_pmes / 2;
+
+	pr->pmes = xzalloc(nr_pmes * sizeof(*pr->pmes));
+	if (!pr->pmes)
+		return -1;
+
+	pr->nr_pmes = pr->curr_pme = 0;
+
+	while (1) {
+		int ret = pb_read_one_eof(pr->pmi, &pr->pmes[pr->nr_pmes],
+					  PB_PAGEMAP);
+		if (ret < 0)
+			goto free_pagemaps;
+		if (ret == 0)
+			break;
+
+		if (++pr->nr_pmes >= nr_pmes) {
+			PagemapEntry **pmes = xrealloc(pr->pmes,
+					(nr_pmes + nr_realloc) * sizeof(*pmes));
+			if (!pmes)	/* pr->pmes is intact and gets freed below */
+				goto free_pagemaps;
+			pr->pmes = pmes;
+			nr_pmes += nr_realloc;
+		}
+	}
+
+	close_image(pr->pmi);	/* all entries are in memory by now */
+	pr->pmi = NULL;
+	return 0;
+
+free_pagemaps:
+	free_pagemaps(pr);
+	pr->pmes = NULL;	/* so that close_page_read() doesn't free it again */
+	return -1;
+}
+
+int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
+{
+	/* Returns 1 when opened, 0 when the pagemap image is empty, -1 on error */
+	int flags, i_typ;
+	static unsigned ids = 1;
+
+	if (opts.auto_dedup)
+		pr_flags |= PR_MOD;
+	flags = (pr_flags & PR_MOD) ? O_RDWR : O_RSTR;
+
+	switch (pr_flags & PR_TYPE_MASK) {
+	case PR_TASK:
+		i_typ = CR_FD_PAGEMAP;
+		break;
+	case PR_SHMEM:
+		i_typ = CR_FD_SHMEM_PAGEMAP;
+		break;
+	default:
+		BUG();
+		return -1;
+	}
+
+	/* close_page_read() on the error paths below looks at all of these */
+	pr->pe = NULL;
+	pr->parent = NULL;
+	pr->pi = NULL;
+	pr->pmes = NULL;
+	pr->bunch.iov_len = 0;
+	pr->bunch.iov_base = NULL;
+
+	pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
+	if (!pr->pmi)
+		return -1;
+
+	if (empty_image(pr->pmi)) {
+		close_image(pr->pmi);
+		return 0;
+	}
+
+	if ((i_typ != CR_FD_SHMEM_PAGEMAP) && try_open_parent(dfd, pid, pr, pr_flags)) {
+		close_image(pr->pmi);
+		return -1;
+	}
+
+	pr->pi = open_pages_image_at(dfd, flags, pr->pmi);
+	if (!pr->pi) {
+		close_page_read(pr);
+		return -1;
+	}
+
+	if (init_pagemaps(pr)) {
+		close_page_read(pr);
+		return -1;
+	}
+
+	pr->get_pagemap = get_pagemap;
+	pr->put_pagemap = put_pagemap;
+	pr->read_pages = read_pagemap_page;
+	pr->close = close_page_read;
+	pr->skip_pages = skip_pagemap_pages;
+	pr->seek_page = seek_pagemap_page;
+	pr->id = ids++;
+
+	pr_debug("Opened page read %u (parent %u)\n", pr->id, pr->parent ? pr->parent->id : 0);
+
+	return 1;
+}
+
+int open_page_read(int pid, struct page_read *pr, int pr_flags)
+{
+	return open_page_read_at(get_service_fd(IMG_FD_OFF), pid, pr, pr_flags);
+}
diff --git a/criu/uffd.c b/criu/uffd.c
index 46d9be4..f310ddd 100644
--- a/criu/uffd.c
+++ b/criu/uffd.c
@@ -21,7 +21,7 @@
 #include "asm/page.h"
 #include "log.h"
 #include "criu-plugin.h"
-#include "page-read.h"
+#include "pagemap.h"
 #include "files-reg.h"
 #include "kerndat.h"
 #include "mem.h"
-- 
1.9.1



More information about the CRIU mailing list