[CRIU] [PATCH v2 5/5] UFFD: Support lazy-pages restore between two hosts

Adrian Reber adrian at lisas.de
Thu Mar 24 08:52:54 PDT 2016


From: Adrian Reber <areber at redhat.com>

This enhances lazy-pages mode to work with two different hosts. Instead
of lazily restoring a process on the same host, this makes it possible to
keep the memory pages on the source system and transfer them to the
destination system only on demand.

The previous, single-host lazy restore consisted of two processes:

 criu restore --lazy-pages --address /path/to/unix-domain-socket

and

 criu lazy-pages --address /path/to/unix-domain-socket

The unix domain socket was used to transfer the userfault FD (UFFD) from
the 'criu restore' process to the 'criu lazy-pages' process. The 'criu
lazy-pages' process then listened on the UFFD for userfaultfd messages.
For each message it retrieved the requested memory page from the
checkpoint directory and transferred that page into the process being
restored.

This commit introduces the ability to keep the pages on the remote host
and only request the transfer of the required pages over TCP on demand.
Therefore criu needs to be started differently than previously.

Host1:

   criu restore --lazy-pages --address /path/to/unix-domain-socket

  and

   criu lazy-pages --address /path/to/unix-domain-socket \
   --lazy-client ADDR-Host2 --port 27

Host2:

   criu lazy-pages --lazy-server --port 27

On Host1 the process is now restored (as criu always does) except that
the memory pages are not read from pages.img and that the appropriate
pages are marked as being userfaultfd handled. As soon as the restored
process tries to access one of the pages, a UFFD MSG is received by the
lazy-client (on Host1). This UFFD MSG is then transferred via TCP to the
lazy-server (on Host2). The lazy-server retrieves the memory page from
the local checkpoint and returns a UFFDIO COPY answer back to the
lazy-client, which can then forward this message to the local UFFD, which
inserts the page into the restored process.

The remote lazy restore behaves like the local lazy restore: if no more
UFFD MSGs are received on the socket for 5 seconds, it switches to the
copy-remaining-pages mode, in which all pages not yet requested via UFFD
are transferred into the restored process.

TODO:
  * Create from the checkpoint directory a checkpoint without the memory
    pages which are UFFD handled. This would enable a real UFFD remote
    restore where the UFFD pages do not need to be transferred to the
    destination host.

Signed-off-by: Adrian Reber <areber at redhat.com>
---
 criu/uffd.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 240 insertions(+), 29 deletions(-)

diff --git a/criu/uffd.c b/criu/uffd.c
index 2d82d46..1c3c3a6 100644
--- a/criu/uffd.c
+++ b/criu/uffd.c
@@ -7,12 +7,16 @@
 #include <fcntl.h>
 #include <string.h>
 #include <time.h>
+#include <arpa/inet.h>
+#include <netinet/ip.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <sys/un.h>
 #include <sys/socket.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
 
 #include "asm/page.h"
 #include "include/log.h"
@@ -242,41 +246,83 @@ struct uffd_pages_struct {
 	int flags;
 };
 
+struct remote_uffd {
+	struct uffdio_copy uffdio_copy;
+	int remaining;
+};
+
+static unsigned long vdso_pages;
 static unsigned long total_pages;
 static unsigned long uffd_copied_pages;
 
-static int uffd_copy_page(int uffd, __u64 address, void *dest)
+static void page_copied_status()
 {
-	struct uffdio_copy uffdio_copy;
+	/*
+	 * In lazy_client mode the number of total pages is unknown.
+	 * Still print out some status message how many pages have already
+	 * been transferred.
+	 */
+	if (opts.lazy_client)
+		pr_debug("With UFFD transferred pages: (%ld)\n", uffd_copied_pages);
+	else
+		pr_debug("With UFFD transferred pages: (%ld/%ld)\n", uffd_copied_pages,
+			 total_pages);
+}
+
+static int uffd_copy_page(int uffd, __u64 address, void *dest, int remaining)
+{
+	struct remote_uffd remote_uffd;
+	struct stat statbuf;
 	int rc;
 
-	rc = get_page(address, dest);
-	if (rc <= 0)
+	if (fstat(uffd, &statbuf)) {
+		pr_perror("fstat of FD failed: ");
 		return -1;
+	}
 
-	uffdio_copy.dst = address;
-	uffdio_copy.src = (unsigned long) dest;
-	uffdio_copy.len = page_size();
-	uffdio_copy.mode = 0;
-	uffdio_copy.copy = 0;
+	if (!opts.lazy_client) {
+		rc = get_page(address, dest);
+		if (rc <= 0)
+			return -1;
+	}
 
-	pr_debug("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
-	rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
+	remote_uffd.uffdio_copy.dst = address;
+	remote_uffd.uffdio_copy.src = (unsigned long) dest;
+	remote_uffd.uffdio_copy.len = page_size();
+	remote_uffd.uffdio_copy.mode = 0;
+	remote_uffd.uffdio_copy.copy = 0;
+	remote_uffd.remaining = remaining;
+
+	pr_debug("uffdio_copy.dst 0x%llx\n", remote_uffd.uffdio_copy.dst);
+	if (S_ISSOCK(statbuf.st_mode)) {
+		pr_debug("FD is a socket\n");
+		if (write(uffd, &remote_uffd, sizeof(remote_uffd)) != sizeof(remote_uffd)) {
+			pr_perror("Can't write to lazy client");
+			return -1;
+		}
+		if (write(uffd, dest, remote_uffd.uffdio_copy.len) != remote_uffd.uffdio_copy.len) {
+			pr_perror("Can't write to lazy client");
+			return -1;
+		}
+		return sizeof(remote_uffd.uffdio_copy);
+	} else {
+		rc = ioctl(uffd, UFFDIO_COPY, &remote_uffd.uffdio_copy);
+	}
 	pr_debug("ioctl UFFDIO_COPY rc 0x%x\n", rc);
-	pr_debug("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
+	pr_debug("uffdio_copy.copy 0x%llx\n", remote_uffd.uffdio_copy.copy);
 	if (rc) {
-		/* real retval in ufdio_copy.copy */
-		if (uffdio_copy.copy != -EEXIST) {
-			pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
+		/* real retval in remote_uffd.ufdio_copy.copy */
+		if (remote_uffd.uffdio_copy.copy != -EEXIST) {
+			pr_err("UFFDIO_COPY error %Ld\n", remote_uffd.uffdio_copy.copy);
 			return -1;
 		}
-	} else if (uffdio_copy.copy != page_size()) {
-		pr_err("UFFDIO_COPY unexpected size %Ld\n", uffdio_copy.copy);
+	} else if (remote_uffd.uffdio_copy.copy != page_size()) {
+		pr_err("UFFDIO_COPY unexpected size %Ld\n", remote_uffd.uffdio_copy.copy);
 		return -1;
 	}
 
 
-	return uffdio_copy.copy;
+	return remote_uffd.uffdio_copy.copy;
 
 }
 
@@ -338,8 +384,10 @@ static int collect_uffd_pages(struct page_read *pr, struct list_head *uffd_list)
 		if (!uffd_pages)
 			return -1;
 		uffd_pages->addr = base;
-		if (uffd_vdso)
+		if (uffd_vdso) {
 			uffd_pages->flags |= UFFD_FLAG_VDSO;
+			vdso_pages++;
+		}
 		list_add(&uffd_pages->list, uffd_list);
 	}
 
@@ -357,13 +405,14 @@ static int handle_remaining_pages(int uffd, struct list_head *uffd_list, void *d
 		if (uffd_pages->flags & UFFD_FLAG_SENT)
 			continue;
 
-		rc = uffd_copy_page(uffd, uffd_pages->addr, dest);
+		rc = uffd_copy_page(uffd, uffd_pages->addr, dest,
+				    total_pages - ++uffd_copied_pages);
 		if (rc < 0) {
 			pr_err("Error during UFFD copy\n");
 			return -1;
 		}
 
-		uffd_copied_pages++;
+		page_copied_status();
 		uffd_pages->flags |= UFFD_FLAG_SENT;
 	}
 
@@ -376,13 +425,15 @@ static int handle_regular_pages(int uffd, struct list_head *uffd_list, void *des
 	int rc;
 	struct uffd_pages_struct *uffd_pages;
 
-	rc = uffd_copy_page(uffd, address, dest);
+	rc = uffd_copy_page(uffd, address, dest, 0);
 	if (rc < 0) {
 		pr_err("Error during UFFD copy\n");
 		return -1;
 	}
 
 	uffd_copied_pages++;
+	page_copied_status();
+
 	/*
 	 * Mark this page as having been already transferred, so
 	 * that it has not to be copied again later.
@@ -403,12 +454,13 @@ static int handle_vdso_pages(int uffd, struct list_head *uffd_list, void *dest)
 	list_for_each_entry(uffd_pages, uffd_list, list) {
 		if (!(uffd_pages->flags & UFFD_FLAG_VDSO))
 			continue;
-		rc = uffd_copy_page(uffd, uffd_pages->addr, dest);
+		rc = uffd_copy_page(uffd, uffd_pages->addr, dest, --vdso_pages);
 		if (rc < 0) {
 			pr_err("Error during UFFD copy\n");
 			return -1;
 		}
 		uffd_copied_pages++;
+		page_copied_status();
 		uffd_pages->flags |= UFFD_FLAG_SENT;
 	}
 	return 0;
@@ -427,6 +479,9 @@ static int find_vmas(struct list_head *uffd_list)
 	struct page_read pr;
 	struct uffd_pages_struct *uffd_pages;
 
+	/* No need to scan for VMAs in lazy_client mode. */
+	if (opts.lazy_client)
+		return 0;
 
 	if (check_img_inventory() == -1)
 		return -1;
@@ -475,7 +530,7 @@ static int find_vmas(struct list_head *uffd_list)
 				vmas.priv_size += PAGE_SIZE;
 		}
 
-		pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end);
+		pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end);
 	}
 
 	ret = open_page_read(pid, &pr, PR_TASK);
@@ -509,7 +564,63 @@ out:
 	return ret;
 }
 
-static int handle_requests(int fd)
+static int receive_loop(int fd, int lazy_server, void *dest)
+{
+	struct remote_uffd remote_uffd;
+	int read_counter;
+	int rc;
+	unsigned long ps;
+
+	ps = page_size();
+
+	do {
+		/* Let's wait for the answer. Should be sizeof(remote_uffd) */
+		if (read(lazy_server, &remote_uffd, sizeof(remote_uffd)) != sizeof(remote_uffd)) {
+			pr_perror("Can't read from lazy server");
+			return -1;
+		}
+		pr_debug("from lazy server: uffdio_copy.dst 0x%llx\n", remote_uffd.uffdio_copy.dst);
+		/* And now the actual data, should be exactly a page */
+		if (remote_uffd.uffdio_copy.len != ps) {
+			pr_err("uffdio_copy.len should never be != page size (%ld)\n", ps);
+			return -1;
+		}
+		read_counter = 0;
+		while (read_counter < ps) {
+			rc = read(lazy_server, dest + read_counter, ps - read_counter);
+			if (rc == -1) {
+				pr_perror("Can't read from lazy server");
+				return -1;
+			}
+			read_counter += rc;
+			pr_debug("read_counter %d\n", read_counter);
+		}
+		rc = uffd_copy_page(fd, remote_uffd.uffdio_copy.dst, dest, 0);
+		if (rc < 0) {
+			pr_err("Error during UFFD copy\n");
+			return -1;
+		}
+		uffd_copied_pages++;
+		page_copied_status();
+		pr_debug("remote_uffd.remaining: %d\n", remote_uffd.remaining);
+	} while (remote_uffd.remaining != 0);
+
+	return 0;
+}
+
+static int handle_remote_requests(int fd, int lazy_server, struct uffd_msg *msg, void *dest)
+{
+	pr_debug("Sending userfaultfd msg to lazy server\n");
+
+	if (write(lazy_server, msg, sizeof(*msg)) != sizeof(*msg)) {
+		pr_perror("Can't write to lazy server");
+		return -1;
+	}
+
+	return receive_loop(fd, lazy_server, dest);
+}
+
+static int handle_requests(int fd, int lazy_server)
 {
 	fd_set set;
 	int ret = -1;
@@ -552,6 +663,15 @@ static int handle_requests(int fd)
 		 */
 		timeout.tv_sec = 5;
 		timeout.tv_usec = 0;
+		/*
+		 * If lazy-pages is running in lazy_server mode the timeout
+		 * needs to be longer than on the lazy_client side to make
+		 * sure the lazy_server does not end the whole thing
+		 * before the lazy_client is done/ready.
+		 */
+		if (lazy_server > 0)
+			timeout.tv_sec++;
+
 		ret = select(fd + 1, &set, NULL, NULL, &timeout);
 		pr_debug("select() rc: 0x%x\n", ret);
 		if (ret == 0) {
@@ -570,6 +690,16 @@ static int handle_requests(int fd)
 			break;
 		}
 
+
+		if (lazy_server > 0) {
+			ret = handle_remote_requests(fd, lazy_server, &msg, dest);
+			if (ret >= 0)
+				continue;
+			ret = -1;
+			goto out;
+
+		}
+
 		ret = 0;
 		/* Align requested address to the next page boundary */
 		address = msg.arg.pagefault.address & ~(ps - 1);
@@ -620,14 +750,17 @@ static int handle_requests(int fd)
 		}
 	}
 	pr_debug("Handle remaining pages\n");
-	ret = handle_remaining_pages(fd, &uffd_list, dest);
+	if (lazy_server > 0)
+		ret = receive_loop(fd, lazy_server, dest);
+	else
+		ret = handle_remaining_pages(fd, &uffd_list, dest);
 	if (ret < 0) {
 		pr_err("Error during remaining page copy\n");
 		ret = 1;
 		goto out;
 	}
 
-	pr_debug("With UFFD transferred pages: (%ld/%ld)\n", uffd_copied_pages, total_pages);
+	page_copied_status();
 	if ((uffd_copied_pages != total_pages) && (total_pages > 0)) {
 		pr_warn("Only %ld of %ld pages transferred via UFFD\n", uffd_copied_pages,
 			total_pages);
@@ -643,13 +776,82 @@ out:
 	return ret;
 
 }
+
+static int lazy_server_serve(int sk)
+{
+	int ret;
+
+	/* The "transfer protocol" is first the pid as int and then the rest */
+	ret = recv(sk, &pid, sizeof(pid), 0);
+	if (ret != sizeof(pid)) {
+		pr_perror("PID recv error:");
+		return -1;
+	}
+	pr_debug("received PID: %d\n", pid);
+
+	return handle_requests(sk, 0);
+
+}
+
+static int lazy_server()
+{
+	int sk = -1;
+	int ask = -1;
+	int ret = 0;
+
+	sk = setup_tcp_server("lazy");
+	if (sk == -1)
+		return -1;
+
+	ret = run_tcp_server(opts.daemon_mode, &ask, -1, sk);
+	if (ret != 0)
+		return ret;
+
+	if (ask >= 0)
+		ret = lazy_server_serve(ask);
+
+	if (opts.daemon_mode)
+		exit(ret);
+
+	return ret;
+}
+
+static int init_lazy_client()
+{
+	int lazy_server_sk;
+
+	lazy_server_sk = setup_tcp_client(opts.lazy_client);
+	if (lazy_server_sk == -1)
+		return -1;
+
+	/*
+	 * CORK the socket at the very beginning. As per ANK
+	 * the corked by default socket with sporadic NODELAY-s
+	 * on urgent data is the smartest mode ever.
+	 */
+	tcp_cork(lazy_server_sk, true);
+	return lazy_server_sk;
+}
+
 int uffd_listen()
 {
 	int uffd;
 	int uffd_flags;
+	int lazy_server_sk = 0;
 
 	LIST_HEAD(uffd_list);
 
+	if (opts.lazy_server)
+		return lazy_server();
+
+	if (opts.lazy_client)
+		lazy_server_sk = init_lazy_client();
+
+	if (lazy_server_sk == -1) {
+		pr_info("Connection to the lazy server failed. Exiting\n");
+		return -1;
+	}
+
 	if (!opts.addr) {
 		pr_info("Please specify a file name for the unix domain socket\n");
 		pr_info("used to communicate between the lazy-pages server\n");
@@ -660,11 +862,20 @@ int uffd_listen()
 
 	pr_debug("Waiting for incoming connections on %s\n", opts.addr);
 	if ((uffd = ud_open()) < 0)
-		exit(0);
+		return -1;
 
 	pr_debug("uffd is 0x%d\n", uffd);
 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
 	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
 
-	return handle_requests(uffd);
+	if (lazy_server_sk) {
+		/* The "transfer protocol" is first the pid as int and then the rest */
+		pr_debug("Sending PID %d\n", pid);
+		if (send(lazy_server_sk, &pid, sizeof(pid), 0) < 0) {
+			pr_perror("PID sending error:");
+			return -1;
+		}
+	}
+
+	return handle_requests(uffd, lazy_server_sk);
 }
-- 
1.8.3.1



More information about the CRIU mailing list