[CRIU] [PATCH v2 5/5] UFFD: Support lazy-pages restore between two hosts

Pavel Emelyanov xemul at virtuozzo.com
Mon Mar 28 08:28:54 PDT 2016


On 03/24/2016 06:52 PM, Adrian Reber wrote:
> From: Adrian Reber <areber at redhat.com>

Here are my comments on the patch :) I probably should have sent them earlier,
so sorry for the not-so-fast response.

> This enhances lazy-pages mode to work with two different hosts. Instead
> of lazy restoring a process on the same host this enables to keep the
> memory pages on the source system and actually only transfer the memory
> pages on demand from the source to the destination system.
> 
> The previous, single-host lazy restore consisted of two processes.
> 
>  criu restore --lazy-pages --address /path/to/unix-domain-socket
> 
> and
> 
>  criu lazy-pages --address /path/to/unix-domain-socket

I would say it's OK to have a separate command to start the lazy server.
Mike's suggestion to spawn the server automatically after restore also
makes sense and I will accept such a patch, but for debugging purposes I'd
keep the separate lazy-pages action.

> The unix domain socket was used to transfer the userfault FD (UFFD) from
> the 'criu restore' process to the 'criu lazy-pages' process. The 'criu
> lazy-pages' was then listening on the UFFD for userfaultfd messages
> which were used to retrieve the requested memory page from the
> checkpoint directory and transfer that page into the process to be
> restored.
> 
> This commit introduces the ability to keep the pages on the remote host
> and only request the transfer of the required pages over TCP on demand.
> Therefore criu needs to be started differently than previously.
> 
> Host1:
> 
>    criu restore --lazy-pages --address /path/to/unix-domain-socket
> 
>   and
> 
>    criu lazy-pages --address /path/to/unix-domain-socket \
>    --lazy-client ADDR-Host2 --port 27
> 
> Host2:
> 
>    criu lazy-pages --lazy-server --port 27

And this patch definitely requires tuning. First, as Mike noted, we already
have the code that makes dump send pages over the network -- the dump
--page-server makes this work, so for the server side I'd just extend the
dump action with the --lazy-pages option that would feed _more_ data into
page_server_xfer after dump.

For the destination side, according to https://criu.org/Userfaultfd, I planned
to see a 3rd page_read driver that gets memory from the network, and to teach
the lazy-pages action (and daemon) to use this page_read.

> On Host1 the process is now restored (as criu always does) except that
> the memory pages are not read from pages.img and that the appropriate
> pages are marked as being userfaultfd handled. As soon as the restored
> process tries to access one of the pages a UFFD MSG is received by the
> lazy-client (on Host1). This UFFD MSG is then transferred via TCP to the
> lazy-server (on Host2). The lazy-server retrieves the memory page from
> the local checkpoint and returns a UFFDIO COPY answer back to the
> lazy-client which can then forward this message to the local UFFD which
> inserts the page into the restored process.
> 
> The remote lazy restore has the same behavior as the local lazy restore
> that, if after 5 seconds no more messages are received on the socket
> waiting for UFFD MSG, it switches to copy remaining pages mode, where
> all non-UFFD-requested pages are transferred into the restored process.
> 
> TODO:
>   * Create from the checkpoint directory a checkpoint without the memory
>     pages which are UFFD handled. This would enable a real UFFD remote
>     restore where the UFFD pages do not need to be transferred to the
>     destination host.
> 
> Signed-off-by: Adrian Reber <areber at redhat.com>
> ---
>  criu/uffd.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 240 insertions(+), 29 deletions(-)
> 
> diff --git a/criu/uffd.c b/criu/uffd.c
> index 2d82d46..1c3c3a6 100644
> --- a/criu/uffd.c
> +++ b/criu/uffd.c
> @@ -7,12 +7,16 @@
>  #include <fcntl.h>
>  #include <string.h>
>  #include <time.h>
> +#include <arpa/inet.h>
> +#include <netinet/ip.h>
>  #include <sys/stat.h>
>  #include <sys/mman.h>
>  #include <sys/syscall.h>
>  #include <sys/ioctl.h>
>  #include <sys/un.h>
>  #include <sys/socket.h>
> +#include <sys/socket.h>
> +#include <sys/wait.h>
>  
>  #include "asm/page.h"
>  #include "include/log.h"
> @@ -242,41 +246,83 @@ struct uffd_pages_struct {
>  	int flags;
>  };
>  
> +struct remote_uffd {
> +	struct uffdio_copy uffdio_copy;
> +	int remaining;
> +};
> +
> +static unsigned long vdso_pages;
>  static unsigned long total_pages;
>  static unsigned long uffd_copied_pages;
>  
> -static int uffd_copy_page(int uffd, __u64 address, void *dest)
> +static void page_copied_status()
>  {
> -	struct uffdio_copy uffdio_copy;
> +	/*
> +	 * In lazy_client mode the number of total pages is unknown.
> +	 * Still print out some status message how many pages have already
> +	 * been transferred.
> +	 */
> +	if (opts.lazy_client)
> +		pr_debug("With UFFD transferred pages: (%ld)\n", uffd_copied_pages);
> +	else
> +		pr_debug("With UFFD transferred pages: (%ld/%ld)\n", uffd_copied_pages,
> +			 total_pages);
> +}
> +
> +static int uffd_copy_page(int uffd, __u64 address, void *dest, int remaining)
> +{
> +	struct remote_uffd remote_uffd;
> +	struct stat statbuf;
>  	int rc;
>  
> -	rc = get_page(address, dest);
> -	if (rc <= 0)
> +	if (fstat(uffd, &statbuf)) {
> +		pr_perror("fstat of FD failed: ");
>  		return -1;
> +	}
>  
> -	uffdio_copy.dst = address;
> -	uffdio_copy.src = (unsigned long) dest;
> -	uffdio_copy.len = page_size();
> -	uffdio_copy.mode = 0;
> -	uffdio_copy.copy = 0;
> +	if (!opts.lazy_client) {
> +		rc = get_page(address, dest);
> +		if (rc <= 0)
> +			return -1;
> +	}
>  
> -	pr_debug("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
> -	rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
> +	remote_uffd.uffdio_copy.dst = address;
> +	remote_uffd.uffdio_copy.src = (unsigned long) dest;
> +	remote_uffd.uffdio_copy.len = page_size();
> +	remote_uffd.uffdio_copy.mode = 0;
> +	remote_uffd.uffdio_copy.copy = 0;
> +	remote_uffd.remaining = remaining;
> +
> +	pr_debug("uffdio_copy.dst 0x%llx\n", remote_uffd.uffdio_copy.dst);
> +	if (S_ISSOCK(statbuf.st_mode)) {
> +		pr_debug("FD is a socket\n");
> +		if (write(uffd, &remote_uffd, sizeof(remote_uffd)) != sizeof(remote_uffd)) {
> +			pr_perror("Can't write to lazy client");
> +			return -1;
> +		}
> +		if (write(uffd, dest, remote_uffd.uffdio_copy.len) != remote_uffd.uffdio_copy.len) {
> +			pr_perror("Can't write to lazy client");
> +			return -1;
> +		}
> +		return sizeof(remote_uffd.uffdio_copy);
> +	} else {
> +		rc = ioctl(uffd, UFFDIO_COPY, &remote_uffd.uffdio_copy);
> +	}
>  	pr_debug("ioctl UFFDIO_COPY rc 0x%x\n", rc);
> -	pr_debug("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
> +	pr_debug("uffdio_copy.copy 0x%llx\n", remote_uffd.uffdio_copy.copy);
>  	if (rc) {
> -		/* real retval in ufdio_copy.copy */
> -		if (uffdio_copy.copy != -EEXIST) {
> -			pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
> +		/* real retval in remote_uffd.ufdio_copy.copy */
> +		if (remote_uffd.uffdio_copy.copy != -EEXIST) {
> +			pr_err("UFFDIO_COPY error %Ld\n", remote_uffd.uffdio_copy.copy);
>  			return -1;
>  		}
> -	} else if (uffdio_copy.copy != page_size()) {
> -		pr_err("UFFDIO_COPY unexpected size %Ld\n", uffdio_copy.copy);
> +	} else if (remote_uffd.uffdio_copy.copy != page_size()) {
> +		pr_err("UFFDIO_COPY unexpected size %Ld\n", remote_uffd.uffdio_copy.copy);
>  		return -1;
>  	}
>  
>  
> -	return uffdio_copy.copy;
> +	return remote_uffd.uffdio_copy.copy;
>  
>  }
>  
> @@ -338,8 +384,10 @@ static int collect_uffd_pages(struct page_read *pr, struct list_head *uffd_list)
>  		if (!uffd_pages)
>  			return -1;
>  		uffd_pages->addr = base;
> -		if (uffd_vdso)
> +		if (uffd_vdso) {
>  			uffd_pages->flags |= UFFD_FLAG_VDSO;
> +			vdso_pages++;
> +		}
>  		list_add(&uffd_pages->list, uffd_list);
>  	}
>  
> @@ -357,13 +405,14 @@ static int handle_remaining_pages(int uffd, struct list_head *uffd_list, void *d
>  		if (uffd_pages->flags & UFFD_FLAG_SENT)
>  			continue;
>  
> -		rc = uffd_copy_page(uffd, uffd_pages->addr, dest);
> +		rc = uffd_copy_page(uffd, uffd_pages->addr, dest,
> +				    total_pages - ++uffd_copied_pages);
>  		if (rc < 0) {
>  			pr_err("Error during UFFD copy\n");
>  			return -1;
>  		}
>  
> -		uffd_copied_pages++;
> +		page_copied_status();
>  		uffd_pages->flags |= UFFD_FLAG_SENT;
>  	}
>  
> @@ -376,13 +425,15 @@ static int handle_regular_pages(int uffd, struct list_head *uffd_list, void *des
>  	int rc;
>  	struct uffd_pages_struct *uffd_pages;
>  
> -	rc = uffd_copy_page(uffd, address, dest);
> +	rc = uffd_copy_page(uffd, address, dest, 0);
>  	if (rc < 0) {
>  		pr_err("Error during UFFD copy\n");
>  		return -1;
>  	}
>  
>  	uffd_copied_pages++;
> +	page_copied_status();
> +
>  	/*
>  	 * Mark this page as having been already transferred, so
>  	 * that it has not to be copied again later.
> @@ -403,12 +454,13 @@ static int handle_vdso_pages(int uffd, struct list_head *uffd_list, void *dest)
>  	list_for_each_entry(uffd_pages, uffd_list, list) {
>  		if (!(uffd_pages->flags & UFFD_FLAG_VDSO))
>  			continue;
> -		rc = uffd_copy_page(uffd, uffd_pages->addr, dest);
> +		rc = uffd_copy_page(uffd, uffd_pages->addr, dest, --vdso_pages);
>  		if (rc < 0) {
>  			pr_err("Error during UFFD copy\n");
>  			return -1;
>  		}
>  		uffd_copied_pages++;
> +		page_copied_status();
>  		uffd_pages->flags |= UFFD_FLAG_SENT;
>  	}
>  	return 0;
> @@ -427,6 +479,9 @@ static int find_vmas(struct list_head *uffd_list)
>  	struct page_read pr;
>  	struct uffd_pages_struct *uffd_pages;
>  
> +	/* No need to scan for VMAs in lazy_client mode. */
> +	if (opts.lazy_client)
> +		return 0;
>  
>  	if (check_img_inventory() == -1)
>  		return -1;
> @@ -475,7 +530,7 @@ static int find_vmas(struct list_head *uffd_list)
>  				vmas.priv_size += PAGE_SIZE;
>  		}
>  
> -		pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end);
> +		pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end);
>  	}
>  
>  	ret = open_page_read(pid, &pr, PR_TASK);
> @@ -509,7 +564,63 @@ out:
>  	return ret;
>  }
>  
> -static int handle_requests(int fd)
> +static int receive_loop(int fd, int lazy_server, void *dest)
> +{
> +	struct remote_uffd remote_uffd;
> +	int read_counter;
> +	int rc;
> +	unsigned long ps;
> +
> +	ps = page_size();
> +
> +	do {
> +		/* Let's wait for the answer. Should be sizeof(remote_uffd) */
> +		if (read(lazy_server, &remote_uffd, sizeof(remote_uffd)) != sizeof(remote_uffd)) {
> +			pr_perror("Can't read from lazy server");
> +			return -1;
> +		}
> +		pr_debug("from lazy server: uffdio_copy.dst 0x%llx\n", remote_uffd.uffdio_copy.dst);
> +		/* And now the actual data, should be exactly a page */
> +		if (remote_uffd.uffdio_copy.len != ps) {
> +			pr_err("uffdio_copy.len should never be != page size (%ld)\n", ps);
> +			return -1;
> +		}
> +		read_counter = 0;
> +		while (read_counter < ps) {
> +			rc = read(lazy_server, dest + read_counter, ps - read_counter);
> +			if (rc == -1) {
> +				pr_perror("Can't read from lazy server");
> +				return -1;
> +			}
> +			read_counter += rc;
> +			pr_debug("read_counter %d\n", read_counter);
> +		}
> +		rc = uffd_copy_page(fd, remote_uffd.uffdio_copy.dst, dest, 0);
> +		if (rc < 0) {
> +			pr_err("Error during UFFD copy\n");
> +			return -1;
> +		}
> +		uffd_copied_pages++;
> +		page_copied_status();
> +		pr_debug("remote_uffd.remaining: %d\n", remote_uffd.remaining);
> +	} while (remote_uffd.remaining != 0);
> +
> +	return 0;
> +}
> +
> +static int handle_remote_requests(int fd, int lazy_server, struct uffd_msg *msg, void *dest)
> +{
> +	pr_debug("Sending userfaultfd msg to lazy server\n");
> +
> +	if (write(lazy_server, msg, sizeof(*msg)) != sizeof(*msg)) {
> +		pr_perror("Can't write to lazy server");
> +		return -1;
> +	}
> +
> +	return receive_loop(fd, lazy_server, dest);
> +}
> +
> +static int handle_requests(int fd, int lazy_server)
>  {
>  	fd_set set;
>  	int ret = -1;
> @@ -552,6 +663,15 @@ static int handle_requests(int fd)
>  		 */
>  		timeout.tv_sec = 5;
>  		timeout.tv_usec = 0;
> +		/*
> +		 * If lazy-pages is running in lazy_server mode the timeout
> +		 * needs to be longer than on the lazy_client side to make
> +		 * sure the lazy_server does not end the whole thing
> +		 * before the lazy_client is done/ready.
> +		 */
> +		if (lazy_server > 0)
> +			timeout.tv_sec++;
> +
>  		ret = select(fd + 1, &set, NULL, NULL, &timeout);
>  		pr_debug("select() rc: 0x%x\n", ret);
>  		if (ret == 0) {
> @@ -570,6 +690,16 @@ static int handle_requests(int fd)
>  			break;
>  		}
>  
> +
> +		if (lazy_server > 0) {
> +			ret = handle_remote_requests(fd, lazy_server, &msg, dest);
> +			if (ret >= 0)
> +				continue;
> +			ret = -1;
> +			goto out;
> +
> +		}
> +
>  		ret = 0;
>  		/* Align requested address to the next page boundary */
>  		address = msg.arg.pagefault.address & ~(ps - 1);
> @@ -620,14 +750,17 @@ static int handle_requests(int fd)
>  		}
>  	}
>  	pr_debug("Handle remaining pages\n");
> -	ret = handle_remaining_pages(fd, &uffd_list, dest);
> +	if (lazy_server > 0)
> +		ret = receive_loop(fd, lazy_server, dest);
> +	else
> +		ret = handle_remaining_pages(fd, &uffd_list, dest);
>  	if (ret < 0) {
>  		pr_err("Error during remaining page copy\n");
>  		ret = 1;
>  		goto out;
>  	}
>  
> -	pr_debug("With UFFD transferred pages: (%ld/%ld)\n", uffd_copied_pages, total_pages);
> +	page_copied_status();
>  	if ((uffd_copied_pages != total_pages) && (total_pages > 0)) {
>  		pr_warn("Only %ld of %ld pages transferred via UFFD\n", uffd_copied_pages,
>  			total_pages);
> @@ -643,13 +776,82 @@ out:
>  	return ret;
>  
>  }
> +
> +static int lazy_server_serve(int sk)
> +{
> +	int ret;
> +
> +	/* The "transfer protocol" is first the pid as int and then the rest */
> +	ret = recv(sk, &pid, sizeof(pid), 0);
> +	if (ret != sizeof(pid)) {
> +		pr_perror("PID recv error:");
> +		return -1;
> +	}
> +	pr_debug("received PID: %d\n", pid);
> +
> +	return handle_requests(sk, 0);
> +
> +}
> +
> +static int lazy_server()
> +{
> +	int sk = -1;
> +	int ask = -1;
> +	int ret = 0;
> +
> +	sk = setup_tcp_server("lazy");
> +	if (sk == -1)
> +		return -1;
> +
> +	ret = run_tcp_server(opts.daemon_mode, &ask, -1, sk);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (ask >= 0)
> +		ret = lazy_server_serve(ask);
> +
> +	if (opts.daemon_mode)
> +		exit(ret);
> +
> +	return ret;
> +}
> +
> +static int init_lazy_client()
> +{
> +	int lazy_server_sk;
> +
> +	lazy_server_sk = setup_tcp_client(opts.lazy_client);
> +	if (lazy_server_sk == -1)
> +		return -1;
> +
> +	/*
> +	 * CORK the socket at the very beginning. As per ANK
> +	 * the corked by default socket with sporadic NODELAY-s
> +	 * on urgent data is the smartest mode ever.
> +	 */
> +	tcp_cork(lazy_server_sk, true);
> +	return lazy_server_sk;
> +}
> +
>  int uffd_listen()
>  {
>  	int uffd;
>  	int uffd_flags;
> +	int lazy_server_sk = 0;
>  
>  	LIST_HEAD(uffd_list);
>  
> +	if (opts.lazy_server)
> +		return lazy_server();
> +
> +	if (opts.lazy_client)
> +		lazy_server_sk = init_lazy_client();
> +
> +	if (lazy_server_sk == -1) {
> +		pr_info("Connection to the lazy server failed. Exiting\n");
> +		return -1;
> +	}
> +
>  	if (!opts.addr) {
>  		pr_info("Please specify a file name for the unix domain socket\n");
>  		pr_info("used to communicate between the lazy-pages server\n");
> @@ -660,11 +862,20 @@ int uffd_listen()
>  
>  	pr_debug("Waiting for incoming connections on %s\n", opts.addr);
>  	if ((uffd = ud_open()) < 0)
> -		exit(0);
> +		return -1;
>  
>  	pr_debug("uffd is 0x%d\n", uffd);
>  	uffd_flags = fcntl(uffd, F_GETFD, NULL);
>  	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
>  
> -	return handle_requests(uffd);
> +	if (lazy_server_sk) {
> +		/* The "transfer protocol" is first the pid as int and then the rest */
> +		pr_debug("Sending PID %d\n", pid);
> +		if (send(lazy_server_sk, &pid, sizeof(pid), 0) < 0) {
> +			pr_perror("PID sending error:");
> +			return -1;
> +		}
> +	}
> +
> +	return handle_requests(uffd, lazy_server_sk);
>  }
> 



More information about the CRIU mailing list