[CRIU] [PATCH v5 3/5] Try to include userfaultfd with criu (part 1)

Adrian Reber adrian at lisas.de
Wed Mar 9 23:39:52 PST 2016


From: Adrian Reber <areber at redhat.com>

This is a first attempt at integrating userfaultfd into criu. Right now
it still requires a "normal" checkpoint. Once the application has been
checkpointed, it can be restored with the help of userfaultfd.

All restored pages with MAP_ANONYMOUS and MAP_PRIVATE set are marked as
being handled by userfaultfd.

As soon as the process is restored, it blocks on its first memory access
and waits for the pages to be transferred via userfaultfd.

To serve the required pages, a new criu command has been added. For a
userfaultfd-supported restore, the first step is to start the
'lazy-pages' server:

  criu lazy-pages -v4 -D /tmp/3/ --address /tmp/userfault.socket

This is part 1 of the userfaultfd integration which provides the
'lazy-pages' server implementation.

v2:
    * provide option '--lazy-pages' to enable uffd style restore
    * use send_fd()/recv_fd() provided by criu (instead of own
      implementation)
    * do not install the uffd as service_fd
    * use named constants for MAP_ANONYMOUS
    * do not restore memory pages and then later mark them as uffd
      handled
    * remove the function find_pages(), which searched pages-<id>.img;
      criu functions are now used to find the necessary pages;
      the pages-<id>.img file is opened anew for each page lookup
    * only check the UFFDIO_API once
    * trying to protect uffd code by CONFIG_UFFD;
      use make UFFD=1 to compile criu with this patch

v3:
   * renamed the server mode from 'uffd' -> 'lazy-pages'
   * switched client and server roles transferring the UFFD FD
     * the criu part running in lazy-pages server mode is now
       waiting for connections
     * the criu restore process connects to the lazy-pages server
       to pass the UFFD FD
   * before anything else is copied via UFFD, the VDSO pages are copied,
     because copying unused VDSO pages fails once the process is running;
     this was necessary to be able to copy all pages.
   * if no UFFD messages arrive for 5 seconds, the lazy-pages
     server switches to copy mode and copies all remaining pages that
     have not been requested yet into the restored process
   * check the UFFDIO_API at the correct place
   * close the UFFD FD in the restorer so that no open UFFD FD is left
     in the restored process

v4:
    * removed unnecessary madvise() calls; they seemed necessary when
      first running tests with uffd, but they actually are not
    * auto-detect if build-system provides linux/userfaultfd.h
      header
    * simplify unix domain socket setup and communication.
    * use --address to specify the location of the used
      unix domain socket

v5:
    * split the userfaultfd patch in multiple smaller patches
    * introduced vma_can_be_lazy() function to check if a page
      can be handled by uffd
    * moved uffd related code from cr-restore.c to uffd.c
    * handle failure to register a memory page of the restored process
      with userfaultfd

Signed-off-by: Adrian Reber <areber at redhat.com>
---
 criu/Makefile.config      |   6 +-
 criu/Makefile.crtools     |   4 +
 criu/crtools.c            |   6 +
 criu/include/crtools.h    |   5 +
 criu/include/uffd.h       |  16 ++
 criu/uffd.c               | 437 ++++++++++++++++++++++++++++++++++++++++++++++
 scripts/feature-tests.mak |  15 ++
 7 files changed, 488 insertions(+), 1 deletion(-)
 create mode 100644 criu/include/uffd.h
 create mode 100644 criu/uffd.c

diff --git a/criu/Makefile.config b/criu/Makefile.config
index aaaca1f..c3841b9 100644
--- a/criu/Makefile.config
+++ b/criu/Makefile.config
@@ -14,8 +14,12 @@ ifeq ($(call pkg-config-check,libselinux),y)
         DEFINES	+= -DCONFIG_HAS_SELINUX
 endif
 
+ifeq ($(call try-cc,$(FEATURE_TEST_UFFD)),y)
+	export UFFD := 1
+endif
+
 FEATURES_LIST	:= TCP_REPAIR PRLIMIT STRLCPY STRLCAT PTRACE_PEEKSIGINFO \
-	SETPROCTITLE_INIT MEMFD
+	SETPROCTITLE_INIT MEMFD UFFD
 
 # $1 - config name
 define gen-feature-test
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
index 4448047..f1573d1 100644
--- a/criu/Makefile.crtools
+++ b/criu/Makefile.crtools
@@ -81,6 +81,10 @@ obj-y			+= pie-util-vdso.o
 obj-y			+= vdso.o
 endif
 
+ifeq ($(UFFD),1)
+obj-y	+= uffd.o
+endif
+
 PROTOBUF_GEN := $(SRC_DIR)/scripts/protobuf-gen.sh
 
 protobuf-desc.c: protobuf-desc-gen.h
diff --git a/criu/crtools.c b/criu/crtools.c
index 800fa6d..763b5fd 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -728,6 +728,9 @@ int main(int argc, char *argv[], char *envp[])
 		return -1;
 	}
 
+	if (!strcmp(argv[optind], "lazy-pages"))
+		return uffd_listen() != 0;
+
 	if (!strcmp(argv[optind], "check"))
 		return cr_check() != 0;
 
@@ -768,6 +771,9 @@ usage:
 "  criu page-server\n"
 "  criu service [<options>]\n"
 "  criu dedup\n"
+#ifdef CONFIG_HAS_UFFD
+"  criu lazy-pages -D DIR [<options>]\n"
+#endif
 "\n"
 "Commands:\n"
 "  dump           checkpoint a process/tree identified by pid\n"
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index 82bb398..49ba0cc 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -25,6 +25,11 @@ extern int convert_to_elf(char *elf_path, int fd_core);
 extern int cr_check(void);
 extern int cr_exec(int pid, char **opts);
 extern int cr_dedup(void);
+#ifdef CONFIG_HAS_UFFD
+extern int uffd_listen(void);
+#else
+static inline int uffd_listen() { return 0; };
+#endif /* CONFIG_HAS_UFFD */
 extern int prepare_task_entries(void);
 
 extern int check_add_feature(char *arg);
diff --git a/criu/include/uffd.h b/criu/include/uffd.h
new file mode 100644
index 0000000..d5a043b
--- /dev/null
+++ b/criu/include/uffd.h
@@ -0,0 +1,16 @@
+#ifndef __CR_UFFD_H_
+#define __CR_UFFD_H_
+
+#include "config.h"
+
+#ifdef CONFIG_HAS_UFFD
+
+#include <syscall.h>
+#include <linux/userfaultfd.h>
+
+#ifndef __NR_userfaultfd
+#error "missing __NR_userfaultfd definition"
+#endif
+#endif /* CONFIG_HAS_UFFD */
+
+#endif /* __CR_UFFD_H_ */
diff --git a/criu/uffd.c b/criu/uffd.c
new file mode 100644
index 0000000..fd2d30e
--- /dev/null
+++ b/criu/uffd.c
@@ -0,0 +1,437 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <errno.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/un.h>
+#include <sys/socket.h>
+
+#include "asm/page.h"
+#include "include/log.h"
+#include "include/criu-plugin.h"
+#include "include/page-read.h"
+#include "include/files-reg.h"
+#include "include/mem.h"
+#include "include/uffd.h"
+#include "include/util-pie.h"
+#include "include/pstree.h"
+#include "include/crtools.h"
+#include "include/cr_options.h"
+#include "xmalloc.h"
+
+#undef  LOG_PREFIX
+#define LOG_PREFIX "lazy-pages: "
+
+static int server_listen(struct sockaddr_un *saddr)
+{
+	int fd;
+	int len;
+
+	if (strlen(opts.addr) >= sizeof(saddr->sun_path)) {
+		return -1;
+	}
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+		return -1;
+
+	unlink(opts.addr);
+
+	memset(saddr, 0, sizeof(struct sockaddr_un));
+	saddr->sun_family = AF_UNIX;
+	strcpy(saddr->sun_path, opts.addr);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(opts.addr);
+
+	if (bind(fd, (struct sockaddr *) saddr, len) < 0) {
+		goto out;
+	}
+
+	if (listen(fd, 10) < 0) {
+		goto out;
+	}
+
+	return fd;
+
+out:
+	close(fd);
+	return -1;
+}
+
+static int ud_open()
+{
+	int client;
+	int listen;
+	int newfd;
+	int ret = -1;
+	struct sockaddr_un saddr;
+	socklen_t len;
+
+	if ((listen = server_listen(&saddr)) < 0) {
+		pr_perror("server_listen error");
+		return -1;
+	}
+
+	/* accept new client request */
+	len = sizeof(struct sockaddr_un);
+	if ((client = accept(listen, &saddr, &len)) < 0) {
+		pr_perror("server_accept error: %d", client);
+		close(listen);
+		return -1;
+	}
+
+	pr_debug("client fd %d\n", client);
+	newfd = recv_fd(client);
+	if (newfd < 0) {
+		pr_perror("recv_fd error:");
+		goto out;
+	}
+	pr_debug("newfd %d\n", newfd);
+	close(client);
+
+	return newfd;
+out:
+	close(listen);
+	close(client);
+	return ret;
+}
+
+static void get_page(unsigned long addr, void *dest, struct page_read *pr)
+{
+	struct iovec iov;
+	int ret;
+	unsigned char buf[PAGE_SIZE];
+
+	ret = open_page_read(root_item->pid.virt, pr, PR_TASK | PR_MOD);
+	pr_debug("ret %d\n", ret);
+
+	/* TODO: return code checking */
+	ret = pr->get_pagemap(pr, &iov);
+	pr_debug("get_pagemap  ret %d\n", ret);
+	ret = seek_pagemap_page(pr, addr, true);
+	pr_debug("seek_pagemap_page %x\n", ret);
+	ret = pr->read_pages(pr, addr, 1, buf);
+	pr_debug("read_pages ret %d\n", ret);
+	memcpy(dest, buf, PAGE_SIZE);
+	pr->close(pr);
+}
+
+#define UFFD_FLAG_SENT	0x1
+#define UFFD_FLAG_VDSO	0x2
+
+struct uffd_pages_struct {
+	struct list_head list;
+	unsigned long addr;
+	int flags;
+};
+
+static int uffd_copy_page(int uffd, struct page_read *pr, __u64 address, void *dest)
+{
+	struct uffdio_copy uffdio_copy;
+	int rc;
+
+	get_page(address, dest, pr);
+
+	uffdio_copy.dst = address;
+	uffdio_copy.src = (unsigned long) dest;
+	uffdio_copy.len = page_size();
+	uffdio_copy.mode = 0;
+	uffdio_copy.copy = 0;
+
+	pr_debug("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
+	rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
+	pr_debug("ioctl UFFDIO_COPY rc 0x%x\n", rc);
+	pr_debug("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
+	if (rc) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy.copy != -EEXIST) {
+			pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
+			return -1;
+		}
+	} else if (uffdio_copy.copy != page_size()) {
+		pr_err("UFFDIO_COPY unexpected size %Ld\n", uffdio_copy.copy);
+		return -1;
+	}
+
+
+	return uffdio_copy.copy;
+
+}
+
+static int collect_uffd_pages(struct page_read *pr, struct list_head *uffd_list,
+			      unsigned long *vma_size)
+{
+	unsigned long base;
+	int i;
+	struct iovec iov;
+	unsigned long nr_pages;
+	unsigned long ps;
+	int rc;
+	struct uffd_pages_struct *uffd_pages;
+	struct vma_area *vma;
+	struct vm_area_list *vmas = &rsti(root_item)->vmas;
+
+	rc = pr->get_pagemap(pr, &iov);
+	if (rc <= 0)
+		return 0;
+
+	ps = page_size();
+	nr_pages = iov.iov_len / ps;
+	base = (unsigned long) iov.iov_base;
+	pr_debug("iov.iov_base 0x%lx (%ld pages)\n", base, nr_pages);
+
+	if (pr->put_pagemap)
+		pr->put_pagemap(pr);
+
+	for (i = 0; i < nr_pages; i++) {
+		bool uffd_page = false;
+		bool uffd_vdso = false;
+		base = (unsigned long) iov.iov_base + (i * ps);
+		/*
+		 * Only pages which are MAP_ANONYMOUS and MAP_PRIVATE
+		 * are relevant for userfaultfd handling.
+		 * Loop over all VMAs to see if the flags matching.
+		 */
+		list_for_each_entry(vma, &vmas->h, list) {
+			/*
+			 * This loop assumes that base can actually be found
+			 * in the VMA list.
+			 */
+			if (base >= vma->e->start && base < vma->e->end) {
+				if ((vma->e->flags & MAP_ANONYMOUS) &&
+				    (vma->e->flags & MAP_PRIVATE) &&
+				    !(vma_area_is(vma, VMA_AREA_VSYSCALL))) {
+					uffd_page = true;
+					if (vma_area_is(vma, VMA_AREA_VDSO))
+						uffd_vdso = true;
+					break;
+				}
+			}
+		}
+
+		/* This is not a page we are looking for. Move along */
+		if (!uffd_page)
+			continue;
+
+		pr_debug("Adding 0x%lx to our list\n", base);
+
+		*vma_size += ps;
+		uffd_pages = xzalloc(sizeof(struct uffd_pages_struct));
+		if (!uffd_pages)
+			return -1;
+		uffd_pages->addr = base;
+		if (uffd_vdso)
+			uffd_pages->flags |= UFFD_FLAG_VDSO;
+		list_add(&uffd_pages->list, uffd_list);
+	}
+
+	return 1;
+}
+
+/*
+ *  Setting up criu infrastructure to easily
+ *  access the dump results.
+ */
+static void criu_init()
+{
+	/* TODO: return code checking */
+	check_img_inventory();
+	prepare_task_entries();
+	prepare_pstree();
+	collect_remaps_and_regfiles();
+	prepare_shared_reg_files();
+	prepare_remaps();
+	prepare_mm_pid(root_item);
+
+	/* We found a PID */
+	pr_debug("root_item->pid.virt %d\n", root_item->pid.virt);
+	pr_debug("root_item->pid.real %d\n", root_item->pid.real);
+}
+
+int uffd_listen()
+{
+	__u64 address;
+	void *dest;
+	__u64 flags;
+	struct uffd_msg msg;
+	struct page_read pr;
+	unsigned long ps;
+	int rc;
+	fd_set set;
+	struct timeval timeout;
+	int uffd;
+	unsigned long uffd_copied_pages = 0;
+	int uffd_flags;
+	struct uffd_pages_struct *uffd_pages;
+	bool vdso_sent = false;
+	unsigned long vma_size = 0;
+
+	LIST_HEAD(uffd_list);
+
+	if (!opts.addr) {
+		pr_info("Please specify a file name for the unix domain socket\n");
+		pr_info("used to communicate between the lazy-pages server\n");
+		pr_info("and the restore process. Use the --address option like\n");
+		pr_info("criu --lazy-pages --address /tmp/userfault.socket\n");
+		return -1;
+	}
+
+	pr_debug("Waiting for incoming connections on %s\n", opts.addr);
+	if ((uffd = ud_open()) < 0)
+		exit(0);
+
+	pr_debug("uffd is 0x%d\n", uffd);
+	uffd_flags = fcntl(uffd, F_GETFD, NULL);
+	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
+
+	/* Setting up criu infrastructure to easily access the dump results */
+	criu_init();
+
+	/* Initialize FD sets for read() with timeouts (using select()) */
+	FD_ZERO(&set);
+	FD_SET(uffd, &set);
+
+	/* All operations will be done on page size */
+	ps = page_size();
+	dest = malloc(ps);
+
+	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK);
+	if (rc <= 0)
+		return 1;
+	/*
+	 * This puts all pages which should be handled by userfaultfd
+	 * in the list uffd_list. This list is later used to detect if
+	 * a page has already been transferred or if it needs to be
+	 * pushed into the process using userfaultfd.
+	 */
+	do {
+		rc = collect_uffd_pages(&pr, &uffd_list, &vma_size);
+		if (rc == -1)
+			return 1;
+	} while (rc);
+
+	if (pr.close)
+		pr.close(&pr);
+
+	while (1) {
+		bool page_sent = false;
+		/*
+		 * Setting the timeout to 5 seconds. If after this time
+		 * no uffd pages are requested the code switches to
+		 * copying the remaining pages.
+		 *
+		 * Timeout is re-defined every time select() is run as
+		 * select(2) says:
+		 *  Consider timeout to be undefined after select() returns.
+		 */
+		timeout.tv_sec = 5;
+		timeout.tv_usec = 0;
+		rc = select(uffd + 1, &set, NULL, NULL, &timeout);
+		pr_debug("select() rc: 0x%x\n", rc);
+		if (rc == 0) {
+			pr_debug("read timeout\n");
+			pr_debug("switching from request to copy mode\n");
+			break;
+		}
+		rc = read(uffd, &msg, sizeof(msg));
+		pr_debug("read() rc: 0x%x\n", rc);
+
+		if (rc != sizeof(msg)) {
+			if (rc < 0)
+				pr_perror("read error");
+			else
+				pr_debug("short read\n");
+			continue;
+		}
+
+		/* Round the faulting address down to the start of its page */
+		address = msg.arg.pagefault.address & ~(ps - 1);
+		pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
+
+		/*
+		 * At this point the process on the other side waits for the first page.
+		 * In the first step we will force the vdso pages into the new process.
+		 */
+		if (!vdso_sent) {
+			pr_debug("Pushing VDSO pages once\n");
+			list_for_each_entry(uffd_pages, &uffd_list, list) {
+				if (!(uffd_pages->flags & UFFD_FLAG_VDSO))
+					continue;
+				rc = uffd_copy_page(uffd, &pr, uffd_pages->addr, dest);
+				if (rc < 0) {
+					pr_err("Error during UFFD copy\n");
+					return 1;
+				}
+				vma_size -= rc;
+				uffd_copied_pages++;
+				uffd_pages->flags |= UFFD_FLAG_SENT;
+			}
+			vdso_sent = true;
+		}
+
+		/* Make sure to not transfer a page twice */
+		list_for_each_entry(uffd_pages, &uffd_list, list) {
+			if ((uffd_pages->addr == address) && (uffd_pages->flags & UFFD_FLAG_SENT)) {
+				page_sent = true;
+				break;
+			}
+		}
+
+		if (page_sent)
+			continue;
+
+		/* Now handle the pages actually requested. */
+
+		flags = msg.arg.pagefault.flags;
+		pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
+
+		if (msg.event != UFFD_EVENT_PAGEFAULT) {
+			pr_err("unexpected msg event %u\n", msg.event);
+			return 1;
+		}
+
+		rc = uffd_copy_page(uffd, &pr, address, dest);
+		if (rc < 0) {
+			pr_err("Error during UFFD copy\n");
+			return 1;
+		}
+		vma_size -= rc;
+		uffd_copied_pages++;
+
+		/*
+		 * Mark this page as having been already transferred, so
+		 * that it has not to be copied again later.
+		 */
+		list_for_each_entry(uffd_pages, &uffd_list, list) {
+			if (uffd_pages->addr == address)
+				uffd_pages->flags |= UFFD_FLAG_SENT;
+		}
+	}
+	pr_debug("remaining vma_size: 0x%lx\n", vma_size);
+	pr_debug("uffd_copied_pages:    %ld\n", uffd_copied_pages);
+	list_for_each_entry(uffd_pages, &uffd_list, list) {
+		pr_debug("Checking remaining pages 0x%lx (flags 0x%x)\n",
+			 uffd_pages->addr, uffd_pages->flags);
+		if (uffd_pages->flags & UFFD_FLAG_SENT)
+			continue;
+
+		rc = uffd_copy_page(uffd, &pr, uffd_pages->addr, dest);
+		if (rc < 0) {
+			pr_err("Error during UFFD copy\n");
+			return 1;
+		}
+		vma_size -= rc;
+
+		pr_debug("remaining vma_size: 0x%lx\n", vma_size);
+		pr_debug("uffd_copied_pages:    %ld\n", ++uffd_copied_pages);
+		uffd_pages->flags |= UFFD_FLAG_SENT;
+	}
+	close(uffd);
+	return 0;
+}
diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak
index 525c73e..51a54de 100644
--- a/scripts/feature-tests.mak
+++ b/scripts/feature-tests.mak
@@ -108,3 +108,18 @@ int main(void)
 }
 
 endef
+
+define FEATURE_TEST_UFFD
+
+#include <syscall.h>
+#include <linux/userfaultfd.h>
+
+int main(void)
+{
+#ifndef __NR_userfaultfd
+#error "missing __NR_userfaultfd definition"
+#endif
+	return 0;
+}
+
+endef
-- 
1.8.3.1



More information about the CRIU mailing list