[CRIU] [PATCH v3] Try to include userfaultfd with criu

Adrian Reber adrian at lisas.de
Wed Dec 9 12:24:47 PST 2015


From: Adrian Reber <areber at redhat.com>

This is a first try to include userfaultfd with criu. Right now it
still requires a "normal" checkpoint. After checkpointing the application
it can be restored with the help of userfaultfd.

All restored pages with MAP_ANONYMOUS and MAP_PRIVATE set are marked as
being handled by userfaultfd and also madvise()'d as MADV_DONTNEED.
MADV_DONTNEED is needed to make sure the pages are not mapped in and are
actually triggering a userfault.

As soon as the process is restored it blocks on the first memory access
and waits for pages being transferred by userfaultfd.

To handle the required pages a new criu command has been added. For a
userfaultfd supported restore the first step is to start the
'lazy-pages' server:

  criu lazy-pages -v4 -D /tmp/3/

This waits on a unix domain socket (hard-coded to /tmp/userfault.socket
for now) to receive a userfaultfd file descriptor from a '--lazy-pages'
enabled 'criu restore':

  criu restore -D /tmp/3 -j -v4 --lazy-pages

In the first step the VDSO pages are pushed from the lazy-pages server
into the restored process. After that the lazy-pages server waits on the
UFFD FD for a UFFD requested page. If there are no requests received
during a period of 5 seconds the lazy-pages server switches into a mode
where the remaining, non-transferred pages are copied into the
destination process. After all remaining pages have been copied the
lazy-pages server exits.

The use case of using userfaultfd with a checkpointed process on a remote
machine will probably benefit from the current work related to
image-cache and image-proxy.

For the final implementation it would be nice to have a restore running
in uffd mode on one system which requests the memory pages over the
network from another system which is running 'criu checkpoint' also in
uffd mode. This way the pages need to be copied only 'once' from the
checkpoint process to the uffd restore process.

TODO:
    * Still contains a lot of debug output which needs to be cleaned up.
    * Maybe transfer the dump directory FD also via unix domain sockets
      so that the 'uffd'/'lazy-pages' server can keep running without
      the need to specify the dump directory with '-D'
    * Keep the lazy-pages server running after all pages have been
      transferred and start waiting for new connections to serve.

v2:
    * provide option '--lazy-pages' to enable uffd style restore
    * use send_fd()/recv_fd() provided by criu (instead of own
      implementation)
    * do not install the uffd as service_fd
    * use named constants for MAP_ANONYMOUS
    * do not restore memory pages and then later mark them as uffd
      handled
    * remove function find_pages() to search in pages-<id>.img;
      now using criu functions to find the necessary pages;
      for each new page search the pages-<id>.img file is opened
    * only check the UFFDIO_API once
    * trying to protect uffd code by CONFIG_UFFD;
      use make UFFD=1 to compile criu with this patch

v3:
   * renamed the server mode from 'uffd' -> 'lazy-pages'
   * switched client and server roles transferring the UFFD FD
     * the criu part running in lazy-pages server mode is now
       waiting for connections
     * the criu restore process connects to the lazy-pages server
       to pass the UFFD FD
   * before copying anything else via UFFD, the VDSO pages are copied,
     as copying unused VDSO pages fails once the process is running.
     This was necessary to be able to copy all pages.
   * if there are no more UFFD messages for 5 seconds the lazy-pages
     server switches into copy mode to copy all remaining pages, which
     have not been requested yet, into the restored process
   * check the UFFDIO_API at the correct place
   * close UFFD FD in the restorer to remove open UFFD FD in the
     restored process

Signed-off-by: Adrian Reber <areber at redhat.com>
---
 Makefile             |   4 +
 Makefile.config      |   3 +
 Makefile.crtools     |   3 +
 cr-restore.c         | 103 ++++++++++-
 crtools.c            |  20 +++
 include/cr_options.h |   1 +
 include/crtools.h    |   2 +
 include/page-read.h  |   2 +
 include/restorer.h   |   2 +
 include/uffd.h       |  24 +++
 page-read.c          |  10 ++
 pie/restorer.c       |  76 +++++++-
 uffd.c               | 496 +++++++++++++++++++++++++++++++++++++++++++++++++++
 13 files changed, 742 insertions(+), 4 deletions(-)
 create mode 100644 include/uffd.h
 create mode 100644 uffd.c

diff --git a/Makefile b/Makefile
index 1793091..fe62488 100644
--- a/Makefile
+++ b/Makefile
@@ -163,6 +163,10 @@ ifeq ($(GMON),1)
 	GMONLDOPT = -pg
 endif
 
+ifeq ($(UFFD),1)
+	DEFINES += -DUFFD
+endif
+
 CFLAGS		+= $(WARNINGS) $(DEFINES)
 SYSCALL-LIB	:= $(ARCH_DIR)/syscalls.built-in.o
 ARCH-LIB	:= $(ARCH_DIR)/crtools.built-in.o
diff --git a/Makefile.config b/Makefile.config
index ce4b8d8..3df8f53 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -44,6 +44,9 @@ endif
 ifeq ($(piegen-y),y)
 	$(Q) @echo '#define CONFIG_PIEGEN' >> $@
 endif
+ifeq ($(UFFD),1)
+	$(Q) @echo '#define CONFIG_UFFD' >> $@
+endif
 	$(Q) @echo '#endif /* __CR_CONFIG_H__ */' >> $@
 
 config: $(CONFIG)
diff --git a/Makefile.crtools b/Makefile.crtools
index 77a7421..32c540f 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -79,6 +79,9 @@ obj-y	+= fault-injection.o
 obj-y	+= pie/util-fd.o
 obj-y	+= pie/util.o
 obj-y	+= seccomp.o
+ifeq ($(UFFD),1)
+obj-y	+= uffd.o
+endif
 
 ifneq ($(MAKECMDGOALS),clean)
 incdeps := y
diff --git a/cr-restore.c b/cr-restore.c
index 3c636b9..abf762b 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -19,6 +19,7 @@
 #include <sys/shm.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
+#include <sys/syscall.h>
 
 #include <sched.h>
 
@@ -78,6 +79,8 @@
 #include "seccomp.h"
 #include "bitmap.h"
 #include "fault-injection.h"
+#include "uffd.h"
+
 #include "parasite-syscall.h"
 
 #include "protobuf.h"
@@ -471,6 +474,17 @@ static int restore_priv_vma_content(void)
 			p = decode_pointer((off) * PAGE_SIZE +
 					vma->premmaped_addr);
 
+			/*
+			 * This means that userfaultfd is used to load the pages
+			 * on demand.
+			 */
+			if (opts.lazy_pages && (vma->e->flags & MAP_ANONYMOUS) &&
+					(vma->e->flags & MAP_PRIVATE)) {
+				pr_debug("Lazy restore skips %lx\n", vma->e->start);
+				pr.skip_pages(&pr, PAGE_SIZE);
+				continue;
+			}
+
 			set_bit(off, vma->page_bitmap);
 			if (vma->ppage_bitmap) { /* inherited vma */
 				clear_bit(off, vma->ppage_bitmap);
@@ -1996,7 +2010,7 @@ out:
 	return -1;
 }
 
-static int prepare_task_entries(void)
+int prepare_task_entries(void)
 {
 	task_entries_pos = rst_mem_cpos(RM_SHREMAP);
 	task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
@@ -2712,6 +2726,62 @@ out:
 extern void __gcov_flush(void) __attribute__((weak));
 void __gcov_flush(void) {}
 
+#define CLI_PATH "/var/tmp/"
+
+/*
+ * send_uffd - hand the userfaultfd descriptor over to the lazy-pages server.
+ *
+ * Binds a client socket to CLI_PATH<pid>, restricts the socket file to
+ * its owner, connects to the server socket UFFD_SK and passes @sendfd
+ * with send_fd().
+ *
+ * @sendfd: the userfaultfd file descriptor to transfer
+ *
+ * Returns 0 on success and -1 on error. On error the client socket
+ * file created by bind() is removed again.
+ */
+static int send_uffd(int sendfd)
+{
+	int fd;
+	int len;
+	int ret = -1;
+	bool bound = false;
+	struct sockaddr_un un;
+	struct sockaddr_un sun;
+
+	if (sendfd < 0)
+		return -1;
+
+	if (strlen(UFFD_SK) >= sizeof(un.sun_path))
+		return -1;
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+		pr_perror("socket failed");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	/* Bounded formatting, so sun_path can never be overrun */
+	len = snprintf(un.sun_path, sizeof(un.sun_path), "%s%05ld", CLI_PATH, (long) getpid());
+	if (len < 0 || (size_t) len >= sizeof(un.sun_path)) {
+		pr_err("client socket path too long\n");
+		goto out;
+	}
+	pr_debug("file is %s\n", un.sun_path);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(un.sun_path);
+
+	/* remove it, if it exists */
+	unlink(un.sun_path);
+	if (bind(fd, (struct sockaddr *) &un, len) < 0) {
+		pr_perror("bind failed");
+		goto out;
+	}
+	bound = true;
+
+	if (chmod(un.sun_path, S_IRWXU) < 0) {
+		pr_perror("chmod failed");
+		goto out;
+	}
+
+	memset(&sun, 0, sizeof(sun));
+	sun.sun_family = AF_UNIX;
+	strcpy(sun.sun_path, UFFD_SK);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
+	if (connect(fd, (struct sockaddr *) &sun, len) < 0) {
+		pr_perror("connect failed");
+		goto out;
+	}
+
+	if (send_fd(fd, NULL, 0, sendfd) < 0) {
+		pr_perror("send_fd error:");
+		goto out;
+	}
+	ret = 0;
+out:
+	/*
+	 * Do not leave a stale socket file behind when the transfer
+	 * failed after bind() created it.
+	 */
+	if (ret && bound)
+		unlink(un.sun_path);
+	close(fd);
+	return ret;
+}
+
 static int sigreturn_restore(pid_t pid, CoreEntry *core)
 {
 	void *mem = MAP_FAILED;
@@ -3008,6 +3078,37 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 
 	strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
 
+	if (!opts.lazy_pages)
+		task_args->uffd = -1;
+#ifdef CONFIG_UFFD
+	else {
+		struct uffdio_api uffdio_api;
+		/*
+		 * Open userfaulfd FD which is passed to the restorer blob and
+		 * to a second process handling the userfaultfd page faults.
+		 */
+		task_args->uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+
+		/*
+		 * Check if the UFFD_API is the one which is expected
+		 */
+		uffdio_api.api = UFFD_API;
+		uffdio_api.features = 0;
+		if (ioctl(task_args->uffd, UFFDIO_API, &uffdio_api)) {
+			pr_err("Checking for UFFDIO_API failed.\n");
+			goto err;
+		}
+		if (uffdio_api.api != UFFD_API) {
+			pr_err("Result of looking up UFFDIO_API does not match: %Lu\n", uffdio_api.api);
+			goto err;
+		}
+
+		if (send_uffd(task_args->uffd) < 0) {
+			close(task_args->uffd);
+			goto err;
+		}
+	}
+#endif
 
 	/*
 	 * Fill up per-thread data.
diff --git a/crtools.c b/crtools.c
index 68756a0..c123671 100644
--- a/crtools.c
+++ b/crtools.c
@@ -255,6 +255,9 @@ int main(int argc, char *argv[], char *envp[])
 		{ "ghost-limit",		required_argument,	0, 1069 },
 		{ "irmap-scan-path",		required_argument,	0, 1070 },
 		{ "lsm-profile",		required_argument,	0, 1071 },
+#ifdef CONFIG_UFFD
+		{ "lazy-pages",			no_argument,		0, 1072 },
+#endif
 		{ },
 	};
 
@@ -504,6 +507,11 @@ int main(int argc, char *argv[], char *envp[])
 			if (parse_lsm_arg(optarg) < 0)
 				return -1;
 			break;
+#ifdef CONFIG_UFFD
+		case 1072:
+			opts.lazy_pages = true;
+			break;
+#endif
 		case 'M':
 			{
 				char *aux;
@@ -635,6 +643,9 @@ int main(int argc, char *argv[], char *envp[])
 		return ret != 0;
 	}
 
+	if (!strcmp(argv[optind], "lazy-pages"))
+		return uffd_listen() != 0;
+
 	if (!strcmp(argv[optind], "show"))
 		return cr_show(pid) != 0;
 
@@ -678,6 +689,9 @@ usage:
 "  criu page-server\n"
 "  criu service [<options>]\n"
 "  criu dedup\n"
+#ifdef CONFIG_UFFD
+"  criu lazy-pages -D DIR [<options>]\n"
+#endif
 "\n"
 "Commands:\n"
 "  dump           checkpoint a process/tree identified by pid\n"
@@ -716,6 +730,12 @@ usage:
 "                        restore making it the parent of the restored process\n"
 "  --freeze-cgroup\n"
 "                        use cgroup freezer to collect processes\n"
+#ifdef CONFIG_UFFD
+"  --lazy-pages          restore pages on demand\n"
+"                        this requires running a second instance of criu\n"
+"                        in lazy-pages mode: 'criu lazy-pages -D DIR'\n"
+"                        --lazy-pages and lazy-pages mode require userfaultfd\n"
+#endif
 "\n"
 "* Special resources support:\n"
 "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
diff --git a/include/cr_options.h b/include/cr_options.h
index d0c74fe..682ab7a 100644
--- a/include/cr_options.h
+++ b/include/cr_options.h
@@ -97,6 +97,7 @@ struct cr_options {
 	struct list_head	irmap_scan_paths;
 	bool			lsm_supplied;
 	char			*lsm_profile;
+	bool			lazy_pages;
 };
 
 extern struct cr_options opts;
diff --git a/include/crtools.h b/include/crtools.h
index bbed0ef..9e98e92 100644
--- a/include/crtools.h
+++ b/include/crtools.h
@@ -23,6 +23,8 @@ extern int convert_to_elf(char *elf_path, int fd_core);
 extern int cr_check(void);
 extern int cr_exec(int pid, char **opts);
 extern int cr_dedup(void);
+extern int uffd_listen(void);
+extern int prepare_task_entries(void);
 
 extern int check_add_feature(char *arg);
 
diff --git a/include/page-read.h b/include/page-read.h
index 827e4ac..f5a267a 100644
--- a/include/page-read.h
+++ b/include/page-read.h
@@ -51,6 +51,7 @@ struct page_read {
 	/* stop working on current pagemap */
 	void (*put_pagemap)(struct page_read *);
 	void (*close)(struct page_read *);
+	void (*skip_pages)(struct page_read *, unsigned long len);
 
 	/* Private data of reader */
 	struct cr_img *pmi;
@@ -84,6 +85,7 @@ extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flag
 extern void pagemap2iovec(PagemapEntry *pe, struct iovec *iov);
 extern void iovec2pagemap(struct iovec *iov, PagemapEntry *pe);
 extern int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn);
+extern void rewind_pagemap(struct page_read *pr);
 
 extern int dedup_one_iovec(struct page_read *pr, struct iovec *iov);
 extern int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup);
diff --git a/include/restorer.h b/include/restorer.h
index 74be81d..6269e25 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -102,6 +102,8 @@ struct task_restore_args {
 	int				logfd;
 	unsigned int			loglevel;
 
+	int				uffd;
+
 	/* threads restoration */
 	int				nr_threads;		/* number of threads */
 	thread_restore_fcall_t		clone_restore_fn;	/* helper address for clone() call */
diff --git a/include/uffd.h b/include/uffd.h
new file mode 100644
index 0000000..5b8e10d
--- /dev/null
+++ b/include/uffd.h
@@ -0,0 +1,24 @@
+#ifndef __CR_UFFD_H_
+#define __CR_UFFD_H_
+
+#include "config.h"
+
+#ifdef CONFIG_UFFD
+
+#include <syscall.h>
+#include <linux/userfaultfd.h>
+
+#ifndef __NR_userfaultfd
+#error "missing __NR_userfaultfd definition"
+#endif
+
+/*
+ * Path of the unix domain socket the criu lazy-pages server
+ * listens on. The restore process connects to it to transfer
+ * the uffd file descriptor to the lazy-pages server.
+ */
+#define	UFFD_SK "/tmp/userfault.socket"
+
+#endif /* CONFIG_UFFD */
+
+#endif /* __CR_UFFD_H_ */
diff --git a/page-read.c b/page-read.c
index 28ecd5b..5b71d9f 100644
--- a/page-read.c
+++ b/page-read.c
@@ -97,6 +97,14 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
 	pr->cvaddr += len;
 }
 
+/*
+ * rewind_pagemap - experimental helper to restart reading a page image.
+ *
+ * Seeks the raw file descriptor behind pr->pi back to offset 0 and
+ * resets the current virtual address of the reader.
+ */
+void rewind_pagemap(struct page_read *pr)
+{
+	int raw_fd;
+
+	/* This does not work at all. This is only a test */
+	pr_debug("\tpr%u Rewind page-dump\n", pr->id);
+
+	raw_fd = img_raw_fd(pr->pi);
+	lseek(raw_fd, 0, SEEK_SET);
+
+	pr->cvaddr = 0;
+}
+
 int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn)
 {
 	int ret;
@@ -328,6 +336,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 	pr->put_pagemap = put_pagemap;
 	pr->read_pages = read_pagemap_page;
 	pr->close = close_page_read;
+	pr->skip_pages = skip_pagemap_pages;
 	pr->id = ids++;
 
 	pr_debug("Opened page read %u (parent %u)\n",
@@ -350,6 +359,7 @@ open_old:
 	pr->read_pages = read_page;
 	pr->pi = NULL;
 	pr->close = close_page_read;
+	pr->skip_pages = NULL;
 
 	return 1;
 }
diff --git a/pie/restorer.c b/pie/restorer.c
index 4665c5d..4d1df88 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -25,6 +25,7 @@
 #include "image.h"
 #include "sk-inet.h"
 #include "vma.h"
+#include "uffd.h"
 
 #include "crtools.h"
 #include "lock.h"
@@ -555,7 +556,51 @@ static void rst_tcp_socks_all(struct task_restore_args *ta)
 		rst_tcp_repair_off(&ta->tcp_socks[i]);
 }
 
-static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
+
+
+
+/*
+ * enable_uffd - register a memory range with userfaultfd.
+ *
+ * @uffd: userfaultfd file descriptor, or -1 if lazy restore is not used
+ * @addr: start of the range
+ * @len:  length of the range in bytes
+ *
+ * Registers [@addr, @addr + @len) in UFFDIO_REGISTER_MODE_MISSING mode
+ * and drops the pages with MADV_DONTNEED so that the first access
+ * triggers a userfault. Errors are only logged as the function has no
+ * way to report them to its caller.
+ */
+static void enable_uffd(int uffd, unsigned long addr, unsigned long len)
+{
+	/*
+	 * If uffd == -1, this means that userfaultfd is not enabled
+	 * or it is not available.
+	 */
+	if (uffd == -1)
+		return;
+#ifdef CONFIG_UFFD
+	int rc;
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+
+	uffdio_register.range.start = addr;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	/* Output field; initialized so it is never read as garbage */
+	uffdio_register.ioctls = 0;
+	pr_info("lazy-pages: uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
+	pr_info("lazy-pages: uffdio_register.len 0x%llx\n", uffdio_register.range.len);
+	rc = sys_ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
+	pr_info("lazy-pages: ioctl UFFDIO_REGISTER rc %d\n", rc);
+	if (rc) {
+		/*
+		 * Without a successful registration nobody would ever
+		 * fill the range, so do not drop its pages either.
+		 */
+		pr_err("lazy-pages: UFFDIO_REGISTER failed for 0x%lx (len 0x%lx)\n", addr, len);
+		return;
+	}
+	pr_info("lazy-pages: uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
+	pr_info("lazy-pages: uffdio_register.len 0x%llx\n", uffdio_register.range.len);
+
+	expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE);
+
+	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
+		pr_err("lazy-pages: unexpected missing uffd ioctl for anon memory\n");
+	}
+
+	/*
+	 * madvise uffd handled pages as MADV_DONTNEED to guarantee that the
+	 * pages are not mapped; without madvise it will fail.
+	 */
+	if (sys_madvise(uffdio_register.range.start, uffdio_register.range.len, MADV_DONTNEED)) {
+		pr_err("lazy-pages: madvise MADV_DONTNEED uffd pages failed\n");
+	}
+#endif
+
+}
+
+
+static int vma_remap(unsigned long src, unsigned long dst, unsigned long len, int uffd, int flags)
 {
 	unsigned long guard = 0, tmp;
 
@@ -628,6 +673,17 @@ static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
 		return -1;
 	}
 
+	/*
+	 * If running in userfaultfd/lazy-pages mode pages with
+	 * MAP_ANONYMOUS and MAP_PRIVATE are remapped but without the
+	 * real content.
+	 * The function enable_uffd() marks the page(s) as userfaultfd
+	 * pages, so that the processes will hang until the memory is
+	 * injected via userfaultfd.
+	 */
+	if ((flags & MAP_PRIVATE) && (flags & MAP_ANONYMOUS))
+		enable_uffd(uffd, dst, len);
+
 	return 0;
 }
 
@@ -888,6 +944,10 @@ long __export_restore_task(struct task_restore_args *args)
 
 	pr_info("Switched to the restorer %d\n", my_pid);
 
+	if (args->uffd > -1) {
+		pr_debug("lazy-pages: uffd %d\n", args->uffd);
+	}
+
 	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
 		goto core_restore_end;
 
@@ -909,7 +969,7 @@ long __export_restore_task(struct task_restore_args *args)
 			break;
 
 		if (vma_remap(vma_premmaped_start(vma_entry),
-				vma_entry->start, vma_entry_len(vma_entry)))
+				vma_entry->start, vma_entry_len(vma_entry), args->uffd, vma_entry->flags))
 			goto core_restore_end;
 	}
 
@@ -927,10 +987,20 @@ long __export_restore_task(struct task_restore_args *args)
 			break;
 
 		if (vma_remap(vma_premmaped_start(vma_entry),
-				vma_entry->start, vma_entry_len(vma_entry)))
+				vma_entry->start, vma_entry_len(vma_entry), args->uffd, vma_entry->flags))
 			goto core_restore_end;
 	}
 
+	if (args->uffd > -1) {
+		pr_debug("lazy-pages: closing uffd %d\n", args->uffd);
+		/*
+		 * All userfaultfd configuration has finished at this point.
+		 * Let's close the UFFD file descriptor, so that the restored
+		 * process does not have an opened UFFD FD for ever.
+		 */
+		sys_close(args->uffd);
+	}
+
 	/*
 	 * OK, lets try to map new one.
 	 */
diff --git a/uffd.c b/uffd.c
new file mode 100644
index 0000000..8923916
--- /dev/null
+++ b/uffd.c
@@ -0,0 +1,496 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <errno.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/un.h>
+#include <sys/socket.h>
+
+#include "asm/page.h"
+#include "include/log.h"
+#include "include/criu-plugin.h"
+#include "include/page-read.h"
+#include "include/files-reg.h"
+#include "include/mem.h"
+#include "include/uffd.h"
+#include "include/util-pie.h"
+#include "include/pstree.h"
+#include "include/crtools.h"
+#include "xmalloc.h"
+
+#undef  LOG_PREFIX
+#define LOG_PREFIX "lazy-pages: "
+
+/*
+ * server_listen - create the listening socket of the lazy-pages server.
+ *
+ * Removes a possibly existing UFFD_SK, fills @saddr with the address of
+ * UFFD_SK, binds a new AF_UNIX stream socket to it and starts listening.
+ *
+ * Returns the listening file descriptor, or -1 on error.
+ */
+static int server_listen(struct sockaddr_un *saddr)
+{
+	int sk;
+	int addrlen;
+
+	if (strlen(UFFD_SK) >= sizeof(saddr->sun_path))
+		return -1;
+
+	sk = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sk < 0)
+		return -1;
+
+	unlink(UFFD_SK);
+
+	memset(saddr, 0, sizeof(struct sockaddr_un));
+	saddr->sun_family = AF_UNIX;
+	strcpy(saddr->sun_path, UFFD_SK);
+	addrlen = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
+
+	/* bind() first; listen() is only tried if bind() succeeded */
+	if (bind(sk, (struct sockaddr *) saddr, addrlen) < 0 ||
+	    listen(sk, 10) < 0) {
+		close(sk);
+		return -1;
+	}
+
+	return sk;
+}
+
+/*
+ * server_accept - accept one client and validate its socket file.
+ *
+ * @listen: listening socket returned by server_listen()
+ * @saddr:  filled with the address of the connecting peer
+ *
+ * The peer has to be bound to a path name. The socket file behind that
+ * path must be a socket, accessible by its owner only (mode rwx------)
+ * and must not be older than 30 seconds. The file is removed in any
+ * case once its name is known.
+ *
+ * Returns the connected file descriptor, or -1 on error.
+ */
+static int server_accept(int listen, struct sockaddr_un *saddr)
+{
+	int client;
+	socklen_t len;
+	time_t staletime;
+	struct stat statbuf;
+	/* sun_path is not necessarily NUL terminated: keep room for the NUL */
+	char name[sizeof(saddr->sun_path) + 1];
+
+	len = sizeof(struct sockaddr_un);
+	if ((client = accept(listen, (struct sockaddr *) saddr, &len)) < 0)
+		return -1;
+
+	/*
+	 * An unbound peer reports no path at all. Reject it, as well as
+	 * any length which would not fit into 'name'.
+	 */
+	if (len <= offsetof(struct sockaddr_un, sun_path) ||
+	    len > sizeof(struct sockaddr_un)) {
+		close(client);
+		return -1;
+	}
+
+	len -= offsetof(struct sockaddr_un, sun_path);
+	memcpy(name, saddr->sun_path, len);
+	name[len] = 0;
+
+	if (stat(name, &statbuf) < 0)
+		goto out;
+
+	if (S_ISSOCK(statbuf.st_mode) == 0)
+		goto out;
+
+	/* Only the owner may have access, and the owner needs rwx */
+	if ((statbuf.st_mode & (S_IRWXG | S_IRWXO)) || (statbuf.st_mode & S_IRWXU) != S_IRWXU)
+		goto out;
+
+	staletime = time(NULL) - 30;
+	if (statbuf.st_atime < staletime ||
+	    statbuf.st_ctime < staletime || statbuf.st_mtime < staletime) {
+		/* the socket file is older than 30 seconds: treat it as stale */
+		goto out;
+	}
+
+	unlink(name);
+	return client;
+
+out:
+	unlink(name);
+	close(client);
+	return -1;
+}
+
+/*
+ * ud_open - wait for a restore process and receive its userfaultfd.
+ *
+ * Creates the listening socket, accepts exactly one client and reads
+ * the file descriptor the client passes with recv_fd(). Both the
+ * listening and the client socket are closed before returning, on
+ * success as well as on failure.
+ *
+ * Returns the received file descriptor, or -1 on error.
+ */
+static int ud_open(void)
+{
+	int client;
+	int listen;
+	int newfd;
+	struct sockaddr_un saddr;
+
+	if ((listen = server_listen(&saddr)) < 0) {
+		pr_perror("server_listen error");
+		return -1;
+	}
+
+	/* accept new client request */
+	if ((client = server_accept(listen, &saddr)) < 0) {
+		pr_perror("server_accept error: %d", client);
+		close(listen);
+		return -1;
+	}
+
+	pr_debug("client fd %d\n", client);
+	newfd = recv_fd(client);
+	if (newfd < 0)
+		pr_perror("recv_fd error:");
+	else
+		pr_debug("newfd %d\n", newfd);
+
+	/*
+	 * Only one client is served, so neither socket is needed any
+	 * longer; the listening socket used to be leaked on success.
+	 */
+	close(listen);
+	close(client);
+
+	return newfd < 0 ? -1 : newfd;
+}
+
+/*
+ * get_page - read one page of the dumped root task into @dest.
+ *
+ * @addr: virtual address of the page to look up in the page image
+ * @dest: destination buffer; PAGE_SIZE bytes are written to it
+ * @pr:   page reader; unless 'rewind' is defined it is (re)opened for
+ *        root_item->pid.virt and closed again on every call
+ *
+ * The results of the individual steps are only logged, not checked, so
+ * @dest receives whatever is in the local buffer even if a step failed.
+ */
+static void get_page(unsigned long addr, void *dest, struct page_read *pr)
+{
+	struct iovec iov;
+	int ret;
+	/* bounce buffer the page is read into before it is copied to dest */
+	unsigned char buf[PAGE_SIZE];
+
+#ifdef rewind
+	/*
+	 * TODO: The idea behind rewind is to get a function
+	 * which rewinds (seek(fd, 0, SEEK_SET)) the image pointer.
+	 * Unfortunately this does not yet exist and so the image
+	 * is opened and closed for each page retrieved.
+	 */
+	rewind_pagemap(pr);
+#else
+	ret = open_page_read(root_item->pid.virt, pr, PR_TASK | PR_MOD);
+	pr_debug("ret %d\n", ret);
+#endif
+	/* TODO: return code checking */
+	/* fetch the first pagemap entry, then seek to the entry holding addr */
+	ret = pr->get_pagemap(pr, &iov);
+	pr_debug("get_pagemap  ret %d\n", ret);
+	ret = seek_pagemap_page(pr, addr, true);
+	pr_debug("seek_pagemap_page %x\n", ret);
+	/* read exactly one page */
+	ret = pr->read_pages(pr, addr, 1, buf);
+	pr_debug("read_pages ret %d\n", ret);
+	memcpy(dest, buf, PAGE_SIZE);
+#ifndef rewind
+	pr->close(pr);
+#endif
+}
+
+/* The page has already been copied into the restored process */
+#define UFFD_FLAG_SENT	0x1
+/* The page belongs to a VMA marked as VMA_AREA_VDSO */
+#define UFFD_FLAG_VDSO	0x2
+
+/*
+ * One entry per page which has to be handled via userfaultfd.
+ * The entries are linked into the list built by collect_uffd_pages().
+ */
+struct uffd_pages_struct {
+        struct list_head                list;	/* list linkage */
+        unsigned long                   addr;	/* address of the page */
+        int                             flags;	/* UFFD_FLAG_* bits */
+};
+
+/*
+ * uffd_copy_page - push one page into the restored process.
+ *
+ * @uffd:    userfaultfd file descriptor of the restored process
+ * @pr:      page reader used by get_page() to load the page
+ * @address: page aligned destination address in the restored process
+ * @dest:    local buffer of at least one page used as copy source
+ *
+ * Returns the number of bytes copied (one page), 0 if the page was
+ * already present in the destination (UFFDIO_COPY reported EEXIST)
+ * and -1 on error.
+ */
+static int uffd_copy_page(int uffd, struct page_read *pr, __u64 address,
+			  void *dest)
+{
+	struct uffdio_copy uffdio_copy;
+	int rc;
+
+	get_page(address, dest, pr);
+
+	uffdio_copy.dst = address;
+	uffdio_copy.src = (unsigned long) dest;
+	uffdio_copy.len = page_size();
+	uffdio_copy.mode = 0;
+	uffdio_copy.copy = 0;
+
+	pr_debug("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
+	rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
+	pr_debug("ioctl UFFDIO_COPY rc 0x%x\n", rc);
+	pr_debug("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
+	if (rc) {
+		/* real retval in uffdio_copy.copy */
+		if (uffdio_copy.copy != -EEXIST) {
+			pr_err("UFFDIO_COPY error %lld\n", (long long) uffdio_copy.copy);
+			return -1;
+		}
+		/*
+		 * The page is already there, which is fine. Report zero
+		 * copied bytes instead of handing the negative errno to
+		 * the callers, which treat every value < 0 as fatal.
+		 */
+		return 0;
+	}
+
+	if (uffdio_copy.copy != (__s64) page_size()) {
+		pr_err("UFFDIO_COPY unexpected size %lld\n", (long long) uffdio_copy.copy);
+		return -1;
+	}
+
+	return (int) uffdio_copy.copy;
+}
+
+/*
+ * collect_uffd_pages - add the pages of the next pagemap entry to @uffd_list.
+ *
+ * @pr:        page reader positioned in the pagemap of the root task
+ * @uffd_list: list receiving one uffd_pages_struct per relevant page
+ * @vma_size:  incremented by one page size for every page considered
+ *
+ * A page is relevant if it lies in a VMA of the root task which is
+ * MAP_ANONYMOUS and MAP_PRIVATE and not a vsyscall area. Pages of a
+ * VDSO area are additionally flagged with UFFD_FLAG_VDSO.
+ *
+ * Returns 1 if an entry was processed, 0 if get_pagemap() had nothing
+ * more to deliver and -1 if an allocation failed.
+ */
+static int collect_uffd_pages(struct page_read *pr, struct list_head * uffd_list, unsigned long *vma_size)
+{
+	struct vm_area_list *vmas = &rsti(root_item)->vmas;
+	unsigned long psize, start, count;
+	struct iovec iov;
+	int idx;
+
+	if (pr->get_pagemap(pr, &iov) <= 0)
+		return 0;
+
+	psize = page_size();
+	count = iov.iov_len / psize;
+	start = (unsigned long) iov.iov_base;
+	pr_debug("iov.iov_base 0x%lx (%ld pages)\n", start, count);
+
+	if (pr->put_pagemap)
+		pr->put_pagemap(pr);
+
+	for (idx = 0; idx < count; idx++) {
+		unsigned long addr = (unsigned long) iov.iov_base + (idx * psize);
+		struct uffd_pages_struct *entry;
+		struct vma_area *vma;
+		bool wanted = false;
+		bool in_vdso = false;
+
+		/*
+		 * Look for a VMA containing the page which is both
+		 * MAP_ANONYMOUS and MAP_PRIVATE; only those are handled
+		 * by userfaultfd. The address is assumed to be covered
+		 * by the VMA list.
+		 */
+		list_for_each_entry(vma, &vmas->h, list) {
+			if (addr < vma->e->start || addr >= vma->e->end)
+				continue;
+			if (!(vma->e->flags & MAP_ANONYMOUS))
+				continue;
+			if (!(vma->e->flags & MAP_PRIVATE))
+				continue;
+			if (vma_area_is(vma, VMA_AREA_VSYSCALL))
+				continue;
+
+			wanted = true;
+			if (vma_area_is(vma, VMA_AREA_VDSO))
+				in_vdso = true;
+			break;
+		}
+
+		/* This is not a page we are looking for. Move along */
+		if (!wanted)
+			continue;
+
+		pr_debug("Adding 0x%lx to our list\n", addr);
+
+		*vma_size += psize;
+		entry = xzalloc(sizeof(struct uffd_pages_struct));
+		if (!entry)
+			return -1;
+
+		entry->addr = addr;
+		if (in_vdso)
+			entry->flags |= UFFD_FLAG_VDSO;
+		list_add(&entry->list, uffd_list);
+	}
+
+	return 1;
+}
+
+/*
+ *  Setting up criu infrastructure to easily
+ *  access the dump results.
+ *
+ *  The helpers below are called in this fixed order and none of
+ *  their results is checked yet. After the sequence, root_item is
+ *  dereferenced for logging, so it is assumed to be set up by one
+ *  of these helpers (presumably prepare_pstree() - TODO confirm).
+ */
+static void criu_init()
+{
+	/* TODO: return code checking */
+	check_img_inventory();
+	prepare_task_entries();
+	prepare_pstree();
+	collect_remaps_and_regfiles();
+	prepare_shared_reg_files();
+	prepare_remaps();
+	prepare_mm_pid(root_item);
+
+	/* We found a PID */
+	pr_debug("root_item->pid.virt %d\n", root_item->pid.virt);
+	pr_debug("root_item->pid.real %d\n", root_item->pid.real);
+}
+
+/*
+ * uffd_listen - entry point of the 'criu lazy-pages' server.
+ *
+ * Receives the userfaultfd of a 'criu restore --lazy-pages' process,
+ * collects all pages which have to be served via userfaultfd and then
+ * works in two phases:
+ *
+ *  1. request mode: page faults read from the userfaultfd are answered
+ *     by copying the requested page into the restored process. On the
+ *     first request all VDSO pages are pushed before anything else.
+ *  2. copy mode: once no request arrived for 5 seconds, all pages not
+ *     transferred so far are copied into the restored process.
+ *
+ * Returns 0 on success and 1 on error.
+ */
+int uffd_listen(void)
+{
+	__u64 address;
+	void *dest = NULL;
+	__u64 flags;
+	struct uffd_msg msg;
+	struct page_read pr;
+	unsigned long ps;
+	int rc;
+	int ret = 1;
+	fd_set set;
+	struct timeval timeout;
+	int uffd;
+	unsigned long uffd_copied_pages = 0;
+	int uffd_flags;
+	struct uffd_pages_struct *uffd_pages;
+	/* Must start as false: the VDSO pages are pushed exactly once */
+	bool vdso_sent = false;
+	unsigned long vma_size = 0;
+
+	LIST_HEAD(uffd_list);
+
+	pr_debug("Waiting for incoming connections\n");
+	/* Failing to get the uffd is an error, not a successful exit */
+	if ((uffd = ud_open()) < 0)
+		return 1;
+
+	pr_debug("uffd is %d\n", uffd);
+	uffd_flags = fcntl(uffd, F_GETFD, NULL);
+	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
+
+	/* Setting up criu infrastructure to easily access the dump results */
+	criu_init();
+
+	/* All operations will be done on page size */
+	ps = page_size();
+	dest = malloc(ps);
+	if (!dest) {
+		pr_err("Cannot allocate the page buffer\n");
+		goto out;
+	}
+
+#ifdef rewind
+	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK | PR_MOD);
+	pr_debug("open_page_read rc %d\n", rc);
+#endif
+
+
+	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK);
+	if (rc <= 0)
+		goto out;
+	/*
+	 * This puts all pages which should be handled by userfaultfd
+	 * in the list uffd_list. This list is later used to detect if
+	 * a page has already been transferred or if it needs to be
+	 * pushed into the process using userfaultfd.
+	 */
+	do {
+		rc = collect_uffd_pages(&pr, &uffd_list, &vma_size);
+		if (rc == -1) {
+			if (pr.close)
+				pr.close(&pr);
+			goto out;
+		}
+	} while (rc);
+
+	if (pr.close)
+		pr.close(&pr);
+
+	while (1) {
+		bool page_sent = false;
+		/*
+		 * Setting the timeout to 5 seconds. If after this time
+		 * no uffd pages are requested the code switches to
+		 * copying the remaining pages.
+		 *
+		 * Both the fd set and the timeout are modified by
+		 * select() and have to be set up again for every call.
+		 * select(2) says:
+		 *  Consider timeout to be undefined after select() returns.
+		 */
+		FD_ZERO(&set);
+		FD_SET(uffd, &set);
+		timeout.tv_sec = 5;
+		timeout.tv_usec = 0;
+		rc = select(uffd + 1, &set, NULL, NULL, &timeout);
+		if (rc < 0) {
+			/* check errno before any logging can clobber it */
+			if (errno == EINTR)
+				continue;
+			pr_perror("select error");
+			goto out;
+		}
+		pr_debug("select() rc: 0x%x\n", rc);
+		if (rc == 0) {
+			pr_debug("read timeout\n");
+			pr_debug("switching from request to copy mode\n");
+			break;
+		}
+		rc = read(uffd, &msg, sizeof(msg));
+		pr_debug("read() rc: 0x%x\n", rc);
+
+		if (rc != sizeof(msg)) {
+			if (rc < 0)
+				pr_perror("read error");
+			else
+				pr_debug("short read\n");
+			continue;
+		}
+
+		/* Only the pagefault event carries the fields used below */
+		if (msg.event != UFFD_EVENT_PAGEFAULT) {
+			pr_err("unexpected msg event %u\n", msg.event);
+			goto out;
+		}
+
+		/* Round the faulting address down to its page boundary */
+		address = msg.arg.pagefault.address & ~(ps - 1);
+		pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
+
+		/*
+		 * At this point the process on the other side waits for the first page.
+		 * In the first step we will force the vdso pages into the new process.
+		 */
+		if (!vdso_sent) {
+			pr_debug("Pushing VDSO pages once\n");
+			list_for_each_entry(uffd_pages, &uffd_list, list) {
+				if (!(uffd_pages->flags & UFFD_FLAG_VDSO))
+					continue;
+				rc = uffd_copy_page(uffd, &pr, uffd_pages->addr,
+						    dest);
+				if (rc < 0) {
+					pr_err("Error during UFFD copy\n");
+					goto out;
+				}
+				vma_size -= rc;
+				uffd_copied_pages++;
+				uffd_pages->flags |= UFFD_FLAG_SENT;
+			}
+			vdso_sent = true;
+		}
+
+		/* Make sure to not transfer a page twice */
+		list_for_each_entry(uffd_pages, &uffd_list, list) {
+			if ((uffd_pages->addr == address) &&
+			    (uffd_pages->flags & UFFD_FLAG_SENT)) {
+				page_sent = true;
+				break;
+			}
+		}
+
+		if (page_sent)
+			continue;
+
+		/* Now handle the pages actually requested. */
+
+		flags = msg.arg.pagefault.flags;
+		pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
+
+		rc = uffd_copy_page(uffd, &pr, address, dest);
+		if (rc < 0) {
+			pr_err("Error during UFFD copy\n");
+			goto out;
+		}
+		vma_size -= rc;
+		uffd_copied_pages++;
+
+		/*
+		 * Mark this page as having been already transferred, so
+		 * that it has not to be copied again later.
+		 */
+		list_for_each_entry(uffd_pages, &uffd_list, list) {
+			if (uffd_pages->addr == address)
+				uffd_pages->flags |= UFFD_FLAG_SENT;
+		}
+	}
+	pr_debug("remaining vma_size: 0x%lx\n", vma_size);
+	pr_debug("uffd_copied_pages:    %ld\n", uffd_copied_pages);
+	/* Copy mode: push everything which has not been requested so far */
+	list_for_each_entry(uffd_pages, &uffd_list, list) {
+		pr_debug("Checking remaining pages 0x%lx (flags 0x%x)\n",
+			 uffd_pages->addr, uffd_pages->flags);
+		if (uffd_pages->flags & UFFD_FLAG_SENT)
+			continue;
+
+		rc = uffd_copy_page(uffd, &pr, uffd_pages->addr, dest);
+		if (rc < 0) {
+			pr_err("Error during UFFD copy\n");
+			goto out;
+		}
+		vma_size -= rc;
+
+		pr_debug("remaining vma_size: 0x%lx\n", vma_size);
+		pr_debug("uffd_copied_pages:    %ld\n", ++uffd_copied_pages);
+		uffd_pages->flags |= UFFD_FLAG_SENT;
+	}
+	ret = 0;
+out:
+	/* Release the page buffer and the uffd on every exit path */
+	free(dest);
+	close(uffd);
+	return ret;
+}
-- 
1.8.3.1



More information about the CRIU mailing list