[CRIU] [PATCH] Try to include userfaultfd with criu

Adrian Reber adrian at lisas.de
Mon Nov 16 06:31:15 PST 2015


From: Adrian Reber <areber at redhat.com>

This is a first try to include userfaultfd with criu. Right now it
still requires a "normal" checkpoint. After checkpointing the application
it can be restored with the help of userfaultfd.

All restored pages with MAP_ANONYMOUS set are marked as being handled by
userfaultfd and also madvise()'d as MADV_DONTNEED (still need to
understand why MADV_DONTNEED is necessary).

As soon as the process is restored it blocks on the first memory access
and waits for pages being transferred by userfaultfd.

To handle the required pages a new criu command has been added. The restore
works now like this:

  criu restore -D /tmp/3 -j -v4 --lazy-pages

This hangs after the restored process is running and needs:

  criu uffd -v4 -D /tmp/3/

This connects to the unix domain socket /tmp/userfault.socket, receives the
UFFD FD passed by the 'criu restore' process and waits on it for UFFD requests.
For my current test program the following pages are transmitted over UFFD:

 uffdio_copy.dst 0x7ffdeaff9000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845e88000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7ffdeafa6000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845e95000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845e92000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845c70000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845c6d000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x1790000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

The use case of using userfaultfd with a checkpointed process on a remote
machine will probably benefit from the current work related to
image-cache and image-proxy.

For the final implementation it would be nice to have a restore running
in uffd mode on one system which requests the memory pages over the
network from another system which is running 'criu checkpoint' also in
uffd mode. This way the pages need to be copied only 'once' from the
checkpoint process to the uffd restore process.

TODO:
  * What happens with pages which have not been requested via uffd
    during a certain timeframe? How can pages be forced into the
    restored process?
  * Still contains many debug outputs which need to be cleaned up.

v2:
    * provide option '--lazy-pages' to enable uffd style restore
    * use send_fd()/recv_fd() provided by criu (instead of own
      implementation)
    * do not install the uffd as service_fd
    * use named constants for MAP_ANONYMOUS
    * do not restore memory pages and then later mark them as uffd
      handled
    * remove function find_pages() to search in pages-<id>.img;
      now using criu functions to find the necessary pages;
      for each new page search the pages-<id>.img file is opened
    * only check the UFFDIO_API once
    * trying to protect uffd code by CONFIG_UFFD;
      use make UFFD=1 to compile criu with this patch

Signed-off-by: Adrian Reber <areber at redhat.com>
---
 Makefile             |   4 +
 Makefile.config      |   3 +
 Makefile.crtools     |   3 +
 cr-restore.c         | 150 ++++++++++++++++++++++++++++++++++-
 crtools.c            |  20 +++++
 include/cr_options.h |   1 +
 include/crtools.h    |   2 +
 include/page-read.h  |   2 +
 include/restorer.h   |   2 +
 include/uffd.h       |  18 +++++
 include/util-pie.h   |   2 +-
 page-read.c          |  13 ++++
 pie/restorer.c       |  77 +++++++++++++++++-
 uffd.c               | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++
 14 files changed, 508 insertions(+), 5 deletions(-)
 create mode 100644 include/uffd.h
 create mode 100644 uffd.c

diff --git a/Makefile b/Makefile
index aeface3..45634e7 100644
--- a/Makefile
+++ b/Makefile
@@ -163,6 +163,10 @@ ifeq ($(GMON),1)
 	GMONLDOPT = -pg
 endif
 
+ifeq ($(UFFD),1)
+	DEFINES += -DUFFD
+endif
+
 CFLAGS		+= $(WARNINGS) $(DEFINES)
 SYSCALL-LIB	:= $(ARCH_DIR)/syscalls.built-in.o
 ARCH-LIB	:= $(ARCH_DIR)/crtools.built-in.o
diff --git a/Makefile.config b/Makefile.config
index ce4b8d8..3df8f53 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -44,6 +44,9 @@ endif
 ifeq ($(piegen-y),y)
 	$(Q) @echo '#define CONFIG_PIEGEN' >> $@
 endif
+ifeq ($(UFFD),1)
+	$(Q) @echo '#define CONFIG_UFFD' >> $@
+endif
 	$(Q) @echo '#endif /* __CR_CONFIG_H__ */' >> $@
 
 config: $(CONFIG)
diff --git a/Makefile.crtools b/Makefile.crtools
index 847b11d..235a85d 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -79,6 +79,9 @@ obj-y	+= seize.o
 obj-y	+= fault-injection.o
 obj-y	+= pie/util-fd.o
 obj-y	+= pie/util.o
+ifeq ($(UFFD),1)
+obj-y	+= uffd.o
+endif
 
 ifneq ($(MAKECMDGOALS),clean)
 incdeps := y
diff --git a/cr-restore.c b/cr-restore.c
index c132588..43d4ce0 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -19,6 +19,7 @@
 #include <sys/shm.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
+#include <sys/syscall.h>
 
 #include <sched.h>
 
@@ -78,6 +79,8 @@
 #include "seccomp.h"
 #include "bitmap.h"
 #include "fault-injection.h"
+#include "uffd.h"
+
 #include "parasite-syscall.h"
 
 #include "protobuf.h"
@@ -463,6 +466,16 @@ static int restore_priv_vma_content(void)
 			p = decode_pointer((off) * PAGE_SIZE +
 					vma->premmaped_addr);
 
+			/*
+			 * This means that userfaultfd is used to load the pages
+			 * on demand.
+			 */
+			if (opts.lazy_pages && (vma->e->flags & MAP_ANONYMOUS)) {
+				pr_debug("Lazy restore skips %lx\n", vma->e->start);
+				pr.skip_pages(&pr, PAGE_SIZE);
+				continue;
+			}
+
 			set_bit(off, vma->page_bitmap);
 			if (vma->ppage_bitmap) { /* inherited vma */
 				clear_bit(off, vma->ppage_bitmap);
@@ -1983,7 +1996,7 @@ out:
 	return -1;
 }
 
-static int prepare_task_entries(void)
+int prepare_task_entries(void)
 {
 	task_entries_pos = rst_mem_cpos(RM_SHREMAP);
 	task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
@@ -2699,6 +2712,125 @@ out:
 extern void __gcov_flush(void) __attribute__((weak));
 void __gcov_flush(void) {}
 
+/*
+ * Create the UNIX stream socket on which 'criu restore' waits for the
+ * uffd daemon: bind it to UFFD_SK (removing a stale file of that name
+ * first) and start listening. *saddr is filled with the bound address.
+ * Returns the listening descriptor, or -1 on failure.
+ */
+static int server_listen(struct sockaddr_un *saddr)
+{
+	int sk;
+	int addrlen;
+
+	if (strlen(UFFD_SK) >= sizeof(saddr->sun_path))
+		return -1;
+
+	sk = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sk < 0)
+		return -1;
+
+	unlink(UFFD_SK);
+
+	memset(saddr, 0, sizeof(struct sockaddr_un));
+	saddr->sun_family = AF_UNIX;
+	strcpy(saddr->sun_path, UFFD_SK);
+	addrlen = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
+
+	/* success path first; a negative result from either call is an error */
+	if (bind(sk, (struct sockaddr *) saddr, addrlen) >= 0 &&
+	    listen(sk, 10) >= 0)
+		return sk;
+
+	close(sk);
+	return -1;
+}
+
+/*
+ * Accept one connection on the listening socket and vet the peer: it must
+ * be bound to a socket file accessible by its owner only (mode rwx------)
+ * whose timestamps are at most 30 seconds old. The peer's socket file is
+ * unlinked either way. Returns the connected descriptor, or -1 on failure.
+ */
+static int server_accept(int listen, struct sockaddr_un *saddr)
+{
+	int client;
+	socklen_t len = sizeof(struct sockaddr_un);
+	time_t staletime;
+	struct stat statbuf;
+	/* sun_path need not be NUL-terminated; leave room for a terminator */
+	char name[sizeof(saddr->sun_path) + 1];
+
+	client = accept(listen, (struct sockaddr *) saddr, &len);
+	if (client < 0)
+		return -1;
+
+	/* accept() may report a length beyond the buffer or without any path */
+	if (len > sizeof(struct sockaddr_un))
+		len = sizeof(struct sockaddr_un);
+	if (len < offsetof(struct sockaddr_un, sun_path))
+		len = offsetof(struct sockaddr_un, sun_path);
+	len -= offsetof(struct sockaddr_un, sun_path);
+	memcpy(name, saddr->sun_path, len);
+	name[len] = 0;
+
+	if (stat(name, &statbuf) < 0)
+		goto out;
+	if (S_ISSOCK(statbuf.st_mode) == 0)
+		goto out;
+	if ((statbuf.st_mode & (S_IRWXG | S_IRWXO)) ||
+	    (statbuf.st_mode & S_IRWXU) != S_IRWXU)
+		goto out;
+
+	/* refuse a socket inode that is older than 30 seconds */
+	staletime = time(NULL) - 30;
+	if (statbuf.st_atime < staletime ||
+	    statbuf.st_ctime < staletime || statbuf.st_mtime < staletime)
+		goto out;
+
+	unlink(name);
+	return client;
+
+out:
+	unlink(name);
+	close(client);
+	return -1;
+}
+
+/*
+ * Pass the userfaultfd descriptor 'sendfd' to the uffd daemon: listen on
+ * UFFD_SK, block until one client has connected and been accepted, then
+ * send the descriptor over that connection. Returns 0 on success, else -1.
+ */
+static int send_uffd(int sendfd)
+{
+	struct sockaddr_un saddr;
+	int srv, peer, ret;
+
+	if (sendfd < 0)
+		return -1;
+
+	srv = server_listen(&saddr);
+	if (srv < 0) {
+		pr_perror("server_listen error");
+		return -1;
+	}
+
+	peer = server_accept(srv, &saddr);
+	if (peer < 0) {
+		pr_perror("server_accept error: %d", peer);
+		close(srv);
+		return -1;
+	}
+
+	ret = send_fd(peer, NULL, 0, sendfd) < 0 ? -1 : 0;
+	if (ret)
+		pr_perror("send_fd error:");
+	close(srv);
+	close(peer);
+	return ret;
+}
+
 static int sigreturn_restore(pid_t pid, CoreEntry *core)
 {
 	void *mem = MAP_FAILED;
@@ -2980,6 +3112,22 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 
 	strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
 
+	if (!opts.lazy_pages)
+		task_args->uffd = -1;
+
+#ifdef CONFIG_UFFD
+	/*
+	 * Open userfaulfd FD which is passed to the restorer blob and
+	 * to a second process handling the userfaultfd page faults.
+	 */
+	task_args->uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
+	pr_info("uffd %d\n", task_args->uffd);
+
+	if (send_uffd(task_args->uffd) < 0) {
+		close(task_args->uffd);
+		goto err;
+	}
+#endif
 
 	/*
 	 * Fill up per-thread data.
diff --git a/crtools.c b/crtools.c
index d3812a1..1c6b5ab 100644
--- a/crtools.c
+++ b/crtools.c
@@ -253,6 +253,9 @@ int main(int argc, char *argv[], char *envp[])
 		{ "freeze-cgroup",		required_argument,	0, 1068 },
 		{ "ghost-limit",		required_argument,	0, 1069 },
 		{ "irmap-scan-path",		required_argument,	0, 1070 },
+#ifdef CONFIG_UFFD
+		{ "lazy-pages",			no_argument,		0, 1071 },
+#endif
 		{ },
 	};
 
@@ -498,6 +501,11 @@ int main(int argc, char *argv[], char *envp[])
 			if (irmap_scan_path_add(optarg))
 				return -1;
 			break;
+#ifdef CONFIG_UFFD
+		case 1071:
+			opts.lazy_pages = true;
+			break;
+#endif
 		case 'M':
 			{
 				char *aux;
@@ -629,6 +637,9 @@ int main(int argc, char *argv[], char *envp[])
 		return ret != 0;
 	}
 
+	if (!strcmp(argv[optind], "uffd"))
+		return uffd_listen() != 0;
+
 	if (!strcmp(argv[optind], "show"))
 		return cr_show(pid) != 0;
 
@@ -672,6 +683,9 @@ usage:
 "  criu page-server\n"
 "  criu service [<options>]\n"
 "  criu dedup\n"
+#ifdef CONFIG_UFFD
+"  criu uffd -D DIR [<options>]\n"
+#endif
 "\n"
 "Commands:\n"
 "  dump           checkpoint a process/tree identified by pid\n"
@@ -710,6 +724,12 @@ usage:
 "                        restore making it the parent of the restored process\n"
 "  --freeze-cgroup\n"
 "                        use cgroup freezer to collect processes\n"
+#ifdef CONFIG_UFFD
+"  --lazy-pages          restore pages on demand\n"
+"                        this requires running a second instance of criu\n"
+"                        in uffd mode: 'criu uffd -D DIR'\n"
+"                        --lazy-pages and uffd mode require userfaultfd\n"
+#endif
 "\n"
 "* Special resources support:\n"
 "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
diff --git a/include/cr_options.h b/include/cr_options.h
index eac7283..93a1f1a 100644
--- a/include/cr_options.h
+++ b/include/cr_options.h
@@ -95,6 +95,7 @@ struct cr_options {
 	bool			overlayfs;
 	size_t			ghost_limit;
 	struct list_head	irmap_scan_paths;
+	bool			lazy_pages;
 };
 
 extern struct cr_options opts;
diff --git a/include/crtools.h b/include/crtools.h
index bbed0ef..9e98e92 100644
--- a/include/crtools.h
+++ b/include/crtools.h
@@ -23,6 +23,8 @@ extern int convert_to_elf(char *elf_path, int fd_core);
 extern int cr_check(void);
 extern int cr_exec(int pid, char **opts);
 extern int cr_dedup(void);
+extern int uffd_listen(void);
+extern int prepare_task_entries(void);
 
 extern int check_add_feature(char *arg);
 
diff --git a/include/page-read.h b/include/page-read.h
index 827e4ac..f5a267a 100644
--- a/include/page-read.h
+++ b/include/page-read.h
@@ -51,6 +51,7 @@ struct page_read {
 	/* stop working on current pagemap */
 	void (*put_pagemap)(struct page_read *);
 	void (*close)(struct page_read *);
+	void (*skip_pages)(struct page_read *, unsigned long len);
 
 	/* Private data of reader */
 	struct cr_img *pmi;
@@ -84,6 +85,7 @@ extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flag
 extern void pagemap2iovec(PagemapEntry *pe, struct iovec *iov);
 extern void iovec2pagemap(struct iovec *iov, PagemapEntry *pe);
 extern int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn);
+extern void rewind_pagemap(struct page_read *pr);
 
 extern int dedup_one_iovec(struct page_read *pr, struct iovec *iov);
 extern int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup);
diff --git a/include/restorer.h b/include/restorer.h
index afcaf68..6b998db 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -102,6 +102,8 @@ struct task_restore_args {
 	int				logfd;
 	unsigned int			loglevel;
 
+	int				uffd;
+
 	/* threads restoration */
 	int				nr_threads;		/* number of threads */
 	thread_restore_fcall_t		clone_restore_fn;	/* helper address for clone() call */
diff --git a/include/uffd.h b/include/uffd.h
new file mode 100644
index 0000000..8ab2d40
--- /dev/null
+++ b/include/uffd.h
@@ -0,0 +1,18 @@
+#ifndef __CR_UFFD_H_
+#define __CR_UFFD_H_
+
+#include "config.h"
+
+#ifdef CONFIG_UFFD
+
+#include <linux/userfaultfd.h>
+
+#ifndef __NR_userfaultfd
+#error "missing __NR_userfaultfd definition"
+#endif
+
+#define	UFFD_SK "/tmp/userfault.socket"
+
+#endif /* UFFD */
+
+#endif /* __CR_UFFD_H_ */
diff --git a/include/util-pie.h b/include/util-pie.h
index cbaed42..c086bbc 100644
--- a/include/util-pie.h
+++ b/include/util-pie.h
@@ -56,7 +56,7 @@ static inline int recv_fd(int sock)
 
 	ret = recv_fds(sock, &fd, 1, NULL);
 	if (ret)
-		return -1;
+		return ret;
 
 	return fd;
 }
diff --git a/page-read.c b/page-read.c
index 28ecd5b..47cb2c6 100644
--- a/page-read.c
+++ b/page-read.c
@@ -97,6 +97,13 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
 	pr->cvaddr += len;
 }
 
+void rewind_pagemap(struct page_read *pr)	/* restart pr at offset 0 of its pages image */
+{
+	pr_debug("\tpr%u Rewind page-dump\n", pr->id);
+	lseek(img_raw_fd(pr->pi), 0, SEEK_SET);	/* NOTE(review): result unchecked; the open_old path of open_page_read_at() leaves pr->pi NULL - confirm callers */
+	pr->cvaddr = 0;	/* reset the position that skip_pagemap_pages() advances */
+}
+
 int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn)
 {
 	int ret;
@@ -118,6 +125,10 @@ int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn)
 		}
 		iov_end = (unsigned long)iov.iov_base + iov.iov_len;
 
+		pr_debug("vaddr %lx\n", vaddr);
+		pr_debug("iov_end %lx\n", iov_end);
+		pr_debug("pr->cvaddr %lx\n", pr->cvaddr);
+
 		if (iov_end <= vaddr) {
 			skip_pagemap_pages(pr, iov_end - pr->cvaddr);
 			put_pagemap(pr);
@@ -328,6 +339,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 	pr->put_pagemap = put_pagemap;
 	pr->read_pages = read_pagemap_page;
 	pr->close = close_page_read;
+	pr->skip_pages = skip_pagemap_pages;
 	pr->id = ids++;
 
 	pr_debug("Opened page read %u (parent %u)\n",
@@ -350,6 +362,7 @@ open_old:
 	pr->read_pages = read_page;
 	pr->pi = NULL;
 	pr->close = close_page_read;
+	pr->skip_pages = NULL;
 
 	return 1;
 }
diff --git a/pie/restorer.c b/pie/restorer.c
index 26494f9..29b14df 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -3,6 +3,7 @@
 
 #include <linux/securebits.h>
 #include <linux/capability.h>
+#include <linux/userfaultfd.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
@@ -534,7 +535,47 @@ static void rst_tcp_socks_all(struct task_restore_args *ta)
 		rst_tcp_repair_off(&ta->tcp_socks[i]);
 }
 
-static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
+
+
+
+/*
+ * Register [addr, addr + len) with the userfaultfd 'uffd' for missing-page
+ * faults and drop the pages currently backing it, so that later accesses
+ * fault and get resolved by the uffd daemon. A no-op when uffd is -1
+ * (lazy restore is off) or when built without CONFIG_UFFD.
+ */
+static void enable_uffd(int uffd, unsigned long addr, unsigned long len)
+{
+#ifdef CONFIG_UFFD
+	int rc;
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+
+	if (uffd == -1)
+		return;
+
+	uffdio_register.range.start = addr;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	pr_info("uffd register start 0x%lx len 0x%lx\n", addr, len);
+	rc = sys_ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
+	if (rc) {
+		/* .ioctls is not filled in on failure and there is nothing to enable */
+		pr_err("ioctl UFFDIO_REGISTER failed rc %d\n", rc);
+		return;
+	}
+
+	expected_ioctls = (1UL << _UFFDIO_WAKE) | (1UL << _UFFDIO_COPY) | (1UL << _UFFDIO_ZEROPAGE);
+	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
+		pr_info("unexpected missing ioctl for anon memory\n");
+
+	if (sys_madvise(uffdio_register.range.start, uffdio_register.range.len, MADV_DONTNEED))
+		pr_err("madvise MADV_DONTNEED failed\n");
+#endif
+}
+
+
+static int vma_remap(unsigned long src, unsigned long dst, unsigned long len, int uffd, int flags)
 {
 	unsigned long guard = 0, tmp;
 
@@ -607,6 +648,17 @@ static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
 		return -1;
 	}
 
+	/*
+	 * If running in userfaultfd/lazy-pages mode pages with
+	 * MAP_ANONYMOUS are remapped but without the real content.
+	 * The function enable_uffd() marks the page(s) as userfaultfd
+	 * pages, so that the processes will hang until the memory is
+	 * injected via userfaultfd.
+	 */
+
+	if (flags & MAP_ANONYMOUS)
+		enable_uffd(uffd, dst, len);
+
 	return 0;
 }
 
@@ -832,6 +884,9 @@ long __export_restore_task(struct task_restore_args *args)
 	int i;
 	VmaEntry *vma_entry;
 	unsigned long va;
+	int uffd_flags;
+	struct uffdio_api uffdio_api;
+	int rc;
 
 	struct rt_sigframe *rt_sigframe;
 	struct prctl_mm_map prctl_map;
@@ -867,6 +922,22 @@ long __export_restore_task(struct task_restore_args *args)
 
 	pr_info("Switched to the restorer %d\n", my_pid);
 
+	if (args->uffd > -1) {
+		pr_info("logfd %d\n", args->logfd);
+		pr_info("uffd %d\n", args->uffd);
+
+		uffd_flags = sys_fcntl(args->uffd, F_GETFD, 0);
+		pr_info("uffd_flags %d\n", uffd_flags);
+		pr_info("UFFD_API 0x%llx\n", UFFD_API);
+		uffdio_api.api = UFFD_API;
+		uffdio_api.features = 0;
+		rc = sys_ioctl(args->uffd, UFFDIO_API, &uffdio_api);
+		pr_info("ioctl UFFDIO_API rc %d\n", rc);
+		pr_info("uffdio_api.api 0x%llx\n", uffdio_api.api);
+		pr_info("uffdio_api.features 0x%llx\n", uffdio_api.features);
+	}
+
+
 	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
 		goto core_restore_end;
 
@@ -888,7 +959,7 @@ long __export_restore_task(struct task_restore_args *args)
 			break;
 
 		if (vma_remap(vma_premmaped_start(vma_entry),
-				vma_entry->start, vma_entry_len(vma_entry)))
+				vma_entry->start, vma_entry_len(vma_entry), args->uffd, vma_entry->flags))
 			goto core_restore_end;
 	}
 
@@ -906,7 +977,7 @@ long __export_restore_task(struct task_restore_args *args)
 			break;
 
 		if (vma_remap(vma_premmaped_start(vma_entry),
-				vma_entry->start, vma_entry_len(vma_entry)))
+				vma_entry->start, vma_entry_len(vma_entry), args->uffd, vma_entry->flags))
 			goto core_restore_end;
 	}
 
diff --git a/uffd.c b/uffd.c
new file mode 100644
index 0000000..52ffc74
--- /dev/null
+++ b/uffd.c
@@ -0,0 +1,216 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <errno.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/un.h>
+#include <sys/socket.h>
+
+#include "asm/page.h"
+#include "include/log.h"
+#include "include/criu-plugin.h"
+#include "include/page-read.h"
+#include "include/uffd.h"
+#include "include/util-pie.h"
+#include "include/pstree.h"
+#include "include/crtools.h"
+
+#define CLI_PATH "/var/tmp/"
+
+/*
+ * Connect to the 'criu restore' process listening on UFFD_SK and receive
+ * the userfaultfd descriptor it passes. The client end is first bound to
+ * CLI_PATH<pid> and made accessible to its owner only.
+ * Returns the received descriptor, or a negative value on failure.
+ */
+static int ud_open(void)
+{
+	int fd, newfd, len;
+	struct sockaddr_un un, sun;
+
+	if (strlen(UFFD_SK) >= sizeof(un.sun_path))
+		return -1;
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+		return -1;
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	snprintf(un.sun_path, sizeof(un.sun_path), "%s%05ld", CLI_PATH, (long) getpid());
+	pr_info("file is %s\n", un.sun_path);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(un.sun_path);
+
+	/* remove it, if it exists */
+	unlink(un.sun_path);
+	if (bind(fd, (struct sockaddr *) &un, len) < 0) {
+		pr_perror("bind failed");
+		goto out;
+	}
+	if (chmod(un.sun_path, S_IRWXU) < 0) {
+		pr_perror("chmod failed");
+		goto out;
+	}
+
+	memset(&sun, 0, sizeof(sun));
+	sun.sun_family = AF_UNIX;
+	strcpy(sun.sun_path, UFFD_SK);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
+	if (connect(fd, (struct sockaddr *) &sun, len) < 0) {
+		pr_perror("connect failed");
+		goto out;
+	}
+
+	pr_debug("fd %d\n", fd);
+	newfd = recv_fd(fd);
+	pr_debug("newfd %d\n", newfd);
+	close(fd);
+	return newfd;
+
+out:
+	/* the caller tests for < 0, so a failure must not look like a valid fd */
+	close(fd);
+	unlink(un.sun_path);
+	return -1;
+}
+
+/* Read the page at addr from the task's page images into dest (zeroed first). */
+#ifdef rewind
+static void get_page(unsigned long addr, void *dest, struct page_read *pr)
+#else
+static void get_page(unsigned long addr, void *dest, struct page_read *pr, int pid)
+#endif
+{
+	struct iovec iov;
+	int map_rc, seek_rc, read_rc;
+	memset(dest, 0, PAGE_SIZE);	/* a failed lookup must not leave stale data */
+#ifdef rewind
+	rewind_pagemap(pr);
+#else
+	if (open_page_read(pid, pr, PR_TASK | PR_MOD) <= 0)
+		return;	/* NOTE(review): assumes > 0 means success - confirm */
+#endif
+	map_rc = pr->get_pagemap(pr, &iov);
+	seek_rc = seek_pagemap_page(pr, addr, true);
+	read_rc = pr->read_pages(pr, addr, 1, dest);
+	pr_debug("get_pagemap %d seek_pagemap_page %d read_pages %d\n", map_rc, seek_rc, read_rc);
+#ifndef rewind
+	pr->close(pr);	/* opened on every call above; do not leak it */
+#endif
+}
+
+/*
+ * Entry point of 'criu uffd': get the userfaultfd of the task restored with
+ * --lazy-pages and answer each of its page faults by copying the page from
+ * the images with UFFDIO_COPY. The loop only ends on an error, so this
+ * returns 1 (failure) whenever it returns.
+ */
+int uffd_listen(void)
+{
+	struct uffd_msg msg;
+	struct page_read pr;
+	struct uffdio_copy uffdio_copy;
+	unsigned long ps;
+	__u64 flags, address;
+	void *dest = NULL;
+	int uffd, rc;
+
+	uffd = ud_open();
+	if (uffd < 0) {
+		pr_err("cannot get the userfaultfd from the restore process\n");
+		return 1;
+	}
+
+	pr_info("uffd %d\n", uffd);
+	pr_info("UFFD_API 0x%llx\n", UFFD_API);
+	/* NOTE(review): assumes these helpers return a negative value on failure */
+	if (check_img_inventory() < 0 || prepare_task_entries() < 0 ||
+	    prepare_pstree() < 0 || !root_item) {
+		pr_err("cannot load the process tree from the images\n");
+		goto out;
+	}
+	pr_info("root_item->pid.virt %d\n", root_item->pid.virt);
+	pr_info("root_item->pid.real %d\n", root_item->pid.real);
+
+	ps = page_size();
+	dest = malloc(ps);
+	if (!dest) {
+		pr_err("cannot allocate the page buffer\n");
+		goto out;
+	}
+
+#ifndef rewind
+	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK | PR_MOD);
+	pr_debug("open_page_read rc %d\n", rc);
+#endif
+
+	while (1) {
+		rc = read(uffd, &msg, sizeof(msg));
+		if (rc < 0) {
+			/* EINVAL: UFFDIO_API not done by the restorer yet, see userfaultfd(2) */
+			if (errno == EINTR || errno == EAGAIN || errno == EINVAL)
+				continue;
+			pr_perror("blocking read error");
+			break;
+		}
+		if (rc != sizeof(msg)) {
+			/* includes 0 (end of file), where retrying would spin forever */
+			pr_err("short read from userfaultfd: %d\n", rc);
+			break;
+		}
+
+		flags = msg.arg.pagefault.flags;
+		address = msg.arg.pagefault.address & ~((__u64)ps - 1);
+		pr_info("msg.event 0x%x\n", msg.event);
+		pr_info("msg.arg.pagefault.flags 0x%llx\n", flags);
+
+		if (msg.event != UFFD_EVENT_PAGEFAULT) {
+			pr_err("unexpected msg event %u\n", msg.event);
+			break;
+		}
+
+		if (flags & UFFD_PAGEFAULT_FLAG_WRITE)
+			pr_info("unexpected write fault\n");
+
+		pr_info("msg.arg.pagefault.address 0x%llx\n", msg.arg.pagefault.address);
+		pr_info("msg.arg.pagefault.address aligned 0x%llx\n", address);
+
+#ifdef rewind
+		get_page(address, dest, &pr);
+#else
+		get_page(address, dest, &pr, root_item->pid.virt);
+#endif
+
+		uffdio_copy.dst = address;
+		uffdio_copy.src = (unsigned long) dest;
+		uffdio_copy.len = ps;
+		uffdio_copy.mode = 0;
+		uffdio_copy.copy = 0;
+
+		pr_info("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
+		rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
+		pr_info("ioctl UFFDIO_COPY rc 0x%x\n", rc);
+		pr_info("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
+		if (rc) {
+			/* real retval in uffdio_copy.copy; -EEXIST: page already present */
+			if (uffdio_copy.copy != -EEXIST) {
+				pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
+				break;
+			}
+		} else if (uffdio_copy.copy != ps) {
+			pr_err("UFFDIO_COPY unexpected copy %Ld\n", uffdio_copy.copy);
+			break;
+		}
+	}
+
+out:
+	free(dest);
+	close(uffd);
+	return 1;
+}
-- 
1.8.3.1



More information about the CRIU mailing list