[CRIU] [RFC PATCH] Try to include userfaultfd with criu

adrian at lisas.de adrian at lisas.de
Thu Sep 24 06:04:42 PDT 2015


From: Adrian Reber <areber at redhat.com>

This is a first try to include userfaultfd with criu. Right now it
still requires a "normal" checkpoint. After checkpointing the application
it can be restored with the help of userfaultfd.

The normal restore still copies all pages from the checkpoint to
the right location (this still needs to be disabled) and the usage
of userfaultfd is right now hardcoded in this patch.

All restored pages with MAP_ANONYMOUS set are marked as being handled by
userfaultfd and also madvise()'d as MADV_DONTNEED.

If I also enable userfaultfd for pages without MAP_ANONYMOUS, the restored process
segfaults. I have not yet looked in detail into why it segfaults.

As soon as the process is restored it blocks on the first memory access
and waits for pages being transferred by userfaultfd.

To handle the required pages a new criu command has been added. The restore
works now like this:

  criu restore -D /tmp/3 -j -v4

The restore then blocks once the restored process is running, until this is started:

  criu uffd -v4 -D /tmp/3/

This waits for UFFD requests on a UFFD FD which has been passed by the 'criu restore'
process over a unix domain socket. For my current test program the following
pages are transmitted over UFFD:

 uffdio_copy.dst 0x7ffdeaff9000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845e88000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7ffdeafa6000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845e95000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845e92000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845c70000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x7fb845c6d000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

 uffdio_copy.dst 0x1790000
 ioctl UFFDIO_COPY rc 0x0
 uffdio_copy.copy 0x1000

The following pages are not handled by uffd:

 00400000-00401000 r-xp 00000000 00:23 17349189                           /share/minimal
 00600000-00601000 r--p 00000000 00:23 17349189                           /share/minimal
 00601000-00602000 rw-p 00001000 00:23 17349189                           /share/minimal
 7fb8458b1000-7fb845a67000 r-xp 00000000 fd:01 16820592                   /usr/lib64/libc-2.17.so
 7fb845a67000-7fb845c67000 ---p 001b6000 fd:01 16820592                   /usr/lib64/libc-2.17.so
 7fb845c67000-7fb845c6b000 r--p 001b6000 fd:01 16820592                   /usr/lib64/libc-2.17.so
 7fb845c6b000-7fb845c6d000 rw-p 001ba000 fd:01 16820592                   /usr/lib64/libc-2.17.so
 7fb845c72000-7fb845c93000 r-xp 00000000 fd:01 16820585                   /usr/lib64/ld-2.17.so
 7fb845e93000-7fb845e94000 r--p 00021000 fd:01 16820585                   /usr/lib64/ld-2.17.so
 7fb845e94000-7fb845e95000 rw-p 00022000 fd:01 16820585                   /usr/lib64/ld-2.17.so

The use case of using userfaultfd with a checkpointed process on a remote
machine will probably benefit from the current work related to
image-cache and image-proxy.

TODO:
 * Provide a restore parameter so that UFFD is not always used, as described in the wiki:
   http://criu.org/Userfaultfd : --lazy-pages

 * Do not restore pages which will be transferred via UFFD.
   They still need to be mapped.

 * Unix domain sockets with credentials.

Signed-off-by: Adrian Reber <areber at redhat.com>
---
 Makefile.crtools    |   1 +
 cr-restore.c        | 158 ++++++++++++++++++++++++++
 crtools.c           |   3 +
 include/crtools.h   |   1 +
 include/restorer.h  |   2 +
 include/servicefd.h |   1 +
 include/uffd.h      |   8 ++
 pie/restorer.c      |  58 +++++++++-
 uffd.c              | 321 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 550 insertions(+), 3 deletions(-)
 create mode 100644 include/uffd.h
 create mode 100644 uffd.c

diff --git a/Makefile.crtools b/Makefile.crtools
index 80f704f..f91b726 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -74,6 +74,7 @@ obj-y	+= plugin.o
 obj-y	+= cr-errno.o
 obj-y	+= pie/pie-relocs.o
 obj-y	+= seize.o
+obj-y	+= uffd.o
 
 ifneq ($(MAKECMDGOALS),clean)
 incdeps := y
diff --git a/cr-restore.c b/cr-restore.c
index b8b4473..b0b44fd 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -19,6 +19,7 @@
 #include <sys/shm.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
+#include <sys/syscall.h>
 
 #include <sched.h>
 
@@ -76,6 +77,7 @@
 #include "security.h"
 #include "lsm.h"
 #include "seccomp.h"
+#include "uffd.h"
 
 #include "parasite-syscall.h"
 
@@ -1904,6 +1906,8 @@ static int restore_root_task(struct pstree_item *init)
 
 	write_stats(RESTORE_STATS);
 
+	close_service_fd(UFFD_FD_OFF);
+
 	if (!opts.restore_detach && !opts.exec_cmd)
 		wait(NULL);
 
@@ -2650,6 +2654,131 @@ out:
 extern void __gcov_flush(void) __attribute__((weak));
 void __gcov_flush(void) {}
 
+/*
+ * Create the unix stream socket 'criu uffd' connects to in order to
+ * fetch the userfaultfd. The socket is bound to the well-known path
+ * UFFD_SK; a stale socket file left at that path is removed first.
+ *
+ * Returns the listening socket on success, -1 on any failure.
+ */
+static int server_listen()
+{
+	struct sockaddr_un addr;
+	int sk, addr_len;
+
+	/* The well-known name has to fit into sun_path with its NUL. */
+	if (strlen(UFFD_SK) >= sizeof(addr.sun_path))
+		return -1;
+
+	sk = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sk < 0)
+		return -1;
+
+	unlink(UFFD_SK);
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	strcpy(addr.sun_path, UFFD_SK);
+	addr_len = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
+
+	if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0 ||
+	    listen(sk, 10) < 0) {
+		close(sk);
+		return -1;
+	}
+
+	return sk;
+}
+
+/*
+ * Accept one connection on the listening socket and vet the peer.
+ *
+ * The client binds its end to a socket file before connecting; that
+ * file must be a socket, be rwx for its owner only and have all of
+ * its timestamps within the last 30 seconds. It is unlinked once the
+ * checks passed.
+ *
+ * Returns the connected socket on success, -1 on failure.
+ */
+static int server_accept(int listen)
+{
+	int client;
+	socklen_t len;
+	time_t staletime;
+	struct sockaddr_un un;
+	struct stat statbuf;
+	char name[sizeof(un.sun_path) + 1];	/* full sun_path + NUL */
+
+	len = sizeof(un);
+	if ((client = accept(listen, (struct sockaddr *) &un, &len)) < 0)
+		return -1;
+
+	/* accept() may report more than it stored; unnamed peers less. */
+	if (len > sizeof(un) || len < offsetof(struct sockaddr_un, sun_path))
+		goto out;
+
+	len -= offsetof(struct sockaddr_un, sun_path);
+	memcpy(name, un.sun_path, len);
+	name[len] = 0;
+
+	if (stat(name, &statbuf) < 0 || !S_ISSOCK(statbuf.st_mode))
+		goto out;
+
+	if ((statbuf.st_mode & (S_IRWXG | S_IRWXO)) || (statbuf.st_mode & S_IRWXU) != S_IRWXU)
+		goto out;
+
+	/* Reject socket files older than 30 seconds. */
+	staletime = time(NULL) - 30;
+	if (statbuf.st_atime < staletime ||
+	    statbuf.st_ctime < staletime || statbuf.st_mtime < staletime)
+		goto out;
+
+	unlink(name);
+	return client;
+
+out:
+	close(client);
+	return -1;
+}
+
+/*
+ * Pass the descriptor send_fd to the peer connected on the unix socket
+ * fd as SCM_RIGHTS ancillary data. The regular payload is two zero
+ * bytes: a NUL marker followed by a status byte of 0, which recv_fd()
+ * in uffd.c reads as "a file descriptor is attached".
+ * Returns 0 on success and -1 on failure.
+ */
+static int send_uffd(int fd, int send_fd)
+{
+	char buf[2] = { 0, 0 };
+	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+	struct msghdr msg;
+	struct cmsghdr *cmptr;
+	int ret = -1;
+
+	if (send_fd < 0)
+		return -1;
+
+	cmptr = malloc(CONTROLLEN);
+	if (!cmptr)
+		return -1;
+	cmptr->cmsg_level = SOL_SOCKET;
+	cmptr->cmsg_type = SCM_RIGHTS;
+	cmptr->cmsg_len = CONTROLLEN;
+	memcpy(CMSG_DATA(cmptr), &send_fd, sizeof(send_fd));
+
+	memset(&msg, 0, sizeof(msg));	/* leave no field unset */
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmptr;
+	msg.msg_controllen = CONTROLLEN;
+
+	if (sendmsg(fd, &msg, 0) == (ssize_t) sizeof(buf))
+		ret = 0;
+	free(cmptr);
+	return ret;
+}
+
 static int sigreturn_restore(pid_t pid, CoreEntry *core)
 {
 	void *mem = MAP_FAILED;
@@ -2687,6 +2816,10 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 
 	CredsEntry *creds;
 
+	int uffd;
+	int listen;
+	int client;
+
 	pr_info("Restore via sigreturn\n");
 
 	/* pr_info_vma_list(&self_vma_list); */
@@ -2931,6 +3064,31 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 
 	strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
 
+	/*
+	 * Open the userfaultfd FD, which is passed to the restorer blob and
+	 * to a second process handling the userfaultfd page faults.
+	 */
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
+	pr_info("uffd %d\n", uffd);
+
+	if ((listen = server_listen()) < 0) {
+		pr_info("server_listen error");
+	}
+
+	/* accept new client request */
+	if ((client = server_accept(listen)) < 0) {
+		pr_info("server_accept error: %d", client);
+	}
+
+	send_uffd(client, uffd);
+
+	close(listen);
+	close(client);
+
+	install_service_fd(UFFD_FD_OFF, uffd);
+	close(uffd);
+
+	task_args->uffd = get_service_fd(UFFD_FD_OFF);
 
 	/*
 	 * Fill up per-thread data.
diff --git a/crtools.c b/crtools.c
index ea8b889..2d1060a 100644
--- a/crtools.c
+++ b/crtools.c
@@ -625,6 +625,9 @@ int main(int argc, char *argv[], char *envp[])
 		return ret != 0;
 	}
 
+	if (!strcmp(argv[optind], "uffd"))
+		return uffd_listen() != 0;
+
 	if (!strcmp(argv[optind], "show"))
 		return cr_show(pid) != 0;
 
diff --git a/include/crtools.h b/include/crtools.h
index bbed0ef..8541ab2 100644
--- a/include/crtools.h
+++ b/include/crtools.h
@@ -23,6 +23,7 @@ extern int convert_to_elf(char *elf_path, int fd_core);
 extern int cr_check(void);
 extern int cr_exec(int pid, char **opts);
 extern int cr_dedup(void);
+extern int uffd_listen();
 
 extern int check_add_feature(char *arg);
 
diff --git a/include/restorer.h b/include/restorer.h
index afcaf68..6b998db 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -102,6 +102,8 @@ struct task_restore_args {
 	int				logfd;
 	unsigned int			loglevel;
 
+	int				uffd;
+
 	/* threads restoration */
 	int				nr_threads;		/* number of threads */
 	thread_restore_fcall_t		clone_restore_fn;	/* helper address for clone() call */
diff --git a/include/servicefd.h b/include/servicefd.h
index 3c6e08a..e93d885 100644
--- a/include/servicefd.h
+++ b/include/servicefd.h
@@ -18,6 +18,7 @@ enum sfd_type {
 	ROOT_FD_OFF,	/* Root of the namespace we dump/restore */
 	CGROUP_YARD,
 	USERNSD_SK,	/* Socket for usernsd */
+	UFFD_FD_OFF,	/* uffd FD */
 
 	SERVICE_FD_MAX
 };
diff --git a/include/uffd.h b/include/uffd.h
new file mode 100644
index 0000000..306134e
--- /dev/null
+++ b/include/uffd.h
@@ -0,0 +1,8 @@
+#include <linux/userfaultfd.h>
+/* __NR_userfaultfd is expected from an earlier include (e.g. <sys/syscall.h>). */
+#ifndef __NR_userfaultfd
+#error "missing __NR_userfaultfd definition"
+#endif
+/* Size of a control message carrying exactly one int (the passed descriptor). */
+#define CONTROLLEN CMSG_LEN(sizeof(int))
+#define	UFFD_SK "/tmp/userfault.socket"	/* well-known name */
diff --git a/pie/restorer.c b/pie/restorer.c
index 5e1db1f..308c568 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -3,6 +3,7 @@
 
 #include <linux/securebits.h>
 #include <linux/capability.h>
+#include <linux/userfaultfd.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
@@ -534,7 +535,39 @@ static void rst_tcp_socks_all(struct task_restore_args *ta)
 		rst_tcp_repair_off(&ta->tcp_socks[i]);
 }
 
-static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
+
+
+
+/*
+ * Register [addr, addr + len) for missing-page faults on uffd and drop
+ * its contents (MADV_DONTNEED) so the next access is reported via uffd.
+ */
+static void enable_uffd(int uffd, unsigned long addr, unsigned long len)
+{
+	int rc;
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+
+	uffdio_register.range.start = addr;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	rc = sys_ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
+	pr_info("ioctl UFFDIO_REGISTER rc %d\n", rc);
+	pr_info("uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
+	pr_info("uffdio_register.len 0x%llx\n", uffdio_register.range.len);
+	if (rc) /* not registered: keep the pages, nobody would refill them */
+		return;
+
+	expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE);
+	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
+		pr_info("unexpected missing ioctl for anon memory\n");
+
+	if (sys_madvise(uffdio_register.range.start, uffdio_register.range.len, MADV_DONTNEED))
+		pr_info("madvise 2");
+}
+
+
+static int vma_remap(unsigned long src, unsigned long dst, unsigned long len, int uffd, int flags)
 {
 	unsigned long guard = 0, tmp;
 
@@ -606,6 +639,8 @@ static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
 		pr_err("Unable to remap %lx -> %lx\n", src, dst);
 		return -1;
 	}
+	if (flags & 0x20)
+		enable_uffd(uffd, dst, len);
 
 	return 0;
 }
@@ -832,6 +867,9 @@ long __export_restore_task(struct task_restore_args *args)
 	int i;
 	VmaEntry *vma_entry;
 	unsigned long va;
+	int uffd_flags;
+	struct uffdio_api uffdio_api;
+	int rc;
 
 	struct rt_sigframe *rt_sigframe;
 	struct prctl_mm_map prctl_map;
@@ -867,6 +905,20 @@ long __export_restore_task(struct task_restore_args *args)
 
 	pr_info("Switched to the restorer %d\n", my_pid);
 
+	pr_info("logfd %d\n", args->logfd);
+	pr_info("uffd %d\n", args->uffd);
+
+	uffd_flags = sys_fcntl(args->uffd, F_GETFD, 0);
+	pr_info("uffd_flags %d\n", uffd_flags);
+	pr_info("UFFD_API 0x%llx\n", UFFD_API);
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	rc = sys_ioctl(args->uffd, UFFDIO_API, &uffdio_api);
+	pr_info("ioctl UFFDIO_API rc %d\n", rc);
+	pr_info("uffdio_api.api 0x%llx\n", uffdio_api.api);
+	pr_info("uffdio_api.features 0x%llx\n", uffdio_api.features);
+
+
 	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
 		goto core_restore_end;
 
@@ -888,7 +940,7 @@ long __export_restore_task(struct task_restore_args *args)
 			break;
 
 		if (vma_remap(vma_premmaped_start(vma_entry),
-				vma_entry->start, vma_entry_len(vma_entry)))
+				vma_entry->start, vma_entry_len(vma_entry), args->uffd, vma_entry->flags))
 			goto core_restore_end;
 	}
 
@@ -906,7 +958,7 @@ long __export_restore_task(struct task_restore_args *args)
 			break;
 
 		if (vma_remap(vma_premmaped_start(vma_entry),
-				vma_entry->start, vma_entry_len(vma_entry)))
+				vma_entry->start, vma_entry_len(vma_entry), args->uffd, vma_entry->flags))
 			goto core_restore_end;
 	}
 
diff --git a/uffd.c b/uffd.c
new file mode 100644
index 0000000..ed0aff3
--- /dev/null
+++ b/uffd.c
@@ -0,0 +1,321 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <errno.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/un.h>
+#include <sys/socket.h>
+
+#include "asm/page.h"
+#include "include/log.h"
+#include "include/criu-plugin.h"
+#include "include/page-read.h"
+#include "include/uffd.h"
+
+#define CLI_PATH "/var/tmp/"
+
+/*
+ * Receive a file descriptor sent as SCM_RIGHTS ancillary data over the
+ * connected unix socket fd.
+ * The payload ends with a NUL byte followed by one status byte. Status
+ * zero announces an attached descriptor; any other value makes the
+ * function return -status.
+ * Returns the received descriptor, or a negative value on error.
+ */
+static int recv_fd(int fd)
+{
+	int newfd = -1;
+	int nr;
+	int ret = -1, status = -1;
+	char *ptr;
+	char buf[page_size()];
+	struct iovec iov[1];
+	struct msghdr msg;
+	struct cmsghdr *cmptr;
+
+	/* One control buffer serves all iterations; freed once at out. */
+	cmptr = malloc(CONTROLLEN);
+	if (!cmptr)
+		return -1;
+
+	while (status < 0) {
+		memset(&msg, 0, sizeof(msg));
+		iov[0].iov_base = buf;
+		iov[0].iov_len = sizeof(buf);
+		msg.msg_iov = iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmptr;
+		msg.msg_controllen = CONTROLLEN;
+		if ((nr = recvmsg(fd, &msg, 0)) < 0) {
+			pr_info("recvmsg error");
+			goto out;
+		} else if (nr == 0) {
+			pr_info("connection closed by server");
+			goto out;
+		}
+		/* Look for the trailing NUL + status byte pair. */
+		for (ptr = buf; ptr < &buf[nr];) {
+			if (*ptr++ != 0)
+				continue;
+			if (ptr != &buf[nr - 1]) {
+				pr_info("message format error");
+				goto out;
+			}
+			/* no sign extension */
+			status = *ptr & 0xFF;
+			if (status != 0) {
+				newfd = -status;
+			} else if (msg.msg_controllen < CONTROLLEN ||
+				   cmptr->cmsg_level != SOL_SOCKET ||
+				   cmptr->cmsg_type != SCM_RIGHTS) {
+				pr_info("status = 0 but no fd");
+				goto out;
+			} else {
+				memcpy(&newfd, CMSG_DATA(cmptr), sizeof(int));
+			}
+			nr -= 2;
+		}
+	}
+	ret = newfd;
+out:
+	free(cmptr);
+	return ret;
+}
+
+
+/*
+ * Connect to the socket 'criu restore' listens on (UFFD_SK).
+ * The local end is bound to CLI_PATH<pid> and made accessible by the
+ * owner only before connecting.
+ * Returns the connected socket on success, -1 on failure.
+ */
+static int client_conn()
+{
+	int fd, len;
+	struct sockaddr_un un, srv;
+
+	if (strlen(UFFD_SK) >= sizeof(un.sun_path))
+		return -1;
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+		return -1;
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	snprintf(un.sun_path, sizeof(un.sun_path), "%s%05ld", CLI_PATH, (long) getpid());
+	pr_info("file is %s\n", un.sun_path);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(un.sun_path);
+
+	/* remove it, if it exists */
+	unlink(un.sun_path);
+	if (bind(fd, (struct sockaddr *) &un, len) < 0)
+		goto out;
+	if (chmod(un.sun_path, S_IRWXU) < 0)
+		goto out;
+
+	memset(&srv, 0, sizeof(srv));
+	srv.sun_family = AF_UNIX;
+	strcpy(srv.sun_path, UFFD_SK);
+	len = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
+	if (connect(fd, (struct sockaddr *) &srv, len) < 0)
+		goto out;
+	return fd;
+
+out:
+	/* Must be negative: the caller only treats values < 0 as errors. */
+	close(fd);
+	unlink(un.sun_path);
+	return -1;
+}
+
+static int ud_open()
+{
+	int udfd;
+	int newfd;
+
+	if ((udfd = client_conn()) < 0) {
+		pr_err("unix domain socket connection error");
+		return (-1);
+	}
+
+	newfd = recv_fd(udfd);
+	close(udfd);
+
+	return newfd;
+}
+
+
+/*
+ * Compute the byte offset inside the pages image of the page that was
+ * dumped from address addr (expected to be page aligned by the caller).
+ *
+ * The image directory is scanned for the first pagemap-<pid>.img; its
+ * entries are walked in order while summing up their lengths, which
+ * assumes the pages were written back to back in pagemap order.
+ *
+ * Returns the offset, or (unsigned long) -1 on error or when no entry
+ * covers addr.
+ */
+static unsigned long find_page(unsigned long addr)
+{
+	struct dirent *ent;
+	DIR *dirp;
+	struct page_read pr;
+	struct iovec iov;
+	unsigned long base, offset = 0, found = -1;
+	int pid, dfd, ret;
+
+	dfd = dup(criu_get_image_dir());
+	if (dfd < 0)
+		return -1;
+	dirp = fdopendir(dfd);
+	if (!dirp) {
+		close(dfd);
+		return -1;
+	}
+
+	for (;;) {
+		/* readdir() reports errors only through errno: reset it. */
+		errno = 0;
+		ent = readdir(dirp);
+		if (!ent) {
+			if (errno)
+				pr_perror("Failed readdir, error=%d", errno);
+			goto out;
+		}
+		if (sscanf(ent->d_name, "pagemap-%d.img", &pid) == 1)
+			break;
+	}
+
+	pr_info("pid=%d\n", pid);
+	ret = open_page_read(pid, &pr, PR_TASK | PR_MOD);
+	if (ret <= 0) {
+		pr_info("1\n");
+		goto out;
+	}
+
+	/* pr is valid only from here on, so only this path closes it. */
+	while (pr.get_pagemap(&pr, &iov) > 0) {
+		base = (unsigned long) iov.iov_base;
+		pr_debug("base=%p, len=%zu\n", iov.iov_base, iov.iov_len);
+		/* The entry covers [base, base + len): the end is exclusive. */
+		if (base <= addr && addr < base + iov.iov_len) {
+			pr_info("in page range\n");
+			found = offset + (addr - base);
+		}
+		pr.put_pagemap(&pr);
+		if (found != (unsigned long) -1)
+			break;
+		offset += iov.iov_len;
+	}
+
+	pr.close(&pr);
+out:
+	closedir(dirp);
+	return found;
+}
+
+/*
+ * Serve one fault: copy the page for the page-aligned address from
+ * pages-1.img into the faulting task. Returns 0 on success, 1 on error.
+ */
+static int uffd_copy_page(int uffd, unsigned long address, unsigned long ps)
+{
+	struct uffdio_copy uffdio_copy;
+	unsigned long offset;
+	void *mapped_pages;
+	int pages, rc, ret = 1;
+
+	offset = find_page(address);
+	pr_info("uffd mmap offset 0x%lx\n", offset);
+	if (offset == (unsigned long) -1)
+		return 1;
+
+	pages = openat(criu_get_image_dir(), "pages-1.img", O_RDONLY);
+	if (pages < 0) {
+		pr_perror("Can't open pages-1.img");
+		return 1;
+	}
+	mapped_pages = mmap(0, ps, PROT_READ, MAP_SHARED, pages, offset);
+	if (mapped_pages == MAP_FAILED) {
+		pr_perror("Can't map pages-1.img");
+		goto out_close;
+	}
+
+	uffdio_copy.dst = address;
+	uffdio_copy.src = (unsigned long) mapped_pages;
+	uffdio_copy.len = ps;
+	uffdio_copy.mode = 0;
+	uffdio_copy.copy = 0;
+	pr_info("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
+	rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
+	pr_info("ioctl UFFDIO_COPY rc 0x%x\n", rc);
+	pr_info("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
+	/* On failure the real error code is in uffdio_copy.copy. */
+	if (rc && uffdio_copy.copy != -EEXIST)
+		pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
+	else if (!rc && uffdio_copy.copy != ps)
+		pr_err("UFFDIO_COPY unexpected copy %Ld\n", uffdio_copy.copy);
+	else
+		ret = 0;
+	/* Unmap and close after every fault so nothing accumulates. */
+	munmap(mapped_pages, ps);
+out_close:
+	close(pages);
+	return ret;
+}
+
+/*
+ * 'criu uffd': fetch the userfaultfd from the restore side and serve
+ * its page faults until an error occurs. Returns non-zero on failure.
+ */
+int uffd_listen()
+{
+	struct uffd_msg msg;
+	struct uffdio_api uffdio_api;
+	unsigned long ps = page_size();
+	int uffd, rc;
+
+	if ((uffd = ud_open()) < 0)
+		return 1;	/* report the failure to main() */
+	pr_info("uffd %d\n", uffd);
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+		pr_err("UFFDIO_API ioctl() failed\n");
+		goto out;
+	}
+	if (uffdio_api.api != UFFD_API) {
+		pr_err("UFFDIO_API error %Lu\n", uffdio_api.api);
+		goto out;
+	}
+
+	for (;;) {
+		rc = read(uffd, &msg, sizeof(msg));
+		if (rc < 0 && errno == EINTR)
+			continue;
+		if (rc != sizeof(msg)) {
+			/* Retrying blindly would spin on a dead descriptor. */
+			pr_err("uffd read failed, rc %d\n", rc);
+			break;
+		}
+		if (msg.event != UFFD_EVENT_PAGEFAULT) {
+			pr_err("unexpected msg event %u\n", msg.event);
+			break;
+		}
+		pr_info("msg.arg.pagefault.address 0x%llx\n", msg.arg.pagefault.address);
+		if (uffd_copy_page(uffd, msg.arg.pagefault.address & ~(ps - 1), ps))
+			break;
+	}
+out:
+	close(uffd);
+	return 1;
+}
-- 
2.4.3



More information about the CRIU mailing list