[CRIU] [PATCH v3] Try to include userfaultfd with criu
Pavel Emelyanov
xemul at parallels.com
Mon Dec 14 01:13:10 PST 2015
On 12/09/2015 11:24 PM, Adrian Reber wrote:
> From: Adrian Reber <areber at redhat.com>
>
> This is a first try to include userfaultfd with criu. Right now it
> still requires a "normal" checkpoint. After checkpointing the application
> it can be restored with the help of userfaultfd.
>
> All restored pages with MAP_ANONYMOUS and MAP_PRIVATE set are marked as
> being handled by userfaultfd and also madvise()'d as MADV_DONTNEED.
> MADV_DONTNEED is needed to make sure the pages are not mapped in and are
> actually triggering a userfault.
Can you shed more light on this -- once criu does mmap() there's not (should
be no) pages in the region, so what do we remove from there with this madvise?
> TODO:
> * Contains still many debug outputs which need to be cleaned up.
> * Maybe transfer the dump directory FD also via unix domain sockets
> so that the 'uffd'/'lazy-pages' server can keep running without
> the need to specify the dump directory with '-D'
> * Keep the lazy-pages server running after all pages have been
> transferred and start waiting for new connections to serve.
Please, add to the TODO list the need to resurrect the non-cooperative
patch set, as once the restored task fork()-s or calls mremap() the
whole thing becomes broken :(
>
> diff --git a/Makefile b/Makefile
> index 1793091..fe62488 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -163,6 +163,10 @@ ifeq ($(GMON),1)
> GMONLDOPT = -pg
> endif
>
> +ifeq ($(UFFD),1)
The UFFD should be specified in the command line, right? Let's have the
uffd always compiled in, but for the proff-of-concept period just don't
include this into CLI and declare the feature as experimental.
> + DEFINES += -DUFFD
> +endif
> +
> CFLAGS += $(WARNINGS) $(DEFINES)
> SYSCALL-LIB := $(ARCH_DIR)/syscalls.built-in.o
> ARCH-LIB := $(ARCH_DIR)/crtools.built-in.o
> diff --git a/Makefile.config b/Makefile.config
> index ce4b8d8..3df8f53 100644
> --- a/Makefile.config
> +++ b/Makefile.config
> @@ -44,6 +44,9 @@ endif
> ifeq ($(piegen-y),y)
> $(Q) @echo '#define CONFIG_PIEGEN' >> $@
> endif
> +ifeq ($(UFFD),1)
> + $(Q) @echo '#define CONFIG_UFFD' >> $@
> +endif
> $(Q) @echo '#endif /* __CR_CONFIG_H__ */' >> $@
>
> config: $(CONFIG)
> +#define CLI_PATH "/var/tmp/"
> +
> +static int send_uffd(int sendfd)
> +{
> + int fd;
> + int len;
> + int ret = -1;
> + struct sockaddr_un un;
> + struct sockaddr_un sun;
> +
> + if (sendfd < 0)
> + return -1;
> +
> + if (strlen(UFFD_SK) >= sizeof(un.sun_path)) {
> + return -1;
> + }
> +
> + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
> + return -1;
> +
> + memset(&un, 0, sizeof(un));
> + un.sun_family = AF_UNIX;
> + sprintf(un.sun_path, "%s%05ld", CLI_PATH, (long) getpid());
> + pr_debug("file is %s\n", un.sun_path);
> + len = offsetof(struct sockaddr_un, sun_path) + strlen(un.sun_path);
> +
> + /* remove it, if it exists */
> + unlink(un.sun_path);
> + if (bind(fd, (struct sockaddr *) &un, len) < 0) {
> + pr_perror("bind failed");
> + goto out;
> + }
Why do you bind() this socket before connect to uffd lazy server?
> + if (chmod(un.sun_path, S_IRWXU) < 0) {
> + pr_perror("chmod failed");
> + goto out;
> + }
> +
> + memset(&sun, 0, sizeof(sun));
> + sun.sun_family = AF_UNIX;
> + strcpy(sun.sun_path, UFFD_SK);
> + len = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
> + if (connect(fd, (struct sockaddr *) &sun, len) < 0) {
> + pr_perror("connect failed");
> + goto out;
> + }
> +
> + if (send_fd(fd, NULL, 0, sendfd) < 0) {
> + pr_perror("send_fd error:");
> + goto out;
> + }
> + ret = 0;
> +out:
> + close(fd);
> + return ret;
> +}
> +
> static int sigreturn_restore(pid_t pid, CoreEntry *core)
> {
> void *mem = MAP_FAILED;
> @@ -3008,6 +3078,37 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
>
> strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
>
> + if (!opts.lazy_pages)
> + task_args->uffd = -1;
> +#ifdef CONFIG_UFFD
> + else {
> + struct uffdio_api uffdio_api;
> + /*
> + * Open userfaulfd FD which is passed to the restorer blob and
> + * to a second process handling the userfaultfd page faults.
> + */
> + task_args->uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
You can add the syscall number to per-arch syscall tables and the there will
appear the sys_userfaultfd() function in criu :)
> +
> + /*
> + * Check if the UFFD_API is the one which is expected
> + */
> + uffdio_api.api = UFFD_API;
> + uffdio_api.features = 0;
> + if (ioctl(task_args->uffd, UFFDIO_API, &uffdio_api)) {
> + pr_err("Checking for UFFDIO_API failed.\n");
> + goto err;
> + }
> + if (uffdio_api.api != UFFD_API) {
> + pr_err("Result of looking up UFFDIO_API does not match: %Lu\n", uffdio_api.api);
> + goto err;
> + }
> +
> + if (send_uffd(task_args->uffd) < 0) {
> + close(task_args->uffd);
> + goto err;
> + }
> + }
> +#endif
>
> /*
> * Fill up per-thread data.
> diff --git a/crtools.c b/crtools.c
> diff --git a/uffd.c b/uffd.c
> new file mode 100644
> index 0000000..8923916
> --- /dev/null
> +++ b/uffd.c
> @@ -0,0 +1,496 @@
> +#include <stddef.h>
> +#include <stdio.h>
> +#include <errno.h>
> +#include <dirent.h>
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <fcntl.h>
> +#include <string.h>
> +#include <time.h>
> +#include <sys/stat.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
> +#include <sys/un.h>
> +#include <sys/socket.h>
> +
> +#include "asm/page.h"
> +#include "include/log.h"
> +#include "include/criu-plugin.h"
> +#include "include/page-read.h"
> +#include "include/files-reg.h"
> +#include "include/mem.h"
> +#include "include/uffd.h"
> +#include "include/util-pie.h"
> +#include "include/pstree.h"
> +#include "include/crtools.h"
> +#include "xmalloc.h"
> +
> +#undef LOG_PREFIX
> +#define LOG_PREFIX "lazy-pages: "
> +
> +static int server_listen(struct sockaddr_un *saddr)
> +{
> + int fd;
> + int len;
> +
> + if (strlen(UFFD_SK) >= sizeof(saddr->sun_path)) {
> + return -1;
> + }
> +
> + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
> + return -1;
> +
> + unlink(UFFD_SK);
> +
> + memset(saddr, 0, sizeof(struct sockaddr_un));
> + saddr->sun_family = AF_UNIX;
> + strcpy(saddr->sun_path, UFFD_SK);
Please, make use of the --address option for lazy-pages server address.
> + len = offsetof(struct sockaddr_un, sun_path) + strlen(UFFD_SK);
> +
> + if (bind(fd, (struct sockaddr *) saddr, len) < 0) {
> + goto out;
> + }
> +
> + if (listen(fd, 10) < 0) {
> + goto out;
> + }
> +
> + return fd;
> +
> +out:
> + close(fd);
> + return -1;
> +}
> +
> +static int server_accept(int listen, struct sockaddr_un *saddr)
> +{
> + int client;
> + int ret = -1;
> + socklen_t len;
> + time_t staletime;
> + struct stat statbuf;
> + char *name;
> +
> + if ((name = malloc(sizeof(saddr->sun_path + 1))) == NULL)
> + return ret;
> +
> + len = sizeof(struct sockaddr_un);
> + if ((client = accept(listen, (struct sockaddr *) saddr, &len)) < 0) {
> + free(name);
> + return ret;
> + }
> +
> + len -= offsetof(struct sockaddr_un, sun_path);
> + memcpy(name, saddr->sun_path, len);
> + name[len] = 0;
> + if (stat(name, &statbuf) < 0) {
> + goto out;
> + }
> +
> + if (S_ISSOCK(statbuf.st_mode) == 0) {
> + goto out;
> + }
> +
> + if ((statbuf.st_mode & (S_IRWXG | S_IRWXO)) || (statbuf.st_mode & S_IRWXU) != S_IRWXU) {
> + goto out;
> + }
> +
> + staletime = time(NULL) - 30;
> + if (statbuf.st_atime < staletime ||
> + statbuf.st_ctime < staletime || statbuf.st_mtime < staletime) {
> + /* inode is older than 30 seconds; see above */
> + goto out;
> + }
Wow... Would you eplain these checks about?
> +
> + unlink(name);
> + free(name);
> + return client;
> +
> +out:
> + unlink(name);
> + close(client);
> + free(name);
> + return ret;
> +}
> +
> +static void get_page(unsigned long addr, void *dest, struct page_read *pr)
> +{
> + struct iovec iov;
> + int ret;
> + unsigned char buf[PAGE_SIZE];
> +
> +#ifdef rewind
> + /*
> + * TODO: The idea behind rewind is to get a function
> + * which rewinds (seek(fd, 0, SEEK_SET)) the image pointer.
> + * Unfortunetaly this does not yet exist and so the image
> + * is opened anc closed for each page retrieved.
> + */
> + rewind_pagemap(pr);
> +#else
> + ret = open_page_read(root_item->pid.virt, pr, PR_TASK | PR_MOD);
> + pr_debug("ret %d\n", ret);
> +#endif
> + /* TODO: return code checking */
> + ret = pr->get_pagemap(pr, &iov);
> + pr_debug("get_pagemap ret %d\n", ret);
> + ret = seek_pagemap_page(pr, addr, true);
> + pr_debug("seek_pagemap_page %x\n", ret);
> + ret = pr->read_pages(pr, addr, 1, buf);
> + pr_debug("read_pages ret %d\n", ret);
> + memcpy(dest, buf, PAGE_SIZE);
> +#ifndef rewind
> + pr->close(pr);
> +#endif
> +}
> +
> +#define UFFD_FLAG_SENT 0x1
> +#define UFFD_FLAG_VDSO 0x2
> +
> +struct uffd_pages_struct {
> + struct list_head list;
> + unsigned long addr;
> + int flags;
> +};
> +
> +static int uffd_copy_page(int uffd, struct page_read *pr, __u64 address,
> + void *dest)
> +{
> + struct uffdio_copy uffdio_copy;
> + int rc;
> +
> + get_page(address, dest, pr);
But there can be no page with that address in the images. This is pretty
possble and in this case we should install zero page into task with the
correcponding ioctl.
> +
> + uffdio_copy.dst = address;
> + uffdio_copy.src = (unsigned long) dest;
> + uffdio_copy.len = page_size();
> + uffdio_copy.mode = 0;
> + uffdio_copy.copy = 0;
> +
> + pr_debug("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
> + rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
> + pr_debug("ioctl UFFDIO_COPY rc 0x%x\n", rc);
> + pr_debug("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
> + if (rc) {
> + /* real retval in ufdio_copy.copy */
> + if (uffdio_copy.copy != -EEXIST) {
> + pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
> + return -1;
> + }
> + } else if (uffdio_copy.copy != page_size()) {
> + pr_err("UFFDIO_COPY unexpected size %Ld\n", uffdio_copy.copy);
> + return -1;
> + }
> +
> +
> + return uffdio_copy.copy;
> +
> +}
> +
> +int uffd_listen()
> +{
> + __u64 address;
> + void *dest;
> + __u64 flags;
> + struct uffd_msg msg;
> + struct page_read pr;
> + unsigned long ps;
> + int rc;
> + fd_set set;
> + struct timeval timeout;
> + int uffd;
> + unsigned long uffd_copied_pages = 0;
> + int uffd_flags;
> + struct uffd_pages_struct *uffd_pages;
> + bool vdso_sent;
> + unsigned long vma_size = 0;
> +
> + LIST_HEAD(uffd_list);
> +
> + pr_debug("Waiting for incoming connections\n");
> + if ((uffd = ud_open()) < 0)
> + exit(0);
> +
> + pr_debug("uffd is 0x%d\n", uffd);
> + uffd_flags = fcntl(uffd, F_GETFD, NULL);
> + pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> +
> + /* Setting up criu infrastructure to easily access the dump results */
> + criu_init();
> +
> + /* Initialize FD sets for read() with timeouts (using select()) */
> + FD_ZERO(&set);
> + FD_SET(uffd, &set);
> +
> + /* All operations will be done on page size */
> + ps = page_size();
> + dest = malloc(ps);
> +
> +#ifdef rewind
> + rc = open_page_read(root_item->pid.virt, &pr, PR_TASK | PR_MOD);
> + pr_debug("open_page_read rc %d\n", rc);
> +#endif
> +
> +
> + rc = open_page_read(root_item->pid.virt, &pr, PR_TASK);
> + if (rc <= 0)
> + return 1;
> + /*
> + * This puts all pages which should be handled by userfaultfd
> + * in the list uffd_list. This list is later used to detect if
> + * a page has already been transferred or if it needs to be
> + * pushed into the process using userfaultfd.
> + */
> + do {
> + rc = collect_uffd_pages(&pr, &uffd_list, &vma_size);
> + if (rc == -1)
> + return 1;
> + } while (rc);
> +
> + if (pr.close)
> + pr.close(&pr);
> +
> + while (1) {
> + bool page_sent = false;
> + /*
> + * Setting the timeout to 5 seconds. If after this time
> + * no uffd pages are requested the code switches to
> + * copying the remaining pages.
> + *
> + * Timeout is re-defined every time select() is run as
> + * select(2) says:
> + * Consider timeout to be undefined after select() returns.
> + */
> + timeout.tv_sec = 5;
> + timeout.tv_usec = 0;
> + rc = select(uffd + 1, &set, NULL, NULL, &timeout);
> + pr_debug("select() rc: 0x%x\n", rc);
> + if (rc == 0) {
> + pr_debug("read timeout\n");
> + pr_debug("switching from request to copy mode\n");
> + break;
> + }
> + rc = read(uffd, &msg, sizeof(msg));
> + pr_debug("read() rc: 0x%x\n", rc);
> +
> + if (rc != sizeof(msg)) {
> + if (rc < 0)
> + pr_perror("read error");
> + else
> + pr_debug("short read\n");
> + continue;
> + }
> +
> + /* Align requested address to the next page boundary */
> + address = msg.arg.pagefault.address & ~(ps - 1);
> + pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
> +
> + /*
> + * At this point the process on the other side waits for the first page.
> + * In the first step we will force the vdso pages into the new process.
> + */
> + if (!vdso_sent) {
> + pr_debug("Pushing VDSO pages once\n");
> + list_for_each_entry(uffd_pages, &uffd_list, list) {
> + if (!(uffd_pages->flags & UFFD_FLAG_VDSO))
> + continue;
> + rc = uffd_copy_page(uffd, &pr, uffd_pages->addr,
> + dest);
> + if (rc < 0) {
> + pr_err("Error during UFFD copy\n");
> + return 1;
> + }
> + vma_size -= rc;
> + uffd_copied_pages++;
> + uffd_pages->flags |= UFFD_FLAG_SENT;
> + }
> + vdso_sent = true;
> + }
> +
> + /* Make sure to not transfer a page twice */
But how can this happen? If you put the page into task and it asks for it
_again_, this means that something goes wrong, doesn' it?
> + list_for_each_entry(uffd_pages, &uffd_list, list) {
> + if ((uffd_pages->addr == address) &&
> + (uffd_pages->flags & UFFD_FLAG_SENT)) {
> + page_sent = true;
> + break;
> + }
> + }
> +
> + if (page_sent)
> + continue;
> +
> + /* Now handle the pages actually requested. */
> +
> + flags = msg.arg.pagefault.flags;
> + pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
> +
> + if (msg.event != UFFD_EVENT_PAGEFAULT) {
> + pr_err("unexpected msg event %u\n", msg.event);
> + return 1;
> + }
> +
> + rc = uffd_copy_page(uffd, &pr, address, dest);
> + if (rc < 0) {
> + pr_err("Error during UFFD copy\n");
> + return 1;
> + }
More information about the CRIU
mailing list