[CRIU] [PATCH v4] Try to include userfaultfd with criu
Pavel Emelyanov
xemul at virtuozzo.com
Wed Mar 9 03:44:11 PST 2016
Hi, Adrian
Great progress :) Please, find my comments inline.
> To handle the required pages a new criu command has been added. For a
> userfaultfd supported restore the first step is to start the
> 'lazy-pages' server:
>
> criu lazy-pages -v4 -D /tmp/3/ --address /tmp/userfault.socket
>
> This waits on a unix domain socket (defined using the --address option)
> to receive a userfaultfd file descriptor from a '--lazy-pages' enabled
> 'criu restore':
>
> criu restore -D /tmp/3 -j -v4 --lazy-pages \
> --address /tmp/userfault.socket
Would you split the patch into two then -- the first one is to introduce
the lazy-pages action and the second one is to make --lazy-pages option
for restore?
> ...
> #include "protobuf.h"
> @@ -469,6 +472,17 @@ static int restore_priv_vma_content(void)
> p = decode_pointer((off) * PAGE_SIZE +
> vma->premmaped_addr);
>
> + /*
> + * This means that userfaultfd is used to load the pages
> + * on demand.
> + */
> + if (opts.lazy_pages && (vma->e->flags & MAP_ANONYMOUS) &&
> + (vma->e->flags & MAP_PRIVATE)) {
Please, introduce the vma_can_be_lazy() helper that we will extend while
the uffd supports more.
> + pr_debug("Lazy restore skips %lx\n", vma->e->start);
> + pr.skip_pages(&pr, PAGE_SIZE);
> + continue;
> + }
> +
> set_bit(off, vma->page_bitmap);
> if (vma->ppage_bitmap) { /* inherited vma */
> clear_bit(off, vma->ppage_bitmap);
> @@ -2131,7 +2145,7 @@ out:
> return -1;
> }
>
> -static int prepare_task_entries(void)
> +int prepare_task_entries(void)
> {
> task_entries_pos = rst_mem_align_cpos(RM_SHREMAP);
> task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
> @@ -2964,6 +2978,54 @@ static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
> return 0;
> }
>
> +#ifdef CONFIG_HAS_UFFD
> +
> +static int send_uffd(int sendfd)
Maybe put this routine into uffd.c?
> +{
> + int fd;
> + int len;
> + int ret = -1;
> + struct sockaddr_un sun;
> +
> + if (!opts.addr) {
> + pr_info("Please specify a file name for the unix domain socket\n");
> + pr_info("used to communicate between the lazy-pages server\n");
> + pr_info("and the restore process. Use the --address option like\n");
> + pr_info("criu restore --lazy-pages --address /tmp/userfault.socket\n");
> + return -1;
> + }
> +
> + if (sendfd < 0)
> + return -1;
> +
> + if (strlen(opts.addr) >= sizeof(sun.sun_path)) {
> + return -1;
> + }
> +
> + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
> + return -1;
> +
> + memset(&sun, 0, sizeof(sun));
> + sun.sun_family = AF_UNIX;
> + strcpy(sun.sun_path, opts.addr);
> + len = offsetof(struct sockaddr_un, sun_path) + strlen(opts.addr);
> + if (connect(fd, (struct sockaddr *) &sun, len) < 0) {
> + pr_perror("connect to %s failed", opts.addr);
> + goto out;
> + }
> +
> ...
> @@ -3219,6 +3281,37 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
>
> strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
>
> + if (!opts.lazy_pages)
> + task_args->uffd = -1;
> +#ifdef CONFIG_HAS_UFFD
> + else {
> + struct uffdio_api uffdio_api;
> + /*
> + * Open userfaulfd FD which is passed to the restorer blob and
> + * to a second process handling the userfaultfd page faults.
> + */
> + task_args->uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
> +
> + /*
> + * Check if the UFFD_API is the one which is expected
> + */
> + uffdio_api.api = UFFD_API;
> + uffdio_api.features = 0;
> + if (ioctl(task_args->uffd, UFFDIO_API, &uffdio_api)) {
> + pr_err("Checking for UFFDIO_API failed.\n");
> + goto err;
> + }
> + if (uffdio_api.api != UFFD_API) {
> + pr_err("Result of looking up UFFDIO_API does not match: %Lu\n", uffdio_api.api);
> + goto err;
> + }
> +
> + if (send_uffd(task_args->uffd) < 0) {
> + close(task_args->uffd);
> + goto err;
> + }
> + }
> +#endif
Can we have this hunk be a routine in uffd.c (or a static inline in a header when
CONFIG_UFFD is not set)?
>
> /*
> * Fill up per-thread data.
> ...
> @@ -51,6 +51,7 @@ struct page_read {
> /* stop working on current pagemap */
> void (*put_pagemap)(struct page_read *);
> void (*close)(struct page_read *);
> + void (*skip_pages)(struct page_read *, unsigned long len);
Ah, you need the skip_pages callbacks from page_read-s. Please, make it
be another separate patch.
>
> /* Private data of reader */
> struct cr_img *pmi;
> ...
> +static void enable_uffd(int uffd, unsigned long addr, unsigned long len)
> +{
> + /*
> + * If uffd == -1, this means that userfaultfd is not enabled
> + * or it is not available.
> + */
> + if (uffd == -1)
> + return;
> +#ifdef CONFIG_HAS_UFFD
> + int rc;
> + struct uffdio_register uffdio_register;
> + unsigned long expected_ioctls;
> +
> + uffdio_register.range.start = addr;
> + uffdio_register.range.len = len;
> + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
> + pr_info("lazy-pages: uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
> + pr_info("lazy-pages: uffdio_register.len 0x%llx\n", uffdio_register.range.len);
> + rc = sys_ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
The failrure to register the region should be handled.
> + pr_info("lazy-pages: ioctl UFFDIO_REGISTER rc %d\n", rc);
> + pr_info("lazy-pages: uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
> + pr_info("lazy-pages: uffdio_register.len 0x%llx\n", uffdio_register.range.len);
> +
> + expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE);
> +
> + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
> + pr_err("lazy-pages: unexpected missing uffd ioctl for anon memory\n");
> + }
> +
> +#endif
> +
> +}
> +
> +
> +static int vma_remap(unsigned long src, unsigned long dst, unsigned long len, int uffd, int flags)
> {
> unsigned long guard = 0, tmp;
>
> ...
> +static void criu_init()
> +{
> + /* TODO: return code checking */
> + check_img_inventory();
> + prepare_task_entries();
> + prepare_pstree();
> + collect_remaps_and_regfiles();
> + prepare_shared_reg_files();
> + prepare_remaps();
> + prepare_mm_pid(root_item);
Wow! Do you really need all this stuff in uffd daemon? What for?
> +
> + /* We found a PID */
> + pr_debug("root_item->pid.virt %d\n", root_item->pid.virt);
> + pr_debug("root_item->pid.real %d\n", root_item->pid.real);
> +}
> +
> +int uffd_listen()
> +{
> + __u64 address;
> + void *dest;
> + __u64 flags;
> + struct uffd_msg msg;
> + struct page_read pr;
> + unsigned long ps;
> + int rc;
> + fd_set set;
> + struct timeval timeout;
> + int uffd;
> + unsigned long uffd_copied_pages = 0;
> + int uffd_flags;
> + struct uffd_pages_struct *uffd_pages;
> + bool vdso_sent = false;
> + unsigned long vma_size = 0;
> +
> + LIST_HEAD(uffd_list);
> +
> + if (!opts.addr) {
> + pr_info("Please specify a file name for the unix domain socket\n");
> + pr_info("used to communicate between the lazy-pages server\n");
> + pr_info("and the restore process. Use the --address option like\n");
> + pr_info("criu --lazy-pages --address /tmp/userfault.socket\n");
> + return -1;
> + }
> +
> + pr_debug("Waiting for incoming connections on %s\n", opts.addr);
> + if ((uffd = ud_open()) < 0)
> + exit(0);
> +
> + pr_debug("uffd is 0x%d\n", uffd);
> + uffd_flags = fcntl(uffd, F_GETFD, NULL);
> + pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> +
> + /* Setting up criu infrastructure to easily access the dump results */
> + criu_init();
> +
> + /* Initialize FD sets for read() with timeouts (using select()) */
> + FD_ZERO(&set);
> + FD_SET(uffd, &set);
> +
> + /* All operations will be done on page size */
> + ps = page_size();
> + dest = malloc(ps);
> +
> + rc = open_page_read(root_item->pid.virt, &pr, PR_TASK);
This seem to work only with one single task, doesn't it?
> + if (rc <= 0)
> + return 1;
> + /*
> + * This puts all pages which should be handled by userfaultfd
> + * in the list uffd_list. This list is later used to detect if
> + * a page has already been transferred or if it needs to be
> + * pushed into the process using userfaultfd.
> + */
-- Pavel
More information about the CRIU
mailing list