[CRIU] [PATCH v3] Try to include userfaultfd with criu

Thu Dec 17 04:00:37 PST 2015

>>> diff --git a/Makefile b/Makefile
>>> index 1793091..fe62488 100644
>>> --- a/Makefile
>>> +++ b/Makefile
>>> @@ -163,6 +163,10 @@ ifeq ($(GMON),1)
>>>  	GMONLDOPT = -pg
>>>  endif
>>>  
>>> +ifeq ($(UFFD),1)
>>
>> The UFFD should be specified in the command line, right? Let's have the
>> uffd always compiled in, but for the proff-of-concept period just don't
>> include this into CLI and declare the feature as experimental.
> 
> As it requires the linux/userfaultfd.h header and is only available
> since (I think 4.3) it would break many systems on older kernel. So I
> think it makes sense to make it somehow optional. I think I saw a
> build-system feature which can set defines based on found header files.
> I could try to add some auto-detection if the linux/userfaultfd.h header
> is actually there (if that exists in the criu Makefiles).

Well, we used to have plans to integrate build system with automake that
does smth like this, but never get there.

For now, what we always do for not-yet-available kernel headers is just
copy the needed bits right into the criu sources. So if you bring the
include/linux/userfaultfd.h from kernel into criu/include that would be
just fine.

>>> +static void get_page(unsigned long addr, void *dest, struct page_read *pr)
>>> +{
>>> +	struct iovec iov;
>>> +	int ret;
>>> +	unsigned char buf[PAGE_SIZE];
>>> +
>>> +#ifdef rewind
>>> +	/*
>>> +	 * TODO: The idea behind rewind is to get a function
>>> +	 * which rewinds (seek(fd, 0, SEEK_SET)) the image pointer.
>>> +	 * Unfortunetaly this does not yet exist and so the image
>>> +	 * is opened anc closed for each page retrieved.
>>> +	 */
>>> +	rewind_pagemap(pr);
>>> +#else
>>> +	ret = open_page_read(root_item->pid.virt, pr, PR_TASK | PR_MOD);
>>> +	pr_debug("ret %d\n", ret);
>>> +#endif
>>> +	/* TODO: return code checking */
>>> +	ret = pr->get_pagemap(pr, &iov);
>>> +	pr_debug("get_pagemap  ret %d\n", ret);
>>> +	ret = seek_pagemap_page(pr, addr, true);
>>> +	pr_debug("seek_pagemap_page %x\n", ret);
>>> +	ret = pr->read_pages(pr, addr, 1, buf);
>>> +	pr_debug("read_pages ret %d\n", ret);
>>> +	memcpy(dest, buf, PAGE_SIZE);
>>> +#ifndef rewind
>>> +	pr->close(pr);
>>> +#endif
>>> +}
>>> +
>>> +#define UFFD_FLAG_SENT	0x1
>>> +#define UFFD_FLAG_VDSO	0x2
>>> +
>>> +struct uffd_pages_struct {
>>> +        struct list_head                list;
>>> +        unsigned long                   addr;
>>> +        int                             flags;
>>> +};
>>> +
>>> +static int uffd_copy_page(int uffd, struct page_read *pr, __u64 address,
>>> +			  void *dest)
>>> +{
>>> +	struct uffdio_copy uffdio_copy;
>>> +	int rc;
>>> +
>>> +	get_page(address, dest, pr);
>>
>> But there can be no page with that address in the images. This is pretty
>> possble and in this case we should install zero page into task with the
>> correcponding ioctl.
> 
> I think, I do not understand this comment. Can you elaborate a bit more
> how/why zero pages are needed?

Sure. If we have an area of two pages with only one being present, migrate it 
then we can receive uffd event for both -- used-to-be present page and non
present one. For the former we have the page in the image and can uffd-copy
it into requesting task. But for the latter we have no page and will have
to push zero page into task just as kernel does for such cases.

>>> +
>>> +	uffdio_copy.dst = address;
>>> +	uffdio_copy.src = (unsigned long) dest;
>>> +	uffdio_copy.len = page_size();
>>> +	uffdio_copy.mode = 0;
>>> +	uffdio_copy.copy = 0;
>>> +
>>> +	pr_debug("uffdio_copy.dst 0x%llx\n", uffdio_copy.dst);
>>> +	rc = ioctl(uffd, UFFDIO_COPY, &uffdio_copy);
>>> +	pr_debug("ioctl UFFDIO_COPY rc 0x%x\n", rc);
>>> +	pr_debug("uffdio_copy.copy 0x%llx\n", uffdio_copy.copy);
>>> +	if (rc) {
>>> +		/* real retval in ufdio_copy.copy */
>>> +		if (uffdio_copy.copy != -EEXIST) {
>>> +			pr_err("UFFDIO_COPY error %Ld\n", uffdio_copy.copy);
>>> +			return -1;
>>> +		}
>>> +	} else if (uffdio_copy.copy != page_size()) {
>>> +		pr_err("UFFDIO_COPY unexpected size %Ld\n", uffdio_copy.copy);
>>> +		return -1;
>>> +	}
>>> +
>>> +
>>> +	return uffdio_copy.copy;
>>> +
>>> +}
>>> +
>>
>>> +int uffd_listen()
>>> +{
>>> +	__u64 address;
>>> +	void *dest;
>>> +	__u64 flags;
>>> +	struct uffd_msg msg;
>>> +	struct page_read pr;
>>> +	unsigned long ps;
>>> +	int rc;
>>> +	fd_set set;
>>> +	struct timeval timeout;
>>> +	int uffd;
>>> +	unsigned long uffd_copied_pages = 0;
>>> +	int uffd_flags;
>>> +	struct uffd_pages_struct *uffd_pages;
>>> +	bool vdso_sent;
>>> +	unsigned long vma_size = 0;
>>> +
>>> +	LIST_HEAD(uffd_list);
>>> +
>>> +	pr_debug("Waiting for incoming connections\n");
>>> +	if ((uffd = ud_open()) < 0)
>>> +		exit(0);
>>> +
>>> +	pr_debug("uffd is 0x%d\n", uffd);
>>> +	uffd_flags = fcntl(uffd, F_GETFD, NULL);
>>> +	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
>>> +
>>> +	/* Setting up criu infrastructure to easily access the dump results */
>>> +	criu_init();
>>> +
>>> +	/* Initialize FD sets for read() with timeouts (using select()) */
>>> +	FD_ZERO(&set);
>>> +	FD_SET(uffd, &set);
>>> +
>>> +	/* All operations will be done on page size */
>>> +	ps = page_size();
>>> +	dest = malloc(ps);
>>> +
>>> +#ifdef rewind
>>> +	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK | PR_MOD);
>>> +	pr_debug("open_page_read rc %d\n", rc);
>>> +#endif
>>> +
>>> +
>>> +	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK);
>>> +	if (rc <= 0)
>>> +		return 1;
>>> +	/*
>>> +	 * This puts all pages which should be handled by userfaultfd
>>> +	 * in the list uffd_list. This list is later used to detect if
>>> +	 * a page has already been transferred or if it needs to be
>>> +	 * pushed into the process using userfaultfd.
>>> +	 */
>>> +	do {
>>> +		rc = collect_uffd_pages(&pr, &uffd_list, &vma_size);
>>> +		if (rc == -1)
>>> +			return 1;
>>> +	} while (rc);
>>> +
>>> +	if (pr.close)
>>> +		pr.close(&pr);
>>> +
>>> +	while (1) {
>>> +		bool page_sent = false;
>>> +		/*
>>> +		 * Setting the timeout to 5 seconds. If after this time
>>> +		 * no uffd pages are requested the code switches to
>>> +		 * copying the remaining pages.
>>> +		 *
>>> +		 * Timeout is re-defined every time select() is run as
>>> +		 * select(2) says:
>>> +		 *  Consider timeout to be undefined after select() returns.
>>> +		 */
>>> +		timeout.tv_sec = 5;
>>> +		timeout.tv_usec = 0;
>>> +		rc = select(uffd + 1, &set, NULL, NULL, &timeout);
>>> +		pr_debug("select() rc: 0x%x\n", rc);
>>> +		if (rc == 0) {
>>> +			pr_debug("read timeout\n");
>>> +			pr_debug("switching from request to copy mode\n");
>>> +			break;
>>> +		}
>>> +		rc = read(uffd, &msg, sizeof(msg));
>>> +		pr_debug("read() rc: 0x%x\n", rc);
>>> +
>>> +		if (rc != sizeof(msg)) {
>>> +			if (rc < 0)
>>> +				pr_perror("read error");
>>> +			else
>>> +				pr_debug("short read\n");
>>> +			continue;
>>> +		}
>>> +
>>> +		/* Align requested address to the next page boundary */
>>> +		address = msg.arg.pagefault.address & ~(ps - 1);
>>> +		pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
>>> +
>>> +		/*
>>> +		 * At this point the process on the other side waits for the first page.
>>> +		 * In the first step we will force the vdso pages into the new process.
>>> +		 */
>>> +		if (!vdso_sent) {
>>> +			pr_debug("Pushing VDSO pages once\n");
>>> +			list_for_each_entry(uffd_pages, &uffd_list, list) {
>>> +				if (!(uffd_pages->flags & UFFD_FLAG_VDSO))
>>> +					continue;
>>> +				rc = uffd_copy_page(uffd, &pr, uffd_pages->addr,
>>> +						    dest);
>>> +				if (rc < 0) {
>>> +					pr_err("Error during UFFD copy\n");
>>> +					return 1;
>>> +				}
>>> +				vma_size -= rc;
>>> +				uffd_copied_pages++;
>>> +				uffd_pages->flags |= UFFD_FLAG_SENT;
>>> +			}
>>> +			vdso_sent = true;
>>> +		}
>>> +
>>> +		/* Make sure to not transfer a page twice */
>>
>> But how can this happen? If you put the page into task and it asks for it
>> _again_, this means that something goes wrong, doesn' it?
> 
> I probably need to explain a bit more in detail what I am doing here.
> 
> Right now, I am waiting on the FD if a page is requested for up to 5
> seconds. If a page is requested I am copying into the process. If for 5
> seconds no more pages are requested I am starting to copy all remaining
> pages into the process.
> 
> The first page that usually is requested is a VDSO page. The process I
> am restoring has two VDSO pages, but only one is requested via
> userfaultfd. In the second part where I am copying the remaining pages
> into the process, the second VDSO page is also copied into the process
> as it has not been requested previously.

Yup, that's clear.

> Unfortunately, even as this
> page has not been requested before, it is not accepted by userfaultfd. 

Hm...

> I > get a EINVAL. Not sure why this is the case. Therefore I am now first
> copying the VDSO pages into the process, then I am switching to request
> mode and copying the pages which are requested via userfaultfd.

But the page is not accepted by uffd, why in the request mode it differs?

> To decide at which point I can copy the VDSO pages into the process I am
> waiting for the first page requested via userfaultfd. This is one of the
> VDSO pages. To not copy a page a second time, which is unnecessary and
> not possible, I am now checking if the page has been transferred
> previously.

Hm... What if we force vdso pages to be transfered always synchronously,
so uffd would never see them, would this double copy problem go away?

I have  asubtle feeling, that uffd-ing vdso pages is supposed to work at all
in the kernel.

> That's why I am doing the things the way I am doing it right now. Hope
> this makes it a bit clearer.
> 
> 
>>> +		list_for_each_entry(uffd_pages, &uffd_list, list) {
>>> +			if ((uffd_pages->addr == address) &&
>>> +			    (uffd_pages->flags & UFFD_FLAG_SENT)) {
>>> +				page_sent = true;
>>> +				break;
>>> +			}
>>> +		}
>>> +
>>> +		if (page_sent)
>>> +			continue;
>>> +
>>> +		/* Now handle the pages actually requested. */
>>> +
>>> +		flags = msg.arg.pagefault.flags;
>>> +		pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
>>> +
>>> +		if (msg.event != UFFD_EVENT_PAGEFAULT) {
>>> +			pr_err("unexpected msg event %u\n", msg.event);
>>> +			return 1;
>>> +		}
>>> +
>>> +		rc = uffd_copy_page(uffd, &pr, address, dest);
>>> +		if (rc < 0) {
>>> +			pr_err("Error during UFFD copy\n");
>>> +			return 1;
>>> +		}
> .
> 

-- Pavel