[CRIU] [PATCH v4] Try to include userfaultfd with criu

Pavel Emelyanov xemul at virtuozzo.com
Wed Mar 9 03:44:11 PST 2016


Hi, Adrian

Great progress :) Please, find my comments inline.

> To handle the required pages a new criu command has been added. For a
> userfaultfd supported restore the first step is to start the
> 'lazy-pages' server:
> 
>   criu lazy-pages -v4 -D /tmp/3/ --address /tmp/userfault.socket
> 
> This waits on a unix domain socket (defined using the --address option)
> to receive a userfaultfd file descriptor from a '--lazy-pages' enabled
> 'criu restore':
> 
>   criu restore -D /tmp/3 -j -v4 --lazy-pages \
>   --address /tmp/userfault.socket

Would you split the patch into two then -- the first one is to introduce
the lazy-pages action and the second one is to make --lazy-pages option
for restore?

> ...
>  #include "protobuf.h"
> @@ -469,6 +472,17 @@ static int restore_priv_vma_content(void)
>  			p = decode_pointer((off) * PAGE_SIZE +
>  					vma->premmaped_addr);
>  
> +			/*
> +			 * This means that userfaultfd is used to load the pages
> +			 * on demand.
> +			 */
> +			if (opts.lazy_pages && (vma->e->flags & MAP_ANONYMOUS) &&
> +					(vma->e->flags & MAP_PRIVATE)) {

Please, introduce the vma_can_be_lazy() helper that we will extend while
the uffd supports more.

> +				pr_debug("Lazy restore skips %lx\n", vma->e->start);
> +				pr.skip_pages(&pr, PAGE_SIZE);
> +				continue;
> +			}
> +
>  			set_bit(off, vma->page_bitmap);
>  			if (vma->ppage_bitmap) { /* inherited vma */
>  				clear_bit(off, vma->ppage_bitmap);
> @@ -2131,7 +2145,7 @@ out:
>  	return -1;
>  }
>  
> -static int prepare_task_entries(void)
> +int prepare_task_entries(void)
>  {
>  	task_entries_pos = rst_mem_align_cpos(RM_SHREMAP);
>  	task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
> @@ -2964,6 +2978,54 @@ static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_HAS_UFFD
> +
> +static int send_uffd(int sendfd)

Maybe put this routine into uffd.c?

> +{
> +	int fd;
> +	int len;
> +	int ret = -1;
> +	struct sockaddr_un sun;
> +
> +	if (!opts.addr) {
> +		pr_info("Please specify a file name for the unix domain socket\n");
> +		pr_info("used to communicate between the lazy-pages server\n");
> +		pr_info("and the restore process. Use the --address option like\n");
> +		pr_info("criu restore --lazy-pages --address /tmp/userfault.socket\n");
> +		return -1;
> +	}
> +
> +	if (sendfd < 0)
> +		return -1;
> +
> +	if (strlen(opts.addr) >= sizeof(sun.sun_path)) {
> +		return -1;
> +	}
> +
> +	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
> +		return -1;
> +
> +	memset(&sun, 0, sizeof(sun));
> +	sun.sun_family = AF_UNIX;
> +	strcpy(sun.sun_path, opts.addr);
> +	len = offsetof(struct sockaddr_un, sun_path) + strlen(opts.addr);
> +	if (connect(fd, (struct sockaddr *) &sun, len) < 0) {
> +		pr_perror("connect to %s failed", opts.addr);
> +		goto out;
> +	}
> +

> ...
> @@ -3219,6 +3281,37 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
>  
>  	strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
>  
> +	if (!opts.lazy_pages)
> +		task_args->uffd = -1;
> +#ifdef CONFIG_HAS_UFFD
> +	else {
> +		struct uffdio_api uffdio_api;
> +		/*
> +		 * Open userfaulfd FD which is passed to the restorer blob and
> +		 * to a second process handling the userfaultfd page faults.
> +		 */
> +		task_args->uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
> +
> +		/*
> +		 * Check if the UFFD_API is the one which is expected
> +		 */
> +		uffdio_api.api = UFFD_API;
> +		uffdio_api.features = 0;
> +		if (ioctl(task_args->uffd, UFFDIO_API, &uffdio_api)) {
> +			pr_err("Checking for UFFDIO_API failed.\n");
> +			goto err;
> +		}
> +		if (uffdio_api.api != UFFD_API) {
> +			pr_err("Result of looking up UFFDIO_API does not match: %Lu\n", uffdio_api.api);
> +			goto err;
> +		}
> +
> +		if (send_uffd(task_args->uffd) < 0) {
> +			close(task_args->uffd);
> +			goto err;
> +		}
> +	}
> +#endif

Can we have this hunk be a routine in uffd.c (or a static inline in a header when
CONFIG_UFFD is not set)?

>  
>  	/*
>  	 * Fill up per-thread data.

> ...
> @@ -51,6 +51,7 @@ struct page_read {
>  	/* stop working on current pagemap */
>  	void (*put_pagemap)(struct page_read *);
>  	void (*close)(struct page_read *);
> +	void (*skip_pages)(struct page_read *, unsigned long len);

Ah, you need the skip_pages callbacks from page_read-s. Please, make it
be another separate patch.

>  
>  	/* Private data of reader */
>  	struct cr_img *pmi;

> ...
> +static void enable_uffd(int uffd, unsigned long addr, unsigned long len)
> +{
> +	/*
> +	 * If uffd == -1, this means that userfaultfd is not enabled
> +	 * or it is not available.
> +	 */
> +	if (uffd == -1)
> +		return;
> +#ifdef CONFIG_HAS_UFFD
> +	int rc;
> +	struct uffdio_register uffdio_register;
> +	unsigned long expected_ioctls;
> +
> +	uffdio_register.range.start = addr;
> +	uffdio_register.range.len = len;
> +	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
> +	pr_info("lazy-pages: uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
> +	pr_info("lazy-pages: uffdio_register.len 0x%llx\n", uffdio_register.range.len);
> +	rc = sys_ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);

The failrure to register the region should be handled.

> +	pr_info("lazy-pages: ioctl UFFDIO_REGISTER rc %d\n", rc);
> +	pr_info("lazy-pages: uffdio_register.range.start 0x%lx\n", (unsigned long) uffdio_register.range.start);
> +	pr_info("lazy-pages: uffdio_register.len 0x%llx\n", uffdio_register.range.len);
> +
> +	expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE);
> +
> +	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
> +		pr_err("lazy-pages: unexpected missing uffd ioctl for anon memory\n");
> +	}
> +
> +#endif
> +
> +}
> +
> +
> +static int vma_remap(unsigned long src, unsigned long dst, unsigned long len, int uffd, int flags)
>  {
>  	unsigned long guard = 0, tmp;
>  

> ...
> +static void criu_init()
> +{
> +	/* TODO: return code checking */
> +	check_img_inventory();
> +	prepare_task_entries();
> +	prepare_pstree();
> +	collect_remaps_and_regfiles();
> +	prepare_shared_reg_files();
> +	prepare_remaps();
> +	prepare_mm_pid(root_item);

Wow! Do you really need all this stuff in uffd daemon? What for?

> +
> +	/* We found a PID */
> +	pr_debug("root_item->pid.virt %d\n", root_item->pid.virt);
> +	pr_debug("root_item->pid.real %d\n", root_item->pid.real);
> +}
> +
> +int uffd_listen()
> +{
> +	__u64 address;
> +	void *dest;
> +	__u64 flags;
> +	struct uffd_msg msg;
> +	struct page_read pr;
> +	unsigned long ps;
> +	int rc;
> +	fd_set set;
> +	struct timeval timeout;
> +	int uffd;
> +	unsigned long uffd_copied_pages = 0;
> +	int uffd_flags;
> +	struct uffd_pages_struct *uffd_pages;
> +	bool vdso_sent = false;
> +	unsigned long vma_size = 0;
> +
> +	LIST_HEAD(uffd_list);
> +
> +	if (!opts.addr) {
> +		pr_info("Please specify a file name for the unix domain socket\n");
> +		pr_info("used to communicate between the lazy-pages server\n");
> +		pr_info("and the restore process. Use the --address option like\n");
> +		pr_info("criu --lazy-pages --address /tmp/userfault.socket\n");
> +		return -1;
> +	}
> +
> +	pr_debug("Waiting for incoming connections on %s\n", opts.addr);
> +	if ((uffd = ud_open()) < 0)
> +		exit(0);
> +
> +	pr_debug("uffd is 0x%d\n", uffd);
> +	uffd_flags = fcntl(uffd, F_GETFD, NULL);
> +	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> +
> +	/* Setting up criu infrastructure to easily access the dump results */
> +	criu_init();
> +
> +	/* Initialize FD sets for read() with timeouts (using select()) */
> +	FD_ZERO(&set);
> +	FD_SET(uffd, &set);
> +
> +	/* All operations will be done on page size */
> +	ps = page_size();
> +	dest = malloc(ps);
> +
> +	rc = open_page_read(root_item->pid.virt, &pr, PR_TASK);

This seem to work only with one single task, doesn't it?

> +	if (rc <= 0)
> +		return 1;
> +	/*
> +	 * This puts all pages which should be handled by userfaultfd
> +	 * in the list uffd_list. This list is later used to detect if
> +	 * a page has already been transferred or if it needs to be
> +	 * pushed into the process using userfaultfd.
> +	 */

-- Pavel



More information about the CRIU mailing list