[CRIU] [PATCH 07/11] lazy-pages: make handle_requests also poll for listening socket

Tue Apr 12 04:06:22 PDT 2016

On 04/11/2016 09:19 AM, Mike Rapoport wrote:
> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> ---
>  criu/uffd.c | 119 +++++++++++++++++++++++++++++++++++++++---------------------
>  1 file changed, 77 insertions(+), 42 deletions(-)
> 
> diff --git a/criu/uffd.c b/criu/uffd.c
> index 87c2602..fece56e 100644
> --- a/criu/uffd.c
> +++ b/criu/uffd.c
> @@ -545,32 +545,56 @@ static int handle_user_fault(int fd, struct list_head *uffd_list, void *dest)
>  	return 0;
>  }
>  
> +static int handle_socket_conn(int listen, struct sockaddr_un *saddr,
> +			      struct list_head *uffd_list)
> +{
> +	int uffd;
> +	int uffd_flags;
> +
> +	uffd = ud_open(listen, saddr);
> +	if (uffd < 0) {
> +		pr_perror("uffd open error");
> +		return -1;
> +	}
> +
> +	pr_debug("uffd is 0x%d\n", uffd);
> +	uffd_flags = fcntl(uffd, F_GETFD, NULL);
> +	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> +	if (fcntl(uffd, F_SETFD, uffd_flags | O_NONBLOCK))
> +		return -1;
> +
> +	/*
> +	 * Find the memory pages belonging to the restored process so
> +	 * that it is trackable when all pages have been transferred.
> +	 */
> +	if ((total_pages = find_vmas(uffd_list)) == -1)
> +		return -1;
> +
> +	return uffd;
> +}
> +
>  #define POLL_TIMEOUT 5000
>  
> -static int handle_requests(int fd)
> +static int handle_requests(int listen, struct sockaddr_un *saddr)
>  {
>  	struct pollfd *pollfd;
> -	int nr_pollfd;
> +	int nr_pollfd = 1;
> +	int max_pollfd;
> +	int timeout = -1;
>  	int ret = -1;
>  	unsigned long ps;
>  	void *dest;
> +	int i;
>  
>  	LIST_HEAD(uffd_list);
>  
> -	/*
> -	 * Find the memory pages belonging to the restored process
> -	 * so that it is trackable when all pages have been transferred.
> -	 */
> -	if ((total_pages = find_vmas(&uffd_list)) == -1)
> -		return -1;
> -
>  	/* All operations will be done on page size */
>  	ps = page_size();
>  	dest = malloc(ps);
>  
>  	/* allocate pollfd per task + pollfd for listen */
> -	nr_pollfd = task_entries->nr_tasks + 1;
> -	pollfd = malloc(sizeof(*pollfd) * nr_pollfd);
> +	max_pollfd = task_entries->nr_tasks + 1;
> +	pollfd = malloc(sizeof(*pollfd) * max_pollfd);
>  	if (!pollfd || !dest) {
>  		pr_err("memory allocation failed\n");
>  		ret = -1;
> @@ -578,12 +602,10 @@ static int handle_requests(int fd)
>  	}
>  
>  	/*
> -	 * use nr_pollfd to set number of pollfd's being polled
> -	 * FIXME: eventually nr_pollfd should be nr_tasks + 1, and the
> -	 * first fd to poll on should that of listen
> +	 * use nr_pollfd to set number of pollfd's being polled, the
> +	 * first pollfd is always for the listening socket
>  	 */
> -	nr_pollfd = 1;
> -	pollfd[0].fd = fd;
> +	pollfd[0].fd = listen;
>  	pollfd[0].events = POLLIN;
>  
>  	while (1) {
> @@ -592,7 +614,7 @@ static int handle_requests(int fd)
>  		 * no uffd pages are requested the code switches to
>  		 * copying the remaining pages.
>  		 */
> -		ret = poll(pollfd, nr_pollfd, POLL_TIMEOUT);
> +		ret = poll(pollfd, nr_pollfd, timeout);

But there should be strict split between the time we expect for new connections
on this socket versus when pagefaults start arriving. Because first we restore
the tree and send uffd sockets to this daemon and _then_ we resume the tasks and
from now on no new connections may appear, but PF-s start popping up from uffds.

Since we know this, why do we need to complicate this and poll for all the
sockets together?

>  		pr_debug("poll() rc: 0x%x\n", ret);
>  		if (ret < 0) {
>  			pr_perror("polling failed");
> @@ -603,16 +625,42 @@ static int handle_requests(int fd)
>  			break;
>  		}
>  
> -		ret = handle_user_fault(fd, &uffd_list, dest);
> -		if (ret < 0)
> -			goto out;
> +		if (pollfd[0].revents & POLLIN) {
> +			int fd = handle_socket_conn(listen, saddr, &uffd_list);
> +			if (fd < 0)
> +				goto out;
> +			pollfd[nr_pollfd].fd = fd;
> +			pollfd[nr_pollfd].events = POLLIN;
> +			nr_pollfd++;
> +			BUG_ON(nr_pollfd > max_pollfd);
> +
> +			/*
> +			 * once the first uffd is register wait POLL_TIMEOUT
> +			 * before starting bulk page trasfer
> +			 */
> +			timeout = POLL_TIMEOUT;
> +
> +			continue;
> +		}
> +
> +		for (i = 1; i < nr_pollfd; i++) {
> +			if (pollfd[i].revents & POLLIN) {
> +				ret = handle_user_fault(pollfd[i].fd,
> +							&uffd_list, dest);
> +				if (ret < 0)
> +					goto out;
> +			}
> +		}
>  	}
> -	pr_debug("Handle remaining pages\n");
> -	ret = handle_remaining_pages(fd, &uffd_list, dest);
> -	if (ret < 0) {
> -		pr_err("Error during remaining page copy\n");
> -		ret = 1;
> -		goto out;
> +
> +	for (i = 1; i < nr_pollfd; i++) {
> +		pr_debug("Handle remaining pages\n");
> +		ret = handle_remaining_pages(pollfd[i].fd, &uffd_list, dest);
> +		if (ret < 0) {
> +			pr_err("Error during remaining page copy\n");
> +			ret = 1;
> +			goto out;
> +		}
>  	}
>  
>  	pr_debug("With UFFD transferred pages: (%ld/%ld)\n", uffd_copied_pages, total_pages);
> @@ -626,9 +674,10 @@ static int handle_requests(int fd)
>  	ret = 0;
>  
>  out:
> +	for (i = nr_pollfd - 1; i > 0; i--)
> +		close(pollfd[i].fd);
>  	free(pollfd);
>  	free(dest);
> -	close(fd);
>  	return ret;
>  
>  }
> @@ -657,8 +706,6 @@ static int lazy_pages_prepare_pstree(void)
>  int cr_lazy_pages()
>  {
>  	int listen;
> -	int uffd;
> -	int uffd_flags;
>  	int ret;
>  	struct sockaddr_un saddr;
>  
> @@ -679,19 +726,7 @@ int cr_lazy_pages()
>  		return -1;
>  	}
>  
> -	uffd = ud_open(listen, &saddr);
> -	if (uffd < 0) {
> -		pr_perror("uffd open error");
> -		return -1;
> -	}
> -
> -	pr_debug("uffd is 0x%d\n", uffd);
> -	uffd_flags = fcntl(uffd, F_GETFD, NULL);
> -	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> -	if (fcntl(uffd, F_SETFD, uffd_flags | O_NONBLOCK))
> -		return -1;
> -
> -	ret = handle_requests(uffd);
> +	ret = handle_requests(listen, &saddr);
>  	close(listen);
>  
>  	return ret;
>