[CRIU] [PATCH 07/11] lazy-pages: make handle_requests also poll for listening socket

Mike Rapoport rppt at linux.vnet.ibm.com
Tue Apr 12 05:21:22 PDT 2016


On Tue, Apr 12, 2016 at 03:08:14PM +0300, Pavel Emelyanov wrote:
> On 04/12/2016 02:27 PM, Mike Rapoport wrote:
> > On Tue, Apr 12, 2016 at 02:06:22PM +0300, Pavel Emelyanov wrote:
> >> On 04/11/2016 09:19 AM, Mike Rapoport wrote:
> >>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> >>> ---
> >>>  criu/uffd.c | 119 +++++++++++++++++++++++++++++++++++++++---------------------
> >>>  1 file changed, 77 insertions(+), 42 deletions(-)
> >>>
> >>> diff --git a/criu/uffd.c b/criu/uffd.c
> >>> index 87c2602..fece56e 100644
> >>> --- a/criu/uffd.c
> >>> +++ b/criu/uffd.c
> >>> @@ -545,32 +545,56 @@ static int handle_user_fault(int fd, struct list_head *uffd_list, void *dest)
> >>>  	return 0;
> >>>  }
> >>>  
> >>> +static int handle_socket_conn(int listen, struct sockaddr_un *saddr,
> >>> +			      struct list_head *uffd_list)
> >>> +{
> >>> +	int uffd;
> >>> +	int uffd_flags;
> >>> +
> >>> +	uffd = ud_open(listen, saddr);
> >>> +	if (uffd < 0) {
> >>> +		pr_perror("uffd open error");
> >>> +		return -1;
> >>> +	}
> >>> +
> >>> +	pr_debug("uffd is 0x%d\n", uffd);
> >>> +	uffd_flags = fcntl(uffd, F_GETFD, NULL);
> >>> +	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> >>> +	if (fcntl(uffd, F_SETFD, uffd_flags | O_NONBLOCK))
> >>> +		return -1;
> >>> +
> >>> +	/*
> >>> +	 * Find the memory pages belonging to the restored process so
> >>> +	 * that it is trackable when all pages have been transferred.
> >>> +	 */
> >>> +	if ((total_pages = find_vmas(uffd_list)) == -1)
> >>> +		return -1;
> >>> +
> >>> +	return uffd;
> >>> +}
> >>> +
> >>>  #define POLL_TIMEOUT 5000
> >>>  
> >>> -static int handle_requests(int fd)
> >>> +static int handle_requests(int listen, struct sockaddr_un *saddr)
> >>>  {
> >>>  	struct pollfd *pollfd;
> >>> -	int nr_pollfd;
> >>> +	int nr_pollfd = 1;
> >>> +	int max_pollfd;
> >>> +	int timeout = -1;
> >>>  	int ret = -1;
> >>>  	unsigned long ps;
> >>>  	void *dest;
> >>> +	int i;
> >>>  
> >>>  	LIST_HEAD(uffd_list);
> >>>  
> >>> -	/*
> >>> -	 * Find the memory pages belonging to the restored process
> >>> -	 * so that it is trackable when all pages have been transferred.
> >>> -	 */
> >>> -	if ((total_pages = find_vmas(&uffd_list)) == -1)
> >>> -		return -1;
> >>> -
> >>>  	/* All operations will be done on page size */
> >>>  	ps = page_size();
> >>>  	dest = malloc(ps);
> >>>  
> >>>  	/* allocate pollfd per task + pollfd for listen */
> >>> -	nr_pollfd = task_entries->nr_tasks + 1;
> >>> -	pollfd = malloc(sizeof(*pollfd) * nr_pollfd);
> >>> +	max_pollfd = task_entries->nr_tasks + 1;
> >>> +	pollfd = malloc(sizeof(*pollfd) * max_pollfd);
> >>>  	if (!pollfd || !dest) {
> >>>  		pr_err("memory allocation failed\n");
> >>>  		ret = -1;
> >>> @@ -578,12 +602,10 @@ static int handle_requests(int fd)
> >>>  	}
> >>>  
> >>>  	/*
> >>> -	 * use nr_pollfd to set number of pollfd's being polled
> >>> -	 * FIXME: eventually nr_pollfd should be nr_tasks + 1, and the
> >>> -	 * first fd to poll on should that of listen
> >>> +	 * use nr_pollfd to set the number of pollfds being polled; the
> >>> +	 * first pollfd is always for the listening socket
> >>>  	 */
> >>> -	nr_pollfd = 1;
> >>> -	pollfd[0].fd = fd;
> >>> +	pollfd[0].fd = listen;
> >>>  	pollfd[0].events = POLLIN;
> >>>  
> >>>  	while (1) {
> >>> @@ -592,7 +614,7 @@ static int handle_requests(int fd)
> >>>  		 * no uffd pages are requested the code switches to
> >>>  		 * copying the remaining pages.
> >>>  		 */
> >>> -		ret = poll(pollfd, nr_pollfd, POLL_TIMEOUT);
> >>> +		ret = poll(pollfd, nr_pollfd, timeout);
> >>
> >> But there should be strict split between the time we expect for new connections
> >> on this socket versus when pagefaults start arriving. Because first we restore
> >> the tree and send uffd sockets to this daemon and _then_ we resume the tasks and
> >> from now on no new connections may appear, but PF-s start popping up from uffds.
> >>
> >> Since we know this, why do we need to complicate this and poll for all the
> >> sockets together?
> > 
> > I had an impression that the uffd is sent to the daemon at the stage when
> > tasks are already running concurrently. If I was wrong here and the tasks
> > are resumed after all the uffd's are sent to the daemon, then we indeed
> > don't need to poll for uffd's and sockets together. 
> 
> Yes, we first fork() all the stuff and send the uffds out, then tasks are resumed
> and start generating PF-s. The problem here is that this "resume" point is not
> reported to uffd :) so he can only guess this moment.

And what if it guesses wrong? ;-)
Let's keep (e)polling for everything? :) 
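
Something like the sketch below is what I have in mind (untested, just to
illustrate "(e)polling for everything", not a real patch): one epoll set for
the listening socket and for every uffd we accept from it. ep_add() and
handle_requests_ep() are made-up names, POLL_TIMEOUT is the one we already
have in uffd.c, and the uffd_list/dest setup plus the remaining-pages copy
are left out here; they'd stay as they are in handle_requests().

#include <sys/epoll.h>
#include <sys/un.h>

struct list_head;	/* from include/common/list.h */

/* the helpers we already have in criu/uffd.c */
extern int ud_open(int listen, struct sockaddr_un *saddr);
extern int handle_user_fault(int fd, struct list_head *uffd_list, void *dest);

static int ep_add(int epollfd, int fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN,
		.data.fd = fd,
	};

	return epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev);
}

static int handle_requests_ep(int listen, struct sockaddr_un *saddr,
			      struct list_head *uffd_list, void *dest)
{
	struct epoll_event events[16];
	int epollfd, timeout = -1;
	int i, nr;

	epollfd = epoll_create1(0);
	if (epollfd < 0 || ep_add(epollfd, listen))
		return -1;

	while (1) {
		/* the timeout stays infinite until the first uffd arrives */
		nr = epoll_wait(epollfd, events, 16, timeout);
		if (nr < 0)
			return -1;
		if (nr == 0)
			break;	/* idle for POLL_TIMEOUT: go copy the rest */

		for (i = 0; i < nr; i++) {
			int fd = events[i].data.fd;

			if (fd == listen) {
				/* new connection: accept it, track its uffd */
				int uffd = ud_open(listen, saddr);

				if (uffd < 0 || ep_add(epollfd, uffd))
					return -1;
				timeout = POLL_TIMEOUT;
			} else {
				/* page fault on one of the accepted uffds */
				if (handle_user_fault(fd, uffd_list, dest) < 0)
					return -1;
			}
		}
	}

	return 0;
}

This way it doesn't matter when exactly the tasks are resumed: a late
connection simply gets added to the same set, and the idle timeout still
tells us when to switch to copying the remaining pages.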

> -- Pavel
> 
> >>>  		pr_debug("poll() rc: 0x%x\n", ret);
> >>>  		if (ret < 0) {
> >>>  			pr_perror("polling failed");
> >>> @@ -603,16 +625,42 @@ static int handle_requests(int fd)
> >>>  			break;
> >>>  		}
> >>>  
> >>> -		ret = handle_user_fault(fd, &uffd_list, dest);
> >>> -		if (ret < 0)
> >>> -			goto out;
> >>> +		if (pollfd[0].revents & POLLIN) {
> >>> +			int fd = handle_socket_conn(listen, saddr, &uffd_list);
> >>> +			if (fd < 0)
> >>> +				goto out;
> >>> +			pollfd[nr_pollfd].fd = fd;
> >>> +			pollfd[nr_pollfd].events = POLLIN;
> >>> +			nr_pollfd++;
> >>> +			BUG_ON(nr_pollfd > max_pollfd);
> >>> +
> >>> +			/*
> >>> +			 * once the first uffd is registered wait POLL_TIMEOUT
> >>> +			 * before starting bulk page transfer
> >>> +			 */
> >>> +			timeout = POLL_TIMEOUT;
> >>> +
> >>> +			continue;
> >>> +		}
> >>> +
> >>> +		for (i = 1; i < nr_pollfd; i++) {
> >>> +			if (pollfd[i].revents & POLLIN) {
> >>> +				ret = handle_user_fault(pollfd[i].fd,
> >>> +							&uffd_list, dest);
> >>> +				if (ret < 0)
> >>> +					goto out;
> >>> +			}
> >>> +		}
> >>>  	}
> >>> -	pr_debug("Handle remaining pages\n");
> >>> -	ret = handle_remaining_pages(fd, &uffd_list, dest);
> >>> -	if (ret < 0) {
> >>> -		pr_err("Error during remaining page copy\n");
> >>> -		ret = 1;
> >>> -		goto out;
> >>> +
> >>> +	for (i = 1; i < nr_pollfd; i++) {
> >>> +		pr_debug("Handle remaining pages\n");
> >>> +		ret = handle_remaining_pages(pollfd[i].fd, &uffd_list, dest);
> >>> +		if (ret < 0) {
> >>> +			pr_err("Error during remaining page copy\n");
> >>> +			ret = 1;
> >>> +			goto out;
> >>> +		}
> >>>  	}
> >>>  
> >>>  	pr_debug("With UFFD transferred pages: (%ld/%ld)\n", uffd_copied_pages, total_pages);
> >>> @@ -626,9 +674,10 @@ static int handle_requests(int fd)
> >>>  	ret = 0;
> >>>  
> >>>  out:
> >>> +	for (i = nr_pollfd - 1; i > 0; i--)
> >>> +		close(pollfd[i].fd);
> >>>  	free(pollfd);
> >>>  	free(dest);
> >>> -	close(fd);
> >>>  	return ret;
> >>>  
> >>>  }
> >>> @@ -657,8 +706,6 @@ static int lazy_pages_prepare_pstree(void)
> >>>  int cr_lazy_pages()
> >>>  {
> >>>  	int listen;
> >>> -	int uffd;
> >>> -	int uffd_flags;
> >>>  	int ret;
> >>>  	struct sockaddr_un saddr;
> >>>  
> >>> @@ -679,19 +726,7 @@ int cr_lazy_pages()
> >>>  		return -1;
> >>>  	}
> >>>  
> >>> -	uffd = ud_open(listen, &saddr);
> >>> -	if (uffd < 0) {
> >>> -		pr_perror("uffd open error");
> >>> -		return -1;
> >>> -	}
> >>> -
> >>> -	pr_debug("uffd is 0x%d\n", uffd);
> >>> -	uffd_flags = fcntl(uffd, F_GETFD, NULL);
> >>> -	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
> >>> -	if (fcntl(uffd, F_SETFD, uffd_flags | O_NONBLOCK))
> >>> -		return -1;
> >>> -
> >>> -	ret = handle_requests(uffd);
> >>> +	ret = handle_requests(listen, &saddr);
> >>>  	close(listen);
> >>>  
> >>>  	return ret;
> >>>
> >>