[CRIU] [PATCH] lazy-pages: use random read from page-pipe instead of splitting it

Pavel Emelyanov xemul at virtuozzo.com
Mon Jun 19 19:41:56 MSK 2017


On 06/07/2017 11:53 AM, Mike Rapoport wrote:
> For the remote lazy pages case, to access pages in the middle of a pipe we
> split the page_pipe_buffers and iovecs and use splice() to move the
> data between the underlying pipes. After the splits we get a page_pipe_buffer
> with a single iovec that can be used to splice() the data further into the
> socket.
> This patch replaces the splitting and splicing with the use of a helper pipe
> and tee(). We tee() the pages from the beginning of the pipe up to the last
> requested page into a helper pipe, sink the unneeded head part into
> /dev/null and we get the requested pages ready for splice() into the
> socket.
> This allows the lazy-pages daemon to request the same page several times, which
> is required to properly support fork() after the restore.
> As an added bonus we simplify the code and reduce the number of pipes that live in
> the system.
> 
> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>

Acked-by: Pavel Emelyanov <xemul at virtuozzo.com>

> ---
>  criu/include/page-pipe.h | 10 +++++++
>  criu/page-pipe.c         | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
>  criu/page-xfer.c         | 59 +++++++++++++-------------------------
>  3 files changed, 102 insertions(+), 40 deletions(-)
> 
> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
> index 10ae873..76ec1fd 100644
> --- a/criu/include/page-pipe.h
> +++ b/criu/include/page-pipe.h
> @@ -138,4 +138,14 @@ extern int page_pipe_split(struct page_pipe *pp, unsigned long addr,
>  
>  extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb);
>  
> +struct pipe_read_dest {
> +	int p[2];
> +	int sink_fd;
> +};
> +
> +extern int pipe_read_dest_init(struct pipe_read_dest *prd);
> +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
> +			  unsigned long addr, unsigned int *nr_pages,
> +			  unsigned int ppb_flags);
> +
>  #endif /* __CR_PAGE_PIPE_H__ */
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index 4ebd0cb..4b4b3fc 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -474,6 +474,79 @@ int page_pipe_split(struct page_pipe *pp, unsigned long addr,
>  	return 0;
>  }
>  
> +int pipe_read_dest_init(struct pipe_read_dest *prd)
> +{
> +	int ret;
> +
> +	if (pipe(prd->p)) {
> +		pr_perror("Cannot create pipe for reading from page-pipe");
> +		return -1;
> +	}
> +
> +	ret = fcntl(prd->p[0], F_SETPIPE_SZ, PIPE_MAX_SIZE * PAGE_SIZE);
> +	if (ret < 0)
> +		return -1;
> +
> +	prd->sink_fd = open("/dev/null", O_WRONLY);
> +	if (prd->sink_fd < 0) {
> +		pr_perror("Cannot open sink for reading from page-pipe");
> +		return -1;
> +	}
> +
> +	ret = fcntl(prd->p[0], F_GETPIPE_SZ, 0);
> +	pr_debug("Created tee pipe size %d\n", ret);
> +
> +	return 0;
> +}
> +
> +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
> +		   unsigned long addr, unsigned int *nr_pages,
> +		   unsigned int ppb_flags)
> +{
> +	struct page_pipe_buf *ppb;
> +	struct iovec *iov = NULL;
> +	unsigned long skip = 0, len;
> +	int ret;
> +
> +	/*
> +	 * Get ppb that contains addr and count length of data between
> +	 * the beginning of the pipe and addr. If no ppb is found, the
> +	 * requested page is mapped to zero pfn
> +	 */
> +	ppb = get_ppb(pp, addr, &iov, &skip);
> +	if (!ppb) {
> +		*nr_pages = 0;
> +		return 0;
> +	}
> +
> +	if (!(ppb->flags & ppb_flags)) {
> +		pr_err("PPB flags mismatch: %x %x\n", ppb_flags, ppb->flags);
> +		return false;
> +	}
> +
> +	/* clamp the request if it passes the end of iovec */
> +	len = min((unsigned long)iov->iov_base + iov->iov_len - addr,
> +		  (unsigned long)(*nr_pages) * PAGE_SIZE);
> +	*nr_pages = len / PAGE_SIZE;
> +
> +	/* we should tee() the requested length + the beginning of the pipe */
> +	len += skip;
> +
> +	ret = tee(ppb->p[0], prd->p[1], len, 0);
> +	if (ret != len) {
> +		pr_perror("tee: %d", ret);
> +		return -1;
> +	}
> +
> +	ret = splice(prd->p[0], NULL, prd->sink_fd, NULL, skip, 0);
> +	if (ret != skip) {
> +		pr_perror("splice: %d", ret);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
>  void page_pipe_destroy_ppb(struct page_pipe_buf *ppb)
>  {
>  	list_del(&ppb->l);
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index c557407..49693bb 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -554,10 +554,19 @@ static struct page_xfer_job cxfer = {
>  	.dst_id = ~0,
>  };
>  
> +static struct pipe_read_dest pipe_read_dest = {
> +	.sink_fd = -1,
> +};
> +
>  static void page_server_close(void)
>  {
>  	if (cxfer.dst_id != ~0)
>  		cxfer.loc_xfer.close(&cxfer.loc_xfer);
> +	if (pipe_read_dest.sink_fd != -1) {
> +		close(pipe_read_dest.sink_fd);
> +		close(pipe_read_dest.p[0]);
> +		close(pipe_read_dest.p[1]);
> +	}
>  }
>  
>  static int page_server_open(int sk, struct page_server_iov *pi)
> @@ -653,43 +662,18 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags)
>  	return 0;
>  }
>  
> -static bool can_send_pages(struct page_pipe_buf *ppb, struct iovec *iov,
> -			   struct page_server_iov *pi)
> -{
> -	unsigned long len = pi->nr_pages * PAGE_SIZE;
> -
> -	if (!(ppb->flags & PPB_LAZY)) {
> -		pr_err("Requested pages are not lazy\n");
> -		return false;
> -	}
> -
> -	if (iov->iov_len != len) {
> -		pr_err("IOV len %zu does not match requested %lu\n",
> -		       iov->iov_len, len);
> -		return false;
> -	}
> -
> -	if(pi->vaddr != encode_pointer(iov->iov_base)) {
> -		pr_err("IOV start %p does not match requested addr %"PRIx64"\n",
> -		       iov->iov_base, pi->vaddr);
> -		return false;
> -	}
> -
> -	return true;
> -}
> -
>  static int page_server_get_pages(int sk, struct page_server_iov *pi)
>  {
>  	struct pstree_item *item;
>  	struct page_pipe *pp;
> -	struct page_pipe_buf *ppb;
> -	struct iovec *iov;
> +	unsigned long len;
>  	int ret;
>  
>  	item = pstree_item_by_virt(pi->dst_id);
>  	pp = dmpi(item)->mem_pp;
>  
> -	ret = page_pipe_split(pp, pi->vaddr, &pi->nr_pages);
> +	ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr,
> +			     &pi->nr_pages, PPB_LAZY);
>  	if (ret)
>  		return ret;
>  
> @@ -699,23 +683,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
>  		return send_psi(sk, PS_IOV_ZERO, 0, 0, 0);
>  	}
>  
> -	ppb = list_first_entry(&pp->bufs, struct page_pipe_buf, l);
> -	iov = &ppb->iov[0];
> -
> -	if (!can_send_pages(ppb, iov, pi))
> -		return -1;
> +	len = pi->nr_pages * PAGE_SIZE;
>  
>  	if (send_psi(sk, PS_IOV_ADD, pi->nr_pages, pi->vaddr, pi->dst_id))
>  		return -1;
>  
> -	ret = splice(ppb->p[0], NULL, sk, NULL, iov->iov_len, SPLICE_F_MOVE);
> -	if (ret != iov->iov_len)
> +	ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE);
> +	if (ret != len)
>  		return -1;
>  
>  	tcp_nodelay(sk, true);
>  
> -	page_pipe_destroy_ppb(ppb);
> -
>  	return 0;
>  }
>  
> @@ -723,8 +701,9 @@ static int page_server_serve(int sk)
>  {
>  	int ret = -1;
>  	bool flushed = false;
> +	bool receiving_pages = !opts.lazy_pages;
>  
> -	if (!opts.lazy_pages) {
> +	if (receiving_pages) {
>  		/*
>  		 * This socket only accepts data except one thing -- it
>  		 * writes back the has_parent bit from time to time, so
> @@ -741,6 +720,7 @@ static int page_server_serve(int sk)
>  		cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
>  		pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
>  	} else {
> +		pipe_read_dest_init(&pipe_read_dest);
>  		tcp_cork(sk, true);
>  	}
>  
> @@ -800,7 +780,6 @@ static int page_server_serve(int sk)
>  			break;
>  		}
>  		case PS_IOV_GET:
> -			flushed = true;
>  			ret = page_server_get_pages(sk, &pi);
>  			break;
>  		default:
> @@ -813,7 +792,7 @@ static int page_server_serve(int sk)
>  			break;
>  	}
>  
> -	if (!ret && !flushed) {
> +	if (receiving_pages && !ret && !flushed) {
>  		pr_err("The data were not flushed\n");
>  		ret = -1;
>  	}
> 



More information about the CRIU mailing list