[CRIU] [PATCH RFC 6/8] criu: page-xfer: add PS_IOV_GET interface

Pavel Emelyanov xemul at virtuozzo.com
Mon May 30 07:57:13 PDT 2016


On 05/30/2016 03:29 PM, Mike Rapoport wrote:
> On Mon, May 30, 2016 at 02:00:41PM +0300, Pavel Emelyanov wrote:
>> On 05/29/2016 09:52 AM, Mike Rapoport wrote:
>>> On Fri, May 27, 2016 at 10:31:59PM +0300, Pavel Emelyanov wrote:
>>>> On 05/21/2016 01:49 PM, Mike Rapoport wrote:
>>>>> When the dump side is acting as a lazy pages server, it should be
>>>>> able to respond to random page access requests.
>>>>> The protocol is quite simple:
>>>>> - the restore side sends a PS_IOV_GET command with the PID, address
>>>>>   and number of pages it wishes to get
>>>>
>>>> Ack
>>>>
>>>>> - the dump side replies with a PS_IOV_GET command.
>>>>
>>>> Oops. Why PS_IOV_GET? We have PS_IOV_ADD for sending pages.
>>>  
>>> PS_IOV_ADD is for pushing pages. PS_IOV_GET is for pulling them :)
>>
>> Yes, _GET is what restore side sends to dump side, but then dump side
>> should just do regular PS_IOV_ADD, no? (one more comment below).
>  
> What do you mean by PS_IOV_ADD? Set the pi.cmd to IOV_ADD or use
> ->write_pagemap and ->write_pages?

Actually both :)
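
Something along these lines, roughly (just a sketch; reply_with_add() and
send_iov_hdr() are made-up names, the real thing would go via the xfer ops):

	/*
	 * Sketch only: answer a PS_IOV_GET request by re-using the
	 * PS_IOV_ADD framing, so the restore side can feed the reply
	 * into the same receive path it already has for pushed pages.
	 * send_iov_hdr() is a hypothetical helper.
	 */
	static int reply_with_add(int sk, struct page_server_iov *req)
	{
		struct page_server_iov rep = *req;

		rep.cmd = PS_IOV_ADD;	/* same framing as the push path */

		if (send_iov_hdr(sk, &rep))
			return -1;

		/* ... then splice the page data, as ->write_pages does */
		return 0;
	}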

> The latter requires a page_xfer, which in turn is pretty much a
> per-process object, at least at the moment.

Yes, it is.

> We can add a PID-agnostic, say, lazy-page-xfer and helpers for
> ->write_pagemap and ->write_pages to be used by both lazy-page-xfer
> and page-server-xfer.
> 
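That would work. Roughly something like this (a sketch only, all names
here are illustrative, not from the tree):

	/*
	 * Sketch: a PID-agnostic xfer keeps the same ops as
	 * struct page_xfer, so one set of helpers can back both
	 * the lazy path and the page-server path.
	 */
	struct lazy_page_xfer {
		int sk;		/* page server socket, no per-process id */

		int (*write_pagemap)(struct lazy_page_xfer *xfer, struct iovec *iov);
		int (*write_pages)(struct lazy_page_xfer *xfer, int p, unsigned long len);
		int (*write_hole)(struct lazy_page_xfer *xfer, struct iovec *iov);
	};
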
>>>>>   The nr_pages field is
>>>>>   updated to reflect the actual number of pages that the dump side is
>>>>>   going to send. If the pages in question are mapped to the zero pfn,
>>>>>   the entire PS_IOV_GET reply is zeroed.
>>>>> - After the PS_IOV_GET command, the dump side sends the actual page data
>>>>>
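So, spelling the exchange out (a sketch of the framing only, using the
page_server_iov fields from this patch):

	/* restore -> dump: request */
	struct page_server_iov req = {
		.cmd		= PS_IOV_GET,
		.dst_id		= pid,
		.vaddr		= addr,
		.nr_pages	= nr_pages,	/* pages wanted */
	};

	/*
	 * dump -> restore: reply header with nr_pages clamped to what
	 * will actually be sent, followed by nr_pages * PAGE_SIZE bytes
	 * of raw page data; an all-zero header means "zero page", with
	 * no data following.
	 */
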
>>>>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
>>>>> ---
>>>>>  criu/include/page-xfer.h |  2 +
>>>>>  criu/page-xfer.c         | 99 ++++++++++++++++++++++++++++++++++++++++++++----
>>>>>  2 files changed, 93 insertions(+), 8 deletions(-)
>>>>>
>>>>>  
>>>>> +static int page_server_get_pages(int sk, struct page_server_iov *pi)
>>>>> +{
>>>>> +	struct pstree_item *item;
>>>>> +	struct page_pipe *pp;
>>>>> +	struct page_pipe_buf *ppb;
>>>>> +	struct iovec *iov;
>>>>> +	int ret;
>>>>> +
>>>>> +	item = pstree_item_by_virt(pi->dst_id);
>>>>> +	if (!item)
>>>>> +		return -1;
>>>>> +
>>>>> +	pp = item->parasite_ctl->mem_pp;
>>>>> +
>>>>> +	ret = page_pipe_split(pp, pi->vaddr, &pi->nr_pages);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>> +
>>>>> +	if (pi->nr_pages == 0) {
>>>>> +		/* no iovs found means we've hit a zero page */
>>>>> +		pr_debug("no iovs found, zero pages\n");
>>>>> +		memset(pi, 0, sizeof(*pi));
>>
>> This looks like PS_IOV_HOLE. But even if it doesn't, let's add some
>> special PS_IOV_..._ZERO_PAGE command for this instead of a zeroed-out pi.
> 
> If I understand correctly, HOLE means that the pages are in the parent
> checkpoint. I'd go with ZERO_PAGE...
> 
> Again, we can add some helper to ->write_hole :)

Please!
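
E.g. something like this (PS_IOV_ZERO is a placeholder name, whatever we
end up calling it):

	/*
	 * Sketch: reply with an explicit zero-page command instead of
	 * a zeroed-out header. PS_IOV_ZERO is a placeholder name.
	 */
	if (pi->nr_pages == 0) {
		pi->cmd = PS_IOV_ZERO;
		return write(sk, pi, sizeof(*pi)) != sizeof(*pi);
	}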

>>>>> +
>>>>> +		return write(sk, pi, sizeof(*pi)) != sizeof(*pi);
>>>>> +	}
>>>>> +
>>>>> +	ppb = list_first_entry(&pp->bufs, struct page_pipe_buf, l);
>>>>> +	iov = &ppb->iov[0];
>>>>> +
>>>>> +	BUG_ON(!(ppb->flags & PPB_LAZY));
>>>>> +	BUG_ON(iov->iov_len != pi->nr_pages * PAGE_SIZE);
>>>>> +	BUG_ON(pi->vaddr != encode_pointer(iov->iov_base));
>>>>> +
>>>>> +	if (write(sk, pi, sizeof(*pi)) != sizeof(*pi))
>>>>> +		return -1;
>>>>> +
>>>>> +	ret = splice(ppb->p[0], NULL, sk, NULL, iov->iov_len, SPLICE_F_MOVE);
>>>>> +	if (ret != iov->iov_len)
>>>>> +		return -1;
>>>>> +
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>>  static int page_server_check_parent(int sk, struct page_server_iov *pi);
>>>>>  
>>>>>  static int page_server_serve(int sk)
>>>>> @@ -190,14 +233,16 @@ static int page_server_serve(int sk)
>>>>>  	 */
>>>>>  	tcp_nodelay(sk, true);
>>>>>  
>>>>> -	if (pipe(cxfer.p)) {
>>>>> -		pr_perror("Can't make pipe for xfer");
>>>>> -		close(sk);
>>>>> -		return -1;
>>>>> -	}
>>>>> +	if (!opts.lazy_pages) {
>>>>> +		if (pipe(cxfer.p)) {
>>>>> +			pr_perror("Can't make pipe for xfer");
>>>>> +			close(sk);
>>>>> +			return -1;
>>>>> +		}
>>>>>  
>>>>> -	cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
>>>>> -	pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
>>>>> +		cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
>>>>> +		pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
>>>>> +	}
>>>>>  
>>>>>  	while (1) {
>>>>>  		struct page_server_iov pi;
>>>>> @@ -249,6 +294,10 @@ static int page_server_serve(int sk)
>>>>>  			flushed = true;
>>>>>  			break;
>>>>>  		}
>>>>> +		case PS_IOV_GET:
>>>>> +			flushed = true;
>>>>> +			ret = page_server_get_pages(sk, &pi);
>>>>> +			break;
>>>>>  		default:
>>>>>  			pr_err("Unknown command %u\n", pi.cmd);
>>>>>  			ret = -1;
>>>>> @@ -291,7 +340,8 @@ int cr_page_server(bool daemon_mode, int cfd)
>>>>>  	int sk = -1;
>>>>>  	int ret;
>>>>>  
>>>>> -	up_page_ids_base();
>>>>> +	if (!opts.lazy_pages)
>>>>> +		up_page_ids_base();
>>>>>  
>>>>>  	if (opts.ps_socket != -1) {
>>>>>  		ret = 0;
>>>>> @@ -787,3 +837,36 @@ int check_parent_page_xfer(int fd_type, long id)
>>>>>  	else
>>>>>  		return check_parent_local_xfer(fd_type, id);
>>>>>  }
>>>>> +
>>>>> +int get_remote_pages(int pid, unsigned long addr, int nr_pages, void *dest)
>>>>> +{
>>>>> +	int ret;
>>>>> +
>>>>> +	struct page_server_iov pi = {
>>>>> +		.cmd = PS_IOV_GET,
>>>>> +		.nr_pages = nr_pages,
>>>>> +		.vaddr = addr,
>>>>> +		.dst_id = pid,
>>>>> +	};
>>>>> +
>>>>> +	ret = write(page_server_sk, &pi, sizeof(pi));
>>>>> +	if (ret != sizeof(pi))
>>>>> +		return -1;
>>>>> +
>>>>> +	ret = recv(page_server_sk, &pi, sizeof(pi), MSG_WAITALL);
>>>>> +	if (ret != sizeof(pi))
>>>>> +		return -1;
>>>>> +
>>>>> +	/* zero page */
>>>>> +	if (pi.cmd == 0 && pi.vaddr == 0 && pi.nr_pages == 0 && pi.dst_id == 0)
>>>>> +		return 0;
>>>>> +
>>>>> +	if (pi.nr_pages > nr_pages)
>>>>> +		return -1;
>>>>> +
>>>>> +	ret = recv(page_server_sk, dest, pi.nr_pages * PAGE_SIZE, MSG_WAITALL);
>>>>> +	if (ret != pi.nr_pages * PAGE_SIZE)
>>>>> +		return -1;
>>>>> +
>>>>> +	return 1;
>>>>> +}
>>>>>
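
For reference, the restore-side caller would look something like this (a
sketch with the userfaultfd setup and error handling elided;
handle_fault() is a made-up name, needs <linux/userfaultfd.h> and
<sys/ioctl.h>):

	/*
	 * Sketch: on a userfaultfd fault, pull the page from the dump
	 * side and copy it into place.
	 */
	static int handle_fault(int uffd, int pid, unsigned long addr, void *buf)
	{
		struct uffdio_copy cpy;
		int ret;

		ret = get_remote_pages(pid, addr, 1, buf);
		if (ret < 0)
			return -1;
		if (ret == 0)				/* zero page */
			memset(buf, 0, PAGE_SIZE);

		cpy.dst  = addr;
		cpy.src  = (unsigned long)buf;
		cpy.len  = PAGE_SIZE;
		cpy.mode = 0;
		cpy.copy = 0;

		return ioctl(uffd, UFFDIO_COPY, &cpy);
	}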