[CRIU] [PATCH] vmsplice user-buffer(grabbed by process_vm_readv) to page-pipe

abhishek dubey dubeyabhishek777 at gmail.com
Thu Jul 11 12:27:20 MSK 2019


Hi Mike,

On 09/07/19 1:34 PM, Mike Rapoport wrote:
> Hi Abishek,
>
> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>> This patch implements usage of process_vm_readv
>> syscall to collect memory pages from target process during
>> pre-dump. process_vm_readv collects pages in user-buffer,
>> which is later vmspliced to page-pipes.
>  From what I can tell, the patch completely replaces splicing of the memory
> in the parasite code with usage of process_vm_readv() for both pre-dump and
> dump.
>
> I don't think that for dump that approach will be better then what we have
> now and surely it'll be more expensive.

Yes, for now its same for dump and pre-dump. process_vm_readv can't dump 
mappings without PROT_READ protection,

so for such cases we will fall back to parasite method in dump stage.

>> Signed-off-by: Abhishek Dubey <dubeyabhishek777 at gmail.com>
>> ---
>>   criu/cr-dump.c           |  3 +-
>>   criu/include/page-xfer.h |  2 +-
>>   criu/mem.c               | 88 ++++++++++++++++++------------------------------
>>   criu/page-pipe.c         |  4 +--
>>   criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>>   criu/shmem.c             | 19 +++--------
>>   6 files changed, 102 insertions(+), 76 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 7f2e5ed..ee5f4f3 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>>   		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>>   		struct page_pipe *mem_pp;
>>   		struct page_xfer xfer;
>> +		size_t off = 0;
>>   
>>   		if (!ctl)
>>   			continue;
>> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>   			goto err;
>>   
>>   		mem_pp = dmpi(item)->mem_pp;
>> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>>   
>>   		xfer.close(&xfer);
>>   
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..74d42f2 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -47,7 +47,7 @@ struct page_xfer {
>>   
>>   extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>>   struct page_pipe;
>> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
>> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 6a1a87a..844d726 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>>   	return args;
>>   }
>>   
>> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
>> -		      struct parasite_dump_pages_args *args)
>> -{
>> -	struct page_pipe_buf *ppb;
>> -	int ret = 0;
>> -
>> -	debug_show_page_pipe(pp);
>> -
>> -	/* Step 2 -- grab pages into page-pipe */
>> -	list_for_each_entry(ppb, &pp->bufs, l) {
>> -		args->nr_segs = ppb->nr_segs;
>> -		args->nr_pages = ppb->pages_in;
>> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
>> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
>> -
>> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
>> -		if (ret < 0)
>> -			return -1;
>> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
>> -		if (ret)
>> -			return -1;
>> -
>> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
>> -		if (ret < 0)
>> -			return -1;
>> -
>> -		args->off += args->nr_segs;
>> -	}
>> -
>> -	return 0;
>> -}
> The drain_pages() should remain as is for the dump case, the target
> processes are anyway frozen and we have the parasite code there. I see no
> point in replacing vmsplice() in the parasite with process_vm_readv() +
> vmsplice() in this case.
>
> Actually, I think that to use process_vm_readv() for the pre-dump case it
> would be enough to override drain_pages() so we'll have something like
> drain_pages_dump() that does vmsplice() via parasite and
> drain_pages_pre_dump() that performs process_vm_readv() to a buffer and
> then splices that buffer to the pipes.
Sure. I will handle pre-dump independent of dump as mentioned.
>> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
> Radostin already mentioned a couple of coding style issues, I'd suggest to
> use checkpatch.pl from Linux sources to verify the next patches.
Sure :)
>
>>   {
>>   	int ret;
>>   
>> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>>   	 *           pre-dump action (see pre_dump_one_task)
>>   	 */
>>   	timing_start(TIME_MEMWRITE);
>> -	ret = page_xfer_dump_pages(xfer, pp);
>> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>>   	timing_stop(TIME_MEMWRITE);
>>   
>>   	return ret;
>> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   			     struct page_pipe *pp, struct page_xfer *xfer,
>>   			     struct parasite_dump_pages_args *args,
>>   			     struct parasite_ctl *ctl, pmc_t *pmc,
>> -			     bool has_parent, bool pre_dump)
>> +			     bool has_parent, bool pre_dump, size_t *poff)
>>   {
>>   	u64 off = 0;
>>   	u64 *map;
>> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   				!vma_area_is(vma, VMA_ANON_SHARED))
>>   		return 0;
>>   
>> +	if (!(vma->e->prot & PROT_READ)){
>> +		if(pre_dump)
>> +			return 0;
>> +		has_parent = false;
>> +	}
>> +
>>   	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>>   		if (pre_dump)
>>   			return 0;
>> @@ -379,9 +353,7 @@ again:
>>   	if (ret == -EAGAIN) {
>>   		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>>   
>> -		ret = drain_pages(pp, ctl, args);
>> -		if (!ret)
>> -			ret = xfer_pages(pp, xfer);
>> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>>   		if (!ret) {
>>   			page_pipe_reinit(pp);
>>   			goto again;
>> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	unsigned long pmc_size;
>>   	int possible_pid_reuse = 0;
>>   	bool has_parent;
>> +	size_t poff = 0;
>>   
>>   	pr_info("\n");
>>   	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
>> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	/*
>>   	 * Step 1 -- generate the pagemap
>>   	 */
>> +	poff = 0;
>>   	args->off = 0;
>>   	has_parent = !!xfer.parent && !possible_pid_reuse;
>>   	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>>   		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
>> -					&pmc, has_parent, mdc->pre_dump);
>> +					&pmc, has_parent, mdc->pre_dump, &poff);
>>   		if (ret < 0)
>>   			goto out_xfer;
>>   	}
>> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	if (mdc->lazy)
>>   		memcpy(pargs_iovs(args), pp->iovs,
>>   		       sizeof(struct iovec) * pp->nr_iovs);
>> -	ret = drain_pages(pp, ctl, args);
>> -	if (!ret && !mdc->pre_dump)
>> -		ret = xfer_pages(pp, &xfer);
>> +	if(!mdc->pre_dump)
>> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>>   	if (ret)
>>   		goto out_xfer;
>>   
>> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>>   	 *
>>   	 * Afterwards -- reprotect memory back.
>>   	 */
>> +	if(!mdc->pre_dump){
>> +		pargs->add_prot = PROT_READ;
>> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> +		if (ret) {
>> +			pr_err("Can't dump unprotect vmas with parasite\n");
>> +			return ret;
>> +		}
>>   
>> -	pargs->add_prot = PROT_READ;
>> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> -	if (ret) {
>> -		pr_err("Can't dump unprotect vmas with parasite\n");
>> -		return ret;
>> -	}
>> -
>> -	if (fault_injected(FI_DUMP_PAGES)) {
>> -		pr_err("fault: Dump VMA pages failure!\n");
>> -		return -1;
>> +		if (fault_injected(FI_DUMP_PAGES)) {
>> +			pr_err("fault: Dump VMA pages failure!\n");
>> +			return -1;
>> +		}
>>   	}
>>   
>>   	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
>> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>>   		return ret;
>>   	}
>>   
>> -	pargs->add_prot = 0;
>> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> -		pr_err("Can't rollback unprotected vmas with parasite\n");
>> -		ret = -1;
>> +	if(!mdc->pre_dump){
>> +		pargs->add_prot = 0;
>> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> +			pr_err("Can't rollback unprotected vmas with parasite\n");
>> +			ret = -1;
>> +		}
>>   	}
>>   
>>   	return ret;
>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>> index c32b893..c70ba70 100644
>> --- a/criu/page-pipe.c
>> +++ b/criu/page-pipe.c
>> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>>   {
>>   	int ret;
>>   
>> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
>> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
> Why the size should be changed here?

This is minor hack that I have discussed with Pavel. The situation for 
changing pipe size like this:

Consider process_vm_readv() reads 512 pages in user buffer. All those 
512 pages are vmspliced to

pipe-fd. But vmsplice is able to transfer 511 pages completely and last 
page in fraction of 0.86 each

time for any test case. Cross checking pipe-size reveals its size is 512 
pages. So manipulated actual pipe

size and pipe size that is stored for time being. It will be helpful if 
you could assist me in this.

This is in my #TODO list to debug.

> In any case, this change should be a separate patch.
>
>>   	if (ret < 0)
>>   		return -1;
>>   
>> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>>   	BUG_ON(ret < ppb->pipe_size);
>>   
>>   	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
>> -	ppb->pipe_size = ret;
>> +	ppb->pipe_size = ret - 1;
>>   
>>   	return 0;
>>   }
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index 9cdffd8..a5396b1 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>>   		return PE_PRESENT;
>>   }
>>   
>> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>> +static char userbuf[4 << 20];
>> +
>> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>>   {
>>   	struct page_pipe_buf *ppb;
>>   	unsigned int cur_hole = 0;
>> -	int ret;
>> +	unsigned int ret, ret2;
>> +	size_t off;
>> +	struct iovec *remoteiovs = pp->iovs;
>>   
>>   	pr_debug("Transferring pages:\n");
>>   
>> +	off = *poff;
>> +
>>   	list_for_each_entry(ppb, &pp->bufs, l) {
>>   		unsigned int i;
>>   
>>   		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   
>> +		size_t bufsize = sizeof(userbuf);
>> +		struct iovec bufvec = {.iov_len = bufsize};
>> +		bufvec.iov_base = userbuf;
>> +
>> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
>> +						&remoteiovs[off], ppb->nr_segs, 0);
> I think glibc has a wrapper for __NR_process_vm_readv, why do you need to
> use syscall(__NR_process_vm_readv, ...) here?
Sure. Will change it.
>> +		if (ret == -1) {
>> +			switch (errno) {
>> +				case EINVAL:
>> +					pr_debug("process_vm_readv: Invalid arguments\n");
>> +				break;
>> +				case EFAULT:
>> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
>> +				break;
>> +				case ENOMEM:
>> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
>> +				break;
>> +				case EPERM:
>> +					pr_debug("process_vm_readv: Insufficient privileges\n");
>> +				break;
>> +				case ESRCH:
>> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
>> +				break;
>> +				default:
>> +					pr_debug("process_vm_readv: Uncategorised error\n");
>> +			}
>> +			return 0;
>> +		}
>> +
>> +		/* Handling partial reads due to modified mappings*/
>> +
>> +		if(ret != ppb->pages_in * PAGE_SIZE){
>> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
>> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
>> +			continue;
>> +		}
>> +
>> +		bufvec.iov_len = ret;
>> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +		if(ret2 == -1){
>> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
>> +			continue;
>> +		}
>> +
>> +		if(ret != ret2){
>> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
>> +			continue;
>> +		}
>> +
> As I've said, I believe the whole block around process_vm_readv() +
> vmsplice() should be in pre-dump version of drain_pages().
ACK.
>
>>   		for (i = 0; i < ppb->nr_segs; i++) {
>>   			struct iovec iov = ppb->iov[i];
>>   			u32 flags;
>> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>>   						ppb->p[0], iov.iov_len))
>>   				return -1;
>>   		}
>> +		off += ppb->nr_segs;
>>   	}
>>   
>> +	*poff = off;
>>   	return dump_holes(xfer, pp, &cur_hole, NULL);
>>   }
>>   
>> diff --git a/criu/shmem.c b/criu/shmem.c
>> index 03b088f..a797bde 100644
>> --- a/criu/shmem.c
>> +++ b/criu/shmem.c
>> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>>   	return 0;
>>   }
> I don't think there is much value in doing process_vm_splice() for shared
> memory. We anyway attach to the same shared memory the target processes
> use and simply fill the pipes with it's contents.
>    

So pre-dump handling will vary for non-shared and shared memory.

As noted above, having separate drain_pages_predump() will take care of 
this.

>> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>>   {
>> -	struct page_pipe_buf *ppb;
>> -
>> -	list_for_each_entry(ppb, &pp->bufs, l)
>> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
>> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
>> -				ppb->pages_in * PAGE_SIZE) {
>> -			pr_perror("Can't get shmem into page-pipe");
>> -			return -1;
>> -		}
>> -
>> -	return page_xfer_dump_pages(xfer, pp);
>> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>>   }
>>   
>>   static int next_data_segment(int fd, unsigned long pfn,
>> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>>   	int err, ret = -1;
>>   	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>>   	unsigned long pages[2] = {};
>> +	size_t off = 0;
>>   
>>   	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>>   
>> @@ -726,7 +717,7 @@ again:
>>   		}
>>   
>>   		if (ret == -EAGAIN) {
>> -			ret = dump_pages(pp, &xfer);
>> +			ret = dump_pages(pp, &xfer, &off);
>>   			if (ret)
>>   				goto err_xfer;
>>   			page_pipe_reinit(pp);
>> @@ -742,7 +733,7 @@ again:
>>   	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>>   	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>>   
>> -	ret = dump_pages(pp, &xfer);
>> +	ret = dump_pages(pp, &xfer, &off);
>>   
>>   err_xfer:
>>   	xfer.close(&xfer);
>> -- 
>> 2.7.4
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU at openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
>>
> -Abhishek


More information about the CRIU mailing list