[CRIU] [PATCH] vmsplice user-buffer(grabbed by process_vm_readv) to page-pipe

Andrei Vagin avagin at gmail.com
Mon Jul 8 07:05:38 MSK 2019


On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> This patch implements usage of process_vm_readv
> syscall to collect memory pages from target process during
> pre-dump. process_vm_readv collects pages in user-buffer,
> which is later vmspliced to page-pipes.

You have to be more detailed in the commit message.

Frist of all, you have to explain why we need this patch.

If you improve performance, you need to give some numbers.

If you implement a new feature, you need to provide tests for it.

If I understand this patch right, criu freezes all processes, collects
page maps, unfreezes process, dump pages? If this is right, you need to
explain how it handles in this case:

addr = map(NULL, 4 * PAGE_SIZE)	|
addr[0] = 1			|
addr[PAGE_SIZE]  = 2		|
addr[2 * PAGE_SIZE] = 3		|
addr[3 * PAGE_SIZE] = 4		|
				| criu freezes the process
				| criu collects page maps
				| criu unfreezes the process
unmap(addr, PAGE_SIZE * 2)	|
				| criu dumps pages

here are a few inline comments
> 
> Signed-off-by: Abhishek Dubey <dubeyabhishek777 at gmail.com>
> ---
>  criu/cr-dump.c           |  3 +-
>  criu/include/page-xfer.h |  2 +-
>  criu/mem.c               | 88 ++++++++++++++++++------------------------------
>  criu/page-pipe.c         |  4 +--
>  criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>  criu/shmem.c             | 19 +++--------
>  6 files changed, 102 insertions(+), 76 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 7f2e5ed..ee5f4f3 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>  		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>  		struct page_pipe *mem_pp;
>  		struct page_xfer xfer;
> +		size_t off = 0;
>  
>  		if (!ctl)
>  			continue;
> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>  			goto err;
>  
>  		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>  
>  		xfer.close(&xfer);
>  
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..74d42f2 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -47,7 +47,7 @@ struct page_xfer {
>  
>  extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>  struct page_pipe;
> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>  extern int connect_to_page_server_to_send(void);
>  extern int connect_to_page_server_to_recv(int epfd);
>  extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 6a1a87a..844d726 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>  	return args;
>  }
>  
> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
> -		      struct parasite_dump_pages_args *args)
> -{
> -	struct page_pipe_buf *ppb;
> -	int ret = 0;
> -
> -	debug_show_page_pipe(pp);
> -
> -	/* Step 2 -- grab pages into page-pipe */
> -	list_for_each_entry(ppb, &pp->bufs, l) {
> -		args->nr_segs = ppb->nr_segs;
> -		args->nr_pages = ppb->pages_in;
> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
> -
> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
> -		if (ret)
> -			return -1;
> -
> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -
> -		args->off += args->nr_segs;
> -	}
> -
> -	return 0;
> -}
> -
> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
>  {
>  	int ret;
>  
> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>  	 *           pre-dump action (see pre_dump_one_task)
>  	 */
>  	timing_start(TIME_MEMWRITE);
> -	ret = page_xfer_dump_pages(xfer, pp);
> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>  	timing_stop(TIME_MEMWRITE);
>  
>  	return ret;
> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  			     struct page_pipe *pp, struct page_xfer *xfer,
>  			     struct parasite_dump_pages_args *args,
>  			     struct parasite_ctl *ctl, pmc_t *pmc,
> -			     bool has_parent, bool pre_dump)
> +			     bool has_parent, bool pre_dump, size_t *poff)
>  {
>  	u64 off = 0;
>  	u64 *map;
> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  				!vma_area_is(vma, VMA_ANON_SHARED))
>  		return 0;
>  
> +	if (!(vma->e->prot & PROT_READ)){
> +		if(pre_dump)
> +			return 0;
> +		has_parent = false;
> +	}
> +
>  	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>  		if (pre_dump)
>  			return 0;
> @@ -379,9 +353,7 @@ again:
>  	if (ret == -EAGAIN) {
>  		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>  
> -		ret = drain_pages(pp, ctl, args);
> -		if (!ret)
> -			ret = xfer_pages(pp, xfer);
> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>  		if (!ret) {
>  			page_pipe_reinit(pp);
>  			goto again;
> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	unsigned long pmc_size;
>  	int possible_pid_reuse = 0;
>  	bool has_parent;
> +	size_t poff = 0;
>  
>  	pr_info("\n");
>  	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	/*
>  	 * Step 1 -- generate the pagemap
>  	 */
> +	poff = 0;
>  	args->off = 0;
>  	has_parent = !!xfer.parent && !possible_pid_reuse;
>  	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>  		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
> -					&pmc, has_parent, mdc->pre_dump);
> +					&pmc, has_parent, mdc->pre_dump, &poff);
>  		if (ret < 0)
>  			goto out_xfer;
>  	}
> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	if (mdc->lazy)
>  		memcpy(pargs_iovs(args), pp->iovs,
>  		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> -	if (!ret && !mdc->pre_dump)
> -		ret = xfer_pages(pp, &xfer);
> +	if(!mdc->pre_dump)
> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>  	if (ret)
>  		goto out_xfer;
>  
> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  	 *
>  	 * Afterwards -- reprotect memory back.
>  	 */
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = PROT_READ;
> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);

This should be in a separate patch with explanation why we don't need
mprotect in this case.

> +		if (ret) {
> +			pr_err("Can't dump unprotect vmas with parasite\n");
> +			return ret;
> +		}
>  
> -	pargs->add_prot = PROT_READ;
> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> -	if (ret) {
> -		pr_err("Can't dump unprotect vmas with parasite\n");
> -		return ret;
> -	}
> -
> -	if (fault_injected(FI_DUMP_PAGES)) {
> -		pr_err("fault: Dump VMA pages failure!\n");
> -		return -1;
> +		if (fault_injected(FI_DUMP_PAGES)) {
> +			pr_err("fault: Dump VMA pages failure!\n");
> +			return -1;
> +		}
>  	}
>  
>  	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  		return ret;
>  	}
>  
> -	pargs->add_prot = 0;
> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> -		pr_err("Can't rollback unprotected vmas with parasite\n");
> -		ret = -1;
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = 0;
> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> +			pr_err("Can't rollback unprotected vmas with parasite\n");
> +			ret = -1;
> +		}
>  	}
>  
>  	return ret;
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index c32b893..c70ba70 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  {
>  	int ret;
>  
> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
>  	if (ret < 0)
>  		return -1;
>  
> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  	BUG_ON(ret < ppb->pipe_size);
>  
>  	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
> -	ppb->pipe_size = ret;
> +	ppb->pipe_size = ret - 1;
>  
>  	return 0;
>  }
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 9cdffd8..a5396b1 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>  		return PE_PRESENT;
>  }
>  
> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> +static char userbuf[4 << 20];
> +
> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>  {
>  	struct page_pipe_buf *ppb;
>  	unsigned int cur_hole = 0;
> -	int ret;
> +	unsigned int ret, ret2;
> +	size_t off;
> +	struct iovec *remoteiovs = pp->iovs;
>  
>  	pr_debug("Transferring pages:\n");
>  
> +	off = *poff;
> +
>  	list_for_each_entry(ppb, &pp->bufs, l) {
>  		unsigned int i;
>  
>  		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>  
> +		size_t bufsize = sizeof(userbuf);
> +		struct iovec bufvec = {.iov_len = bufsize};
> +		bufvec.iov_base = userbuf;
> +
> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
> +						&remoteiovs[off], ppb->nr_segs, 0);
> +		if (ret == -1) {
> +			switch (errno) {

you don't need this switch. You can use pr_perror of pr_debug("....: %s". strerror(errno));

> +				case EINVAL:
> +					pr_debug("process_vm_readv: Invalid arguments\n");
> +				break;
> +				case EFAULT:
> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
> +				break;
> +				case ENOMEM:
> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
> +				break;
> +				case EPERM:
> +					pr_debug("process_vm_readv: Insufficient privileges\n");
> +				break;
> +				case ESRCH:
> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
> +				break;
> +				default:
> +					pr_debug("process_vm_readv: Uncategorised error\n");
> +			}
> +			return 0;

Why do we return 0 in error cases? You need to add a comment here.

> +		}
> +
> +		/* Handling partial reads due to modified mappings*/
> +
> +		if(ret != ppb->pages_in * PAGE_SIZE){
> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
> +			continue;
> +		}
> +
> +		bufvec.iov_len = ret;
> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if(ret2 == -1){
> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
> +			continue;
> +		}
> +
> +		if(ret != ret2){
> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
> +			continue;
> +		}
> +
>  		for (i = 0; i < ppb->nr_segs; i++) {
>  			struct iovec iov = ppb->iov[i];
>  			u32 flags;
> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  						ppb->p[0], iov.iov_len))
>  				return -1;
>  		}
> +		off += ppb->nr_segs;
>  	}
>  
> +	*poff = off;
>  	return dump_holes(xfer, pp, &cur_hole, NULL);
>  }
>  
> diff --git a/criu/shmem.c b/criu/shmem.c
> index 03b088f..a797bde 100644
> --- a/criu/shmem.c
> +++ b/criu/shmem.c
> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>  	return 0;
>  }
>  
> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>  {
> -	struct page_pipe_buf *ppb;
> -
> -	list_for_each_entry(ppb, &pp->bufs, l)
> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
> -				ppb->pages_in * PAGE_SIZE) {
> -			pr_perror("Can't get shmem into page-pipe");
> -			return -1;
> -		}
> -
> -	return page_xfer_dump_pages(xfer, pp);
> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>  }
>  
>  static int next_data_segment(int fd, unsigned long pfn,
> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>  	int err, ret = -1;
>  	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>  	unsigned long pages[2] = {};
> +	size_t off = 0;
>  
>  	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>  
> @@ -726,7 +717,7 @@ again:
>  		}
>  
>  		if (ret == -EAGAIN) {
> -			ret = dump_pages(pp, &xfer);
> +			ret = dump_pages(pp, &xfer, &off);
>  			if (ret)
>  				goto err_xfer;
>  			page_pipe_reinit(pp);
> @@ -742,7 +733,7 @@ again:
>  	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>  	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>  
> -	ret = dump_pages(pp, &xfer);
> +	ret = dump_pages(pp, &xfer, &off);
>  
>  err_xfer:
>  	xfer.close(&xfer);
> -- 
> 2.7.4
> 


More information about the CRIU mailing list