[CRIU] [PATCH] vmsplice user-buffer(grabbed by process_vm_readv) to page-pipe

Mike Rapoport rppt at linux.ibm.com
Tue Jul 9 11:04:47 MSK 2019


Hi Abishek,

On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> This patch implements usage of process_vm_readv
> syscall to collect memory pages from target process during
> pre-dump. process_vm_readv collects pages in user-buffer,
> which is later vmspliced to page-pipes.

>From what I can tell, the patch completely replaces splicing of the memory
in the parasite code with usage of process_vm_readv() for both pre-dump and
dump.

I don't think that for dump that approach will be better then what we have
now and surely it'll be more expensive.

> Signed-off-by: Abhishek Dubey <dubeyabhishek777 at gmail.com>
> ---
>  criu/cr-dump.c           |  3 +-
>  criu/include/page-xfer.h |  2 +-
>  criu/mem.c               | 88 ++++++++++++++++++------------------------------
>  criu/page-pipe.c         |  4 +--
>  criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>  criu/shmem.c             | 19 +++--------
>  6 files changed, 102 insertions(+), 76 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 7f2e5ed..ee5f4f3 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>  		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>  		struct page_pipe *mem_pp;
>  		struct page_xfer xfer;
> +		size_t off = 0;
>  
>  		if (!ctl)
>  			continue;
> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>  			goto err;
>  
>  		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>  
>  		xfer.close(&xfer);
>  
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..74d42f2 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -47,7 +47,7 @@ struct page_xfer {
>  
>  extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>  struct page_pipe;
> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>  extern int connect_to_page_server_to_send(void);
>  extern int connect_to_page_server_to_recv(int epfd);
>  extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 6a1a87a..844d726 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>  	return args;
>  }
>  
> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
> -		      struct parasite_dump_pages_args *args)
> -{
> -	struct page_pipe_buf *ppb;
> -	int ret = 0;
> -
> -	debug_show_page_pipe(pp);
> -
> -	/* Step 2 -- grab pages into page-pipe */
> -	list_for_each_entry(ppb, &pp->bufs, l) {
> -		args->nr_segs = ppb->nr_segs;
> -		args->nr_pages = ppb->pages_in;
> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
> -
> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
> -		if (ret)
> -			return -1;
> -
> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -
> -		args->off += args->nr_segs;
> -	}
> -
> -	return 0;
> -}

The drain_pages() should remain as is for the dump case, the target
processes are anyway frozen and we have the parasite code there. I see no
point in replacing vmsplice() in the parasite with process_vm_readv() +
vmsplice() in this case.

Actually, I think that to use process_vm_readv() for the pre-dump case it
would be enough to override drain_pages() so we'll have something like
drain_pages_dump() that does vmsplice() via parasite and
drain_pages_pre_dump() that performs process_vm_readv() to a buffer and
then splices that buffer to the pipes.

> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)

Radostin already mentioned a couple of coding style issues, I'd suggest to
use checkpatch.pl from Linux sources to verify the next patches.

>  {
>  	int ret;
>  
> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>  	 *           pre-dump action (see pre_dump_one_task)
>  	 */
>  	timing_start(TIME_MEMWRITE);
> -	ret = page_xfer_dump_pages(xfer, pp);
> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>  	timing_stop(TIME_MEMWRITE);
>  
>  	return ret;
> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  			     struct page_pipe *pp, struct page_xfer *xfer,
>  			     struct parasite_dump_pages_args *args,
>  			     struct parasite_ctl *ctl, pmc_t *pmc,
> -			     bool has_parent, bool pre_dump)
> +			     bool has_parent, bool pre_dump, size_t *poff)
>  {
>  	u64 off = 0;
>  	u64 *map;
> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  				!vma_area_is(vma, VMA_ANON_SHARED))
>  		return 0;
>  
> +	if (!(vma->e->prot & PROT_READ)){
> +		if(pre_dump)
> +			return 0;
> +		has_parent = false;
> +	}
> +
>  	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>  		if (pre_dump)
>  			return 0;
> @@ -379,9 +353,7 @@ again:
>  	if (ret == -EAGAIN) {
>  		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>  
> -		ret = drain_pages(pp, ctl, args);
> -		if (!ret)
> -			ret = xfer_pages(pp, xfer);
> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>  		if (!ret) {
>  			page_pipe_reinit(pp);
>  			goto again;
> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	unsigned long pmc_size;
>  	int possible_pid_reuse = 0;
>  	bool has_parent;
> +	size_t poff = 0;
>  
>  	pr_info("\n");
>  	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	/*
>  	 * Step 1 -- generate the pagemap
>  	 */
> +	poff = 0;
>  	args->off = 0;
>  	has_parent = !!xfer.parent && !possible_pid_reuse;
>  	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>  		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
> -					&pmc, has_parent, mdc->pre_dump);
> +					&pmc, has_parent, mdc->pre_dump, &poff);
>  		if (ret < 0)
>  			goto out_xfer;
>  	}
> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	if (mdc->lazy)
>  		memcpy(pargs_iovs(args), pp->iovs,
>  		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> -	if (!ret && !mdc->pre_dump)
> -		ret = xfer_pages(pp, &xfer);
> +	if(!mdc->pre_dump)
> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>  	if (ret)
>  		goto out_xfer;
>  
> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  	 *
>  	 * Afterwards -- reprotect memory back.
>  	 */
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = PROT_READ;
> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> +		if (ret) {
> +			pr_err("Can't dump unprotect vmas with parasite\n");
> +			return ret;
> +		}
>  
> -	pargs->add_prot = PROT_READ;
> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> -	if (ret) {
> -		pr_err("Can't dump unprotect vmas with parasite\n");
> -		return ret;
> -	}
> -
> -	if (fault_injected(FI_DUMP_PAGES)) {
> -		pr_err("fault: Dump VMA pages failure!\n");
> -		return -1;
> +		if (fault_injected(FI_DUMP_PAGES)) {
> +			pr_err("fault: Dump VMA pages failure!\n");
> +			return -1;
> +		}
>  	}
>  
>  	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  		return ret;
>  	}
>  
> -	pargs->add_prot = 0;
> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> -		pr_err("Can't rollback unprotected vmas with parasite\n");
> -		ret = -1;
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = 0;
> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> +			pr_err("Can't rollback unprotected vmas with parasite\n");
> +			ret = -1;
> +		}
>  	}
>  
>  	return ret;
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index c32b893..c70ba70 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  {
>  	int ret;
>  
> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);

Why the size should be changed here?
In any case, this change should be a separate patch.

>  	if (ret < 0)
>  		return -1;
>  
> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  	BUG_ON(ret < ppb->pipe_size);
>  
>  	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
> -	ppb->pipe_size = ret;
> +	ppb->pipe_size = ret - 1;
>  
>  	return 0;
>  }
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 9cdffd8..a5396b1 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>  		return PE_PRESENT;
>  }
>  
> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> +static char userbuf[4 << 20];
> +
> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>  {
>  	struct page_pipe_buf *ppb;
>  	unsigned int cur_hole = 0;
> -	int ret;
> +	unsigned int ret, ret2;
> +	size_t off;
> +	struct iovec *remoteiovs = pp->iovs;
>  
>  	pr_debug("Transferring pages:\n");
>  
> +	off = *poff;
> +
>  	list_for_each_entry(ppb, &pp->bufs, l) {
>  		unsigned int i;
>  
>  		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>  
> +		size_t bufsize = sizeof(userbuf);
> +		struct iovec bufvec = {.iov_len = bufsize};
> +		bufvec.iov_base = userbuf;
> +
> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
> +						&remoteiovs[off], ppb->nr_segs, 0);

I think glibc has a wrapper for __NR_process_vm_readv, why do you need to
use syscall(__NR_process_vm_readv, ...) here?

> +		if (ret == -1) {
> +			switch (errno) {
> +				case EINVAL:
> +					pr_debug("process_vm_readv: Invalid arguments\n");
> +				break;
> +				case EFAULT:
> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
> +				break;
> +				case ENOMEM:
> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
> +				break;
> +				case EPERM:
> +					pr_debug("process_vm_readv: Insufficient privileges\n");
> +				break;
> +				case ESRCH:
> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
> +				break;
> +				default:
> +					pr_debug("process_vm_readv: Uncategorised error\n");
> +			}
> +			return 0;
> +		}
> +
> +		/* Handling partial reads due to modified mappings*/
> +
> +		if(ret != ppb->pages_in * PAGE_SIZE){
> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
> +			continue;
> +		}
> +
> +		bufvec.iov_len = ret;
> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if(ret2 == -1){
> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
> +			continue;
> +		}
> +
> +		if(ret != ret2){
> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
> +			continue;
> +		}
> +

As I've said, I believe the whole block around process_vm_readv() +
vmsplice() should be in pre-dump version of drain_pages().

>  		for (i = 0; i < ppb->nr_segs; i++) {
>  			struct iovec iov = ppb->iov[i];
>  			u32 flags;
> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  						ppb->p[0], iov.iov_len))
>  				return -1;
>  		}
> +		off += ppb->nr_segs;
>  	}
>  
> +	*poff = off;
>  	return dump_holes(xfer, pp, &cur_hole, NULL);
>  }
>  
> diff --git a/criu/shmem.c b/criu/shmem.c
> index 03b088f..a797bde 100644
> --- a/criu/shmem.c
> +++ b/criu/shmem.c
> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>  	return 0;
>  }

I don't think there is much value in doing process_vm_splice() for shared
memory. We anyway attach to the same shared memory the target processes
use and simply fill the pipes with it's contents.
  
> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>  {
> -	struct page_pipe_buf *ppb;
> -
> -	list_for_each_entry(ppb, &pp->bufs, l)
> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
> -				ppb->pages_in * PAGE_SIZE) {
> -			pr_perror("Can't get shmem into page-pipe");
> -			return -1;
> -		}
> -
> -	return page_xfer_dump_pages(xfer, pp);
> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>  }
>  
>  static int next_data_segment(int fd, unsigned long pfn,
> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>  	int err, ret = -1;
>  	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>  	unsigned long pages[2] = {};
> +	size_t off = 0;
>  
>  	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>  
> @@ -726,7 +717,7 @@ again:
>  		}
>  
>  		if (ret == -EAGAIN) {
> -			ret = dump_pages(pp, &xfer);
> +			ret = dump_pages(pp, &xfer, &off);
>  			if (ret)
>  				goto err_xfer;
>  			page_pipe_reinit(pp);
> @@ -742,7 +733,7 @@ again:
>  	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>  	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>  
> -	ret = dump_pages(pp, &xfer);
> +	ret = dump_pages(pp, &xfer, &off);
>  
>  err_xfer:
>  	xfer.close(&xfer);
> -- 
> 2.7.4
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
> 

-- 
Sincerely yours,
Mike.



More information about the CRIU mailing list