[CRIU] [PATCH] vmsplice user-buffer(grabbed by process_vm_readv) to page-pipe

abhishek dubey dubeyabhishek777 at gmail.com
Mon Jul 8 10:09:12 MSK 2019


Hi Radostin,

Please find inline replies.

On 07/07/19 2:35 AM, Radostin Stoyanov wrote:
> Hi Abhishek,
>
>
> On 05/07/2019 13:04, Abhishek Dubey wrote:
>> This patch implements usage of process_vm_readv
>> syscall to collect memory pages from target process during
>> pre-dump. process_vm_readv collects pages in user-buffer,
>> which is later vmspliced to page-pipes.
> It might be useful to include some performance information about the 
> changes in this patch (e.g. you can use "-v0 --display-stats").
Sure I will do that.
>
>>
>> Signed-off-by: Abhishek Dubey<dubeyabhishek777 at gmail.com>
>> ---
>>   criu/cr-dump.c           |  3 +-
>>   criu/include/page-xfer.h |  2 +-
>>   criu/mem.c               | 88 
>> ++++++++++++++++++------------------------------
>>   criu/page-pipe.c         |  4 +--
>>   criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>>   criu/shmem.c             | 19 +++--------
>>   6 files changed, 102 insertions(+), 76 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 7f2e5ed..ee5f4f3 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>>           struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>>           struct page_pipe *mem_pp;
>>           struct page_xfer xfer;
>> +        size_t off = 0;
>>             if (!ctl)
>>               continue;
>> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>               goto err;
>>             mem_pp = dmpi(item)->mem_pp;
>> -        ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +        ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, 
>> &off);
>>             xfer.close(&xfer);
>>   diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..74d42f2 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -47,7 +47,7 @@ struct page_xfer {
>>     extern int open_page_xfer(struct page_xfer *xfer, int fd_type, 
>> unsigned long id);
>>   struct page_pipe;
>> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe 
>> *);
>> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct 
>> page_pipe *, size_t *poff);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 6a1a87a..844d726 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args 
>> *prep_dump_pages_args(struct parasite_ctl
>>       return args;
>>   }
>>   -static int drain_pages(struct page_pipe *pp, struct parasite_ctl 
>> *ctl,
>> -              struct parasite_dump_pages_args *args)
>> -{
>> -    struct page_pipe_buf *ppb;
>> -    int ret = 0;
>> -
>> -    debug_show_page_pipe(pp);
>> -
>> -    /* Step 2 -- grab pages into page-pipe */
>> -    list_for_each_entry(ppb, &pp->bufs, l) {
>> -        args->nr_segs = ppb->nr_segs;
>> -        args->nr_pages = ppb->pages_in;
>> -        pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
>> -                args->nr_pages, args->nr_segs, ppb->pipe_size, 
>> args->off);
>> -
>> -        ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
>> -        if (ret < 0)
>> -            return -1;
>> -        ret = compel_util_send_fd(ctl, ppb->p[1]);
>> -        if (ret)
>> -            return -1;
>> -
>> -        ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
>> -        if (ret < 0)
>> -            return -1;
>> -
>> -        args->off += args->nr_segs;
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int xfer_pages( int pid, struct page_pipe *pp, struct 
>> page_xfer *xfer, size_t *poff)
> please remove the space before "int pid"
OK.
>>   {
>>       int ret;
> could you please update the comment in this function ("Step 3" should 
> be "Step 2")?
Yes, I will update it.
>
>>   @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, 
>> struct page_xfer *xfer)
>>        *           pre-dump action (see pre_dump_one_task)
>>        */
>>       timing_start(TIME_MEMWRITE);
>> -    ret = page_xfer_dump_pages(xfer, pp);
>> +    ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>>       timing_stop(TIME_MEMWRITE);
>>         return ret;
>> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item 
>> *item, struct vma_area *vma,
>>                    struct page_pipe *pp, struct page_xfer *xfer,
>>                    struct parasite_dump_pages_args *args,
>>                    struct parasite_ctl *ctl, pmc_t *pmc,
>> -                 bool has_parent, bool pre_dump)
>> +                 bool has_parent, bool pre_dump, size_t *poff)
>>   {
>>       u64 off = 0;
>>       u64 *map;
>> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item 
>> *item, struct vma_area *vma,
>>                   !vma_area_is(vma, VMA_ANON_SHARED))
>>           return 0;
>>   +    if (!(vma->e->prot & PROT_READ)){
>> +        if(pre_dump)
>> +            return 0;
>> +        has_parent = false;
>> +    }
>> +
>>       if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>>           if (pre_dump)
>>               return 0;
>> @@ -379,9 +353,7 @@ again:
>>       if (ret == -EAGAIN) {
>>           BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>>   -        ret = drain_pages(pp, ctl, args);
>> -        if (!ret)
>> -            ret = xfer_pages(pp, xfer);
>> +        ret = xfer_pages(item->pid->real, pp, xfer, poff);
>>           if (!ret) {
>>               page_pipe_reinit(pp);
>>               goto again;
>> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>       unsigned long pmc_size;
>>       int possible_pid_reuse = 0;
>>       bool has_parent;
>> +    size_t poff = 0;
>>         pr_info("\n");
>>       pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, 
>> item->pid->real);
>> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>       /*
>>        * Step 1 -- generate the pagemap
>>        */
>> +    poff = 0;
> poff has been initialised to 0, it seems unnecessary to set this value 
> again?
I will remove it.
>>       args->off = 0;
>>       has_parent = !!xfer.parent && !possible_pid_reuse;
>>       list_for_each_entry(vma_area, &vma_area_list->h, list) {
>>           ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
>> -                    &pmc, has_parent, mdc->pre_dump);
>> +                    &pmc, has_parent, mdc->pre_dump, &poff);
>>           if (ret < 0)
>>               goto out_xfer;
>>       }
>> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>       if (mdc->lazy)
>>           memcpy(pargs_iovs(args), pp->iovs,
>>                  sizeof(struct iovec) * pp->nr_iovs);
>> -    ret = drain_pages(pp, ctl, args);
>> -    if (!ret && !mdc->pre_dump)
>> -        ret = xfer_pages(pp, &xfer);
>> +    if(!mdc->pre_dump)
> please add a space after 'if'
I have missed same at many places. Will handle it.
>> +        ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>>       if (ret)
>>           goto out_xfer;
>>   @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>        *
>>        * Afterwards -- reprotect memory back.
>>        */
>> +    if(!mdc->pre_dump){
> again here, and a space before the curly brace
Sure!
>> +        pargs->add_prot = PROT_READ;
>> +        ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> +        if (ret) {
>> +            pr_err("Can't dump unprotect vmas with parasite\n");
>> +            return ret;
>> +        }
>>   -    pargs->add_prot = PROT_READ;
>> -    ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> -    if (ret) {
>> -        pr_err("Can't dump unprotect vmas with parasite\n");
>> -        return ret;
>> -    }
>> -
>> -    if (fault_injected(FI_DUMP_PAGES)) {
>> -        pr_err("fault: Dump VMA pages failure!\n");
>> -        return -1;
>> +        if (fault_injected(FI_DUMP_PAGES)) {
>> +            pr_err("fault: Dump VMA pages failure!\n");
>> +            return -1;
>> +        }
>>       }
>>         ret = __parasite_dump_pages_seized(item, pargs, 
>> vma_area_list, mdc, ctl);
>> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>           return ret;
>>       }
>>   -    pargs->add_prot = 0;
>> -    if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> -        pr_err("Can't rollback unprotected vmas with parasite\n");
>> -        ret = -1;
>> +    if(!mdc->pre_dump){
> ditto
ACK.
>> +        pargs->add_prot = 0;
>> +        if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> +            pr_err("Can't rollback unprotected vmas with parasite\n");
>> +            ret = -1;
>> +        }
>>       }
>>         return ret;
>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>> index c32b893..c70ba70 100644
>> --- a/criu/page-pipe.c
>> +++ b/criu/page-pipe.c
>> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf 
>> *ppb, unsigned long new_size)
>>   {
>>       int ret;
>>   -    ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
>> +    ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
>>       if (ret < 0)
>>           return -1;
>>   @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf 
>> *ppb, unsigned long new_size)
>>       BUG_ON(ret < ppb->pipe_size);
>>         pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
>> -    ppb->pipe_size = ret;
>> +    ppb->pipe_size = ret - 1;
>>         return 0;
>>   }
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index 9cdffd8..a5396b1 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct 
>> page_xfer *xfer, struct page_pipe_buf *p
>>           return PE_PRESENT;
>>   }
>>   -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe 
>> *pp)
>> +static char userbuf[4 << 20];
> maybe the buffer size should be (PIPE_MAX_SIZE * PAGE_SIZE)?

In initial discussion, to keep things simple we decided on fixed size 
buffer of 4MB, since PIPE_MAX_SIZE is defined using multiple ingredients.

>> +
>> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct 
>> page_pipe *pp, size_t *poff)
>>   {
>>       struct page_pipe_buf *ppb;
>>       unsigned int cur_hole = 0;
>> -    int ret;
>> +    unsigned int ret, ret2;
>> +    size_t off;
>> +    struct iovec *remoteiovs = pp->iovs;
>>         pr_debug("Transferring pages:\n");
>>   +    off = *poff;
>> +
>>       list_for_each_entry(ppb, &pp->bufs, l) {
>>           unsigned int i;
>>             pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   +        size_t bufsize = sizeof(userbuf);
>> +        struct iovec bufvec = {.iov_len = bufsize};
>> +        bufvec.iov_base = userbuf;
>> +
>> +        ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
>> +                        &remoteiovs[off], ppb->nr_segs, 0);
>> +        if (ret == -1) {
>> +            switch (errno) {
> you can use pr_perror() to print a string representation of errno

I need to handle process_vm_readv() errors gracefully, avoiding 
immediate termination.

If not pr_debug, may be pr_info could be better substitute.

>> +                case EINVAL:
>> +                    pr_debug("process_vm_readv: Invalid arguments\n");
>> +                break;
>> +                case EFAULT:
>> +                    pr_debug("process_vm_readv: Unable to access 
>> remote iov\n");
>> +                break;
>> +                case ENOMEM:
>> +                    pr_debug("process_vm_readv: Unable to allocate 
>> memory\n");
>> +                break;
>> +                case EPERM:
>> +                    pr_debug("process_vm_readv: Insufficient 
>> privileges\n");
>> +                break;
>> +                case ESRCH:
>> +                    pr_debug("process_vm_readv: Target process 
>> doesn't exist\n");
>> +                break;
>> +                default:
>> +                    pr_debug("process_vm_readv: Uncategorised 
>> error\n");
>> +            }
>> +            return 0;
> why return 0 here? maybe we should exit with an error if 
> process_vm_readv() fails?
Since, target process once unfrozen could modify the mappings as against 
to what was collected initially in collect_mappings. For transition test 
cases, syscall is expected to fail and there comes graceful handling. In 
graceful handling, for now, we are just skipping the complete iovec 
processing and moving to next iovec or if "process is not found", moving 
to next process in pstree.
>
>> +        }
>> +
>> +        /* Handling partial reads due to modified mappings*/
>> +
>> +        if(ret != ppb->pages_in * PAGE_SIZE){
> please add a space after 'if' and before '{'
Sure.
>> +            pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
>> +                (unsigned long int)PAGE_SIZE * ppb->pages_in, 
>> ppb->pages_in);
>> +            continue;
>> +        }
>> +
>> +        bufvec.iov_len = ret;
>> +        ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +        if(ret2 == -1){
> ditto
OK
>> +            pr_debug("vmsplice: Failed to splice user buffer to 
>> pipe\n");
>> +            continue;
>> +        }
>> +
>> +        if(ret != ret2){
> OK
>> +            pr_debug("Partial splice from user buffer to pipe 
>> (%d)\n", ret2);
>> +            continue;
>> +        }
>> +
>>           for (i = 0; i < ppb->nr_segs; i++) {
>>               struct iovec iov = ppb->iov[i];
>>               u32 flags;
>> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, 
>> struct page_pipe *pp)
>>                           ppb->p[0], iov.iov_len))
>>                   return -1;
>>           }
>> +        off += ppb->nr_segs;
>>       }
>>   +    *poff = off;
>>       return dump_holes(xfer, pp, &cur_hole, NULL);
>>   }
>>   diff --git a/criu/shmem.c b/criu/shmem.c
>> index 03b088f..a797bde 100644
>> --- a/criu/shmem.c
>> +++ b/criu/shmem.c
>> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 
>> *map)
>>       return 0;
>>   }
>>   -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, 
>> size_t *off)
>>   {
>> -    struct page_pipe_buf *ppb;
>> -
>> -    list_for_each_entry(ppb, &pp->bufs, l)
>> -        if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
>> -                    SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
>> -                ppb->pages_in * PAGE_SIZE) {
>> -            pr_perror("Can't get shmem into page-pipe");
>> -            return -1;
>> -        }
>> -
>> -    return page_xfer_dump_pages(xfer, pp);
>> +    return page_xfer_dump_pages(getpid(), xfer, pp, off);
>>   }
>>     static int next_data_segment(int fd, unsigned long pfn,
>> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, 
>> struct shmem_info *si)
>>       int err, ret = -1;
>>       unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>>       unsigned long pages[2] = {};
>> +    size_t off = 0;
>>         nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>>   @@ -726,7 +717,7 @@ again:
>>           }
>>             if (ret == -EAGAIN) {
>> -            ret = dump_pages(pp, &xfer);
>> +            ret = dump_pages(pp, &xfer, &off);
>>               if (ret)
>>                   goto err_xfer;
>>               page_pipe_reinit(pp);
>> @@ -742,7 +733,7 @@ again:
>>       cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>>       cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>>   -    ret = dump_pages(pp, &xfer);
>> +    ret = dump_pages(pp, &xfer, &off);
>>     err_xfer:
>>       xfer.close(&xfer);
-Abhishek


More information about the CRIU mailing list