[CRIU] [PATCH 3/6] Drain memory using process_vm_readv syscall
abhishek dubey
dubeyabhishek777 at gmail.com
Fri Aug 2 10:47:49 MSK 2019
On 30/07/19 6:33 PM, Pavel Emelianov wrote:
> On 7/25/19 4:13 AM, Abhishek Dubey wrote:
>> moving out page_drain to cr_pre_dump_finish for
>> pre-dump stage. During frozen state, only iovecs
>> will be generated and draining of page will happen
>> after the task has been unfrozen. Shared memory
>> pre-dumping is not modified.
>>
>> Signed-off-by: Abhishek Dubey <dubeyabhishek777 at gmail.com>
>> ---
>> criu/cr-dump.c | 2 +-
>> criu/include/page-xfer.h | 4 ++
>> criu/mem.c | 13 +++-
>> criu/page-xfer.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
>> 4 files changed, 192 insertions(+), 3 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index e070b8b..4569372 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1513,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>> goto err;
>>
>> mem_pp = dmpi(item)->mem_pp;
>> - ret = page_xfer_dump_pages(&xfer, mem_pp);
>> + ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
>>
>> xfer.close(&xfer);
>>
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..c3ad877 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -9,6 +9,9 @@ struct ps_info {
>>
>> extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
>>
>> +/* to skip pagemap generation for skipped iovs */
>> +#define SKIP_PAGEMAP (void*)0xdeadbeef
>> +
>> /*
>> * page_xfer -- transfer pages into image file.
>> * Two images backends are implemented -- local image file
>> @@ -48,6 +51,7 @@ struct page_xfer {
>> extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>> struct page_pipe;
>> extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
>> +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
>> extern int connect_to_page_server_to_send(void);
>> extern int connect_to_page_server_to_recv(int epfd);
>> extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 5c13690..00c7951 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -488,7 +488,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>> if (mdc->lazy)
>> memcpy(pargs_iovs(args), pp->iovs,
>> sizeof(struct iovec) * pp->nr_iovs);
>> - ret = drain_pages(pp, ctl, args);
>> +
>> + /*
>> + * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
>> + * will happen after task unfreezing in cr_pre_dump_finish(). This is
>> + * actual optimization which reduces time for which process was frozen
>> + * during pre-dump.
>> + */
>> + if (mdc->pre_dump)
>> + ret = 0;
>> + else
>> + ret = drain_pages(pp, ctl, args);
>> +
>> if (!ret && !mdc->pre_dump)
>> ret = xfer_pages(pp, &xfer);
>> if (ret)
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index fe457d2..40ec29e 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -499,16 +499,190 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>> return PE_PRESENT;
>> }
>>
>> +static char userbuf[4 << 20];
>> +
>> +ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
>> +{
>> + ssize_t ret;
>> +
>> + ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
>> +
>> + /* Target process doesn't exist */
>> + if (ret == -1 && errno == ESRCH) {
>> + pr_err("Target process with PID %d not found\n", pid);
>> + return -2;
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * This function returns the index of that iov in an iovec, which failed to get
>> + * processed by process_vm_readv or may be partially processed
>> + */
>> +unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
>> + unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
>> +{
>> + ssize_t processed_bytes = 0;
>> + *iov_cntr = 0;
>> + *page_diff = 0;
>> +
>> + while (processed_bytes < bytes_read && index < riovcnt) {
>> +
>> + processed_bytes += riov[index].iov_len;
>> + (*iov_cntr) += 1;
>> + index += 1;
>> + }
>> +
>> + /* one iov is partially read */
>> + if (bytes_read < processed_bytes) {
>> + *page_diff = processed_bytes - bytes_read;
>> + index -= 1;
>> + }
>> +
>> + return index;
>> +}
>> +
>> +long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)
> The return value and the bufvec argument seem to duplicate each other, as bufvec.iov_len
> is the amount of data read, so is the return value.
Yes, the two complement each other.
More explanation:
-----------------------
bufvec.iov_len denotes how much of the user buffer is still available
for the next data to be read while the remaining iovecs are processed,
whereas the return value denotes how many bytes have actually been read.

E.g.: the user buffer is 512 pages.
IOVEC: [{addr A, 8192}, {addr B, 4096}]

Processing IOVEC[0]:
bufvec.iov_len = 512 * 4096
bytes_read = 8192
buffer available for further processing = 512 * 4096 - 8192 = 510 * 4096

Processing IOVEC[1]:
bufvec.iov_len = 510 * 4096
total bytes_read = 12288
buffer available for further processing = 510 * 4096 - 4096 = 509 * 4096

So, finally, bufvec.iov_len = 509 * 4096 (509 pages) and the total bytes
read are 12288 (3 pages). The two add up to 512 pages, so they are indeed
complements of each other; I did it this way just to keep things
intuitive.
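To make the arithmetic concrete, here is a minimal standalone sketch of
that bookkeeping (illustrative only, not the patch code; the 512-page
buffer follows the example above, while the patch's userbuf is 4 MB, i.e.
1024 pages). iov_len tracks the capacity still free in the user buffer,
the read sizes accumulate into the bytes actually read, and the two
always add up to the buffer size:

/* sketch: how the free window over userbuf shrinks as reads complete */
#include <stdio.h>
#include <sys/uio.h>

#define PAGE_SIZE 4096UL
#define BUF_PAGES 512UL		/* example size; the patch uses 1024 pages */

static char userbuf[BUF_PAGES * PAGE_SIZE];

/* advance the free window after a successful read of bytes_read bytes */
static void advance_window(struct iovec *bufvec, size_t bytes_read)
{
	bufvec->iov_base = (char *)bufvec->iov_base + bytes_read;
	bufvec->iov_len -= bytes_read;
}

int main(void)
{
	struct iovec bufvec = {
		.iov_base = userbuf,
		.iov_len  = sizeof(userbuf),	/* 512 pages free */
	};

	advance_window(&bufvec, 2 * PAGE_SIZE);	/* IOVEC[0]: 8192 bytes read */
	advance_window(&bufvec, 1 * PAGE_SIZE);	/* IOVEC[1]: 4096 bytes read */

	/* prints: free pages: 509, pages read: 3 */
	printf("free pages: %lu, pages read: %lu\n",
	       bufvec.iov_len / PAGE_SIZE,
	       (sizeof(userbuf) - bufvec.iov_len) / PAGE_SIZE);
	return 0;
}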
>
>> +{
>> + struct iovec *riov = ppb->iov;
>> + ssize_t bytes_read = 0;
>> + unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;
> I have a subtle feeling that pro_iovs, start and end are somewhat duplicating each other.
> Please, explain them and especially the case why start can be not equal to pro_iovs.
I will try to eliminate the redundancy; if that turns out to be
impossible, I will explain the cases that make each of them necessary.
>
>> + unsigned long pages_read = 0, pages_skipped = 0;
>> + unsigned long page_diff = 0;
>> +
>> + bufvec->iov_len = sizeof(userbuf);
>> + bufvec->iov_base = userbuf;
>> +
>> + while (pro_iovs < ppb->nr_segs)
>> + {
>> + bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
>> + ppb->nr_segs - pro_iovs);
>> +
>> + if (bytes_read == -2)
>> + return -2;
> This -2 is not handled by the caller.
Handled now. Thanks :)
>
>> +
>> + if (bytes_read == -1)
>> + {
>> + /*
>> + * In other errors, adjust page count and mark the page
>> + * to be skipped by pagemap generation
>> + */
>> +
>> + cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
>> + pages_skipped += riov[start].iov_len/PAGE_SIZE;
>> + riov[start].iov_base = SKIP_PAGEMAP;
>> +
>> + pro_iovs += 1;
>> + start += 1;
>> + continue;
>> + }
>> +
>> + pages_read += bytes_read/PAGE_SIZE;
>> +
>> + if (pages_read + pages_skipped == ppb->pages_in)
>> + break;
>> +
>> + end = faulty_iov_index(bytes_read, riov,
>> + start, ppb->nr_segs, &iov_cntr, &page_diff);
>> +
>> + /*
>> + * One single iov could be partially read, unless unmapped page in
>> + * iov range is not hit by process_vm_readv, need to handle this
>> + * special case
>> + */
> Please, explain this if-else, what does it do, especially two things below:
I will explain this with an example.
Consider a simple IOVEC: [{addrA, 2 pages}, {addrB, 4 pages}], whose
memory layout is:
<addrA> <addrA + 1*PAGE_SIZE> <addrB> <addrB + 1*PAGE_SIZE>
<addrB + 2*PAGE_SIZE> <addrB + 3*PAGE_SIZE>
While processing this IOVEC with process_vm_readv, the following cases
can arise:

Case 1: #pages read = 2
---------------------------------
The first page of IOVEC[1] is not accessible, i.e. <addrB> was unmapped
or had its permissions changed, so process_vm_readv cannot read anything
at or beyond that address and returns after processing only IOVEC[0].

Case 2: #pages read = 3
---------------------------------
The second page of IOVEC[1] is not accessible, i.e. <addrB + 1*PAGE_SIZE>
cannot be read, but the first page of IOVEC[1] is read along with the two
pages of IOVEC[0]. This is the case of an IOV being partially processed.
The IF part (page_diff == 0) handles Case 1 and the ELSE part handles
Case 2.
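To illustrate, here is a simplified standalone walk that mirrors the
quoted faulty_iov_index() (without the iov_cntr bookkeeping) applied to
the example iovec; the addresses are placeholders. page_diff == 0 means
the faulty iov was not touched at all (Case 1), page_diff > 0 means it
was partially read (Case 2):

#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

#define PAGE_SIZE 4096UL

/* return the index of the iov the read stopped in; page_diff is the
 * number of bytes of that iov which did not make it into the buffer */
static size_t find_faulty(ssize_t bytes_read, struct iovec *riov,
			  size_t index, size_t riovcnt, size_t *page_diff)
{
	ssize_t processed = 0;

	*page_diff = 0;
	while (processed < bytes_read && index < riovcnt)
		processed += riov[index++].iov_len;

	if (bytes_read < processed) {
		/* the last counted iov was only partially read */
		*page_diff = processed - bytes_read;
		index--;
	}
	return index;
}

int main(void)
{
	struct iovec riov[2] = {
		{ .iov_base = (void *)0xA000, .iov_len = 2 * PAGE_SIZE },
		{ .iov_base = (void *)0xB000, .iov_len = 4 * PAGE_SIZE },
	};
	size_t diff, idx;

	/* Case 1: only IOVEC[0] read (2 pages), IOVEC[1] untouched */
	idx = find_faulty(2 * PAGE_SIZE, riov, 0, 2, &diff);
	printf("case 1: faulty iov %zu, page_diff %zu pages -> skip it whole\n",
	       idx, diff / PAGE_SIZE);

	/* Case 2: 3 pages read, so IOVEC[1] was partially read */
	idx = find_faulty(3 * PAGE_SIZE, riov, 0, 2, &diff);
	printf("case 2: faulty iov %zu, page_diff %zu pages -> partially read\n",
	       idx, diff / PAGE_SIZE);
	return 0;
}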
>> + if (!page_diff)
>> + {
>> + cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
>> + pages_skipped += riov[end].iov_len/PAGE_SIZE;
>> + riov[end].iov_base = SKIP_PAGEMAP;
> ... why do we skip the last riov and why do we mess with cnt_sub at all?
Why I skip the last riov:
--------------------------
In the current implementation, if an IOV has its very first page
unreadable (as in Case 1), I simply skip processing that IOV.
This is what Andrei was talking about: some mappings may get modified,
and we need to find the unmodified (readable) part and pre-dump it.
In Case 1, the readable pages may be the 2nd, 3rd and 4th pages of
IOVEC[1], which I do not process at all for now; I just skip the IOV and
move on. In Case 2, the readable pages may be the 3rd and 4th pages of
IOVEC[1], which I also skip for now. A partially read IOV cannot be
skipped outright, so I adjust the page count and move on.
This still needs to be handled properly, and I am thinking about the best
way to process the remaining pages of such faulty IOVs.

Why I mess with cnt_sub:
---------------------------------
CNT_PAGES_WRITTEN is calculated at iovec generation, but the iovecs get
modified after they are generated. Pages that cannot be pre-dumped (the
first page of IOVEC[1] in Case 1) were counted, so the statistic needs to
be adjusted. In the current implementation, since I skip the unprocessed
faulty IOV, I subtract the corresponding number of pages from
CNT_PAGES_WRITTEN. This avoids a page-count mismatch failure in the
pre-dump test-suite run.
This logic will change with the upcoming implementation.
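For completeness, here is a condensed standalone version of that if/else
and the counter adjustment (illustrative only, not the patch code; a
plain pages_written variable stands in for CRIU's CNT_PAGES_WRITTEN
statistic, and the numbers come from the example above):

#include <stdio.h>
#include <sys/uio.h>

#define PAGE_SIZE    4096UL
#define SKIP_PAGEMAP ((void *)0xdeadbeef)

static unsigned long pages_written = 6;	/* counted at iovec generation */

static void account_faulty_iov(struct iovec *riov, size_t end,
			       unsigned long page_diff)
{
	if (!page_diff) {
		/* Case 1: first page of riov[end] unreadable, skip it whole */
		pages_written -= riov[end].iov_len / PAGE_SIZE;
		riov[end].iov_base = SKIP_PAGEMAP;
	} else {
		/* Case 2: riov[end] partially read, keep only the read prefix */
		riov[end].iov_len -= page_diff;
		pages_written -= page_diff / PAGE_SIZE;
	}
}

int main(void)
{
	struct iovec riov[2] = {
		{ .iov_base = (void *)0xA000, .iov_len = 2 * PAGE_SIZE },
		{ .iov_base = (void *)0xB000, .iov_len = 4 * PAGE_SIZE },
	};

	/* Case 2: 3 of the 6 counted pages were read, page_diff is 3 pages */
	account_faulty_iov(riov, 1, 3 * PAGE_SIZE);
	printf("pages still counted: %lu\n", pages_written);	/* 6 - 3 = 3 */
	return 0;
}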
>
>> + start = end + 1;
>> + pro_iovs += iov_cntr + 1;
>> + }
>> + else
>> + {
>> + riov[end].iov_len -= page_diff;
>> + cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
>> + pages_skipped += page_diff/PAGE_SIZE;
>> + start = end + 1;
>> + pro_iovs += iov_cntr;
>> + }
>> +
>> + bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
>> + bufvec->iov_len -= bytes_read;
>> + }
>> +
>> + return pages_read;
>> +}
>> +
>> +
>> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
>> +{
>> + struct page_pipe_buf *ppb;
>> + unsigned int cur_hole = 0, i;
>> + unsigned long ret, pages_read;
>> + struct iovec bufvec;
>> +
>> + list_for_each_entry(ppb, &pp->bufs, l) {
>> +
>> + pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
>> +
>> + if (pages_read == -1)
>> + return -1;
>> +
>> + bufvec.iov_base = userbuf;
>> + bufvec.iov_len = pages_read * PAGE_SIZE;
>> + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> + if (ret == -1 || ret != pages_read * PAGE_SIZE) {
>> + pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
>> + return -1;
>> + }
>> +
>> + /* generating pagemap */
>> + for (i = 0; i < ppb->nr_segs; i++) {
>> +
>> + if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
>> + continue;
>> +
>> + struct iovec iov = ppb->iov[i];
>> + u32 flags;
>> +
>> + ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
>> + if (ret)
>> + return ret;
>> +
>> + BUG_ON(iov.iov_base < (void *)xfer->offset);
>> + iov.iov_base -= xfer->offset;
>> + pr_debug("\tp %p [%u]\n", iov.iov_base,
>> + (unsigned int)(iov.iov_len / PAGE_SIZE));
>> +
>> + flags = ppb_xfer_flags(xfer, ppb);
>> +
>> + if (xfer->write_pagemap(xfer, &iov, flags))
>> + return -1;
>> + if ((flags & PE_PRESENT) && xfer->write_pages(xfer,
> Wow, if the flags does not have the PE_PRESENT bit set, why do we even get to the
> reading the pages contents?
This check was redundant; it was carried over from the original function.
>
>> + ppb->p[0], iov.iov_len))
>> + return -1;
>> +
>> + }
>> +
>> + }
>> +
>> + return dump_holes(xfer, pp, &cur_hole, NULL);
>> +}
>> +
>> int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>> {
>> struct page_pipe_buf *ppb;
>> unsigned int cur_hole = 0;
>> + unsigned int i;
>> int ret;
>>
>> pr_debug("Transferring pages:\n");
>>
>> list_for_each_entry(ppb, &pp->bufs, l) {
>> - unsigned int i;
> This change is cosmetic, please, do not merge it with the rest of the code.
Sure.
>
>>
>> pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>
>>
-Abhishek