[CRIU] [PATCH 3/6] Drain memory using process_vm_readv syscall
Pavel Emelianov
xemul at virtuozzo.com
Tue Jul 30 16:03:31 MSK 2019
On 7/25/19 4:13 AM, Abhishek Dubey wrote:
> Move page draining out to cr_pre_dump_finish() for the
> pre-dump stage. While the task is frozen, only the iovecs
> are generated; the actual draining of pages happens
> after the task has been unfrozen. Shared memory
> pre-dumping is not modified.
>
> Signed-off-by: Abhishek Dubey <dubeyabhishek777 at gmail.com>
> ---
> criu/cr-dump.c | 2 +-
> criu/include/page-xfer.h | 4 ++
> criu/mem.c | 13 +++-
> criu/page-xfer.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 192 insertions(+), 3 deletions(-)
>
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index e070b8b..4569372 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1513,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
> goto err;
>
> mem_pp = dmpi(item)->mem_pp;
> - ret = page_xfer_dump_pages(&xfer, mem_pp);
> + ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
>
> xfer.close(&xfer);
>
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..c3ad877 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -9,6 +9,9 @@ struct ps_info {
>
> extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
>
> +/* to skip pagemap generation for skipped iovs */
> +#define SKIP_PAGEMAP (void*)0xdeadbeef
> +
> /*
> * page_xfer -- transfer pages into image file.
> * Two images backends are implemented -- local image file
> @@ -48,6 +51,7 @@ struct page_xfer {
> extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
> struct page_pipe;
> extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
> extern int connect_to_page_server_to_send(void);
> extern int connect_to_page_server_to_recv(int epfd);
> extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 5c13690..00c7951 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -488,7 +488,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
> if (mdc->lazy)
> memcpy(pargs_iovs(args), pp->iovs,
> sizeof(struct iovec) * pp->nr_iovs);
> - ret = drain_pages(pp, ctl, args);
> +
> + /*
> + * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
> + * will happen after task unfreezing in cr_pre_dump_finish(). This is
> + * actual optimization which reduces time for which process was frozen
> + * during pre-dump.
> + */
> + if (mdc->pre_dump)
> + ret = 0;
> + else
> + ret = drain_pages(pp, ctl, args);
> +
> if (!ret && !mdc->pre_dump)
> ret = xfer_pages(pp, &xfer);
> if (ret)
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index fe457d2..40ec29e 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -499,16 +499,190 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
> return PE_PRESENT;
> }
>
> +static char userbuf[4 << 20];
> +
> +ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
> +{
> + ssize_t ret;
> +
> + ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
> +
> + /* Target process doesn't exist */
> + if (ret == -1 && errno == ESRCH) {
> + pr_err("Target process with PID %d not found\n", pid);
> + return -2;
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * This function returns the index of that iov in an iovec, which failed to get
> + * processed by process_vm_readv or may be partially processed
> + */
> +unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
> + unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
> +{
> + ssize_t processed_bytes = 0;
> + *iov_cntr = 0;
> + *page_diff = 0;
> +
> + while (processed_bytes < bytes_read && index < riovcnt) {
> +
> + processed_bytes += riov[index].iov_len;
> + (*iov_cntr) += 1;
> + index += 1;
> + }
> +
> + /* one iov is partially read */
> + if (bytes_read < processed_bytes) {
> + *page_diff = processed_bytes - bytes_read;
> + index -= 1;
> + }
> +
> + return index;
> +}
> +
> +long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)
The return value and the bufvec argument seem to duplicate each other, as bufvec.iov_len
is the amount of data read, so is the return value.
> +{
> + struct iovec *riov = ppb->iov;
> + ssize_t bytes_read = 0;
> + unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;
I have a subtle feeling that pro_iovs, start and end are somewhat duplicating each other.
Please explain them, and in particular why start can differ from pro_iovs.
> + unsigned long pages_read = 0, pages_skipped = 0;
> + unsigned long page_diff = 0;
> +
> + bufvec->iov_len = sizeof(userbuf);
> + bufvec->iov_base = userbuf;
> +
> + while (pro_iovs < ppb->nr_segs)
> + {
> + bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
> + ppb->nr_segs - pro_iovs);
> +
> + if (bytes_read == -2)
> + return -2;
This -2 is not handled by the caller.
> +
> + if (bytes_read == -1)
> + {
> + /*
> + * In other errors, adjust page count and mark the page
> + * to be skipped by pagemap generation
> + */
> +
> + cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
> + pages_skipped += riov[start].iov_len/PAGE_SIZE;
> + riov[start].iov_base = SKIP_PAGEMAP;
> +
> + pro_iovs += 1;
> + start += 1;
> + continue;
> + }
> +
> + pages_read += bytes_read/PAGE_SIZE;
> +
> + if (pages_read + pages_skipped == ppb->pages_in)
> + break;
> +
> + end = faulty_iov_index(bytes_read, riov,
> + start, ppb->nr_segs, &iov_cntr, &page_diff);
> +
> + /*
> + * A single iov can be partially read if process_vm_readv hits an
> + * unmapped page somewhere inside the iov's range; we need to
> + * handle this special case
> + */
Please explain this if-else — what does it do? In particular, the two things below:
> + if (!page_diff)
> + {
> + cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
> + pages_skipped += riov[end].iov_len/PAGE_SIZE;
> + riov[end].iov_base = SKIP_PAGEMAP;
... why do we skip the last riov and why do we mess with cnt_sub at all?
> + start = end + 1;
> + pro_iovs += iov_cntr + 1;
> + }
> + else
> + {
> + riov[end].iov_len -= page_diff;
> + cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
> + pages_skipped += page_diff/PAGE_SIZE;
> + start = end + 1;
> + pro_iovs += iov_cntr;
> + }
> +
> + bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
> + bufvec->iov_len -= bytes_read;
> + }
> +
> + return pages_read;
> +}
> +
> +
> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
> +{
> + struct page_pipe_buf *ppb;
> + unsigned int cur_hole = 0, i;
> + unsigned long ret, pages_read;
> + struct iovec bufvec;
> +
> + list_for_each_entry(ppb, &pp->bufs, l) {
> +
> + pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
> +
> + if (pages_read == -1)
> + return -1;
> +
> + bufvec.iov_base = userbuf;
> + bufvec.iov_len = pages_read * PAGE_SIZE;
> + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> + if (ret == -1 || ret != pages_read * PAGE_SIZE) {
> + pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
> + return -1;
> + }
> +
> + /* generating pagemap */
> + for (i = 0; i < ppb->nr_segs; i++) {
> +
> + if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
> + continue;
> +
> + struct iovec iov = ppb->iov[i];
> + u32 flags;
> +
> + ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
> + if (ret)
> + return ret;
> +
> + BUG_ON(iov.iov_base < (void *)xfer->offset);
> + iov.iov_base -= xfer->offset;
> + pr_debug("\tp %p [%u]\n", iov.iov_base,
> + (unsigned int)(iov.iov_len / PAGE_SIZE));
> +
> + flags = ppb_xfer_flags(xfer, ppb);
> +
> + if (xfer->write_pagemap(xfer, &iov, flags))
> + return -1;
> + if ((flags & PE_PRESENT) && xfer->write_pages(xfer,
Wow, if the flags do not have the PE_PRESENT bit set, why do we even get to
reading the pages' contents?
> + ppb->p[0], iov.iov_len))
> + return -1;
> +
> + }
> +
> + }
> +
> + return dump_holes(xfer, pp, &cur_hole, NULL);
> +}
> +
> int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> {
> struct page_pipe_buf *ppb;
> unsigned int cur_hole = 0;
> + unsigned int i;
> int ret;
>
> pr_debug("Transferring pages:\n");
>
> list_for_each_entry(ppb, &pp->bufs, l) {
> - unsigned int i;
This change is cosmetic, please, do not merge it with the rest of the code.
>
> pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>
>
More information about the CRIU
mailing list