[CRIU] [PATCH 6/7] read mode pre-dump implementation
Dmitry Safonov
0x7f454c46 at gmail.com
Thu Aug 29 00:44:03 MSK 2019
On 8/25/19 2:58 AM, Abhishek Dubey wrote:
[..]
> +
> +/*
> + * Optimized pre-dump algorithm
> + * ==============================
[..]
> + */
Thanks for the nice long comment!
> +static char userbuf[PIPE_MAX_SIZE << 12];
Is this a PAGE_SHIFT?
It's probably not what you want on arm64 or ppc64, where the page size
can differ from 4Kb (e.g. with Large Pages). Furthermore, you use it only
in one function as a temporary buffer - could we move it from the .data
section into a dynamic allocation with malloc()?
> +
> +unsigned long handle_faulty_iov(int pid, struct iovec* riov,
> + unsigned long faulty_index,
> + struct iovec *bufvec, struct iovec* aux_iov,
> + unsigned long* aux_len,
> + unsigned long partial_read_bytes)
> +{
> + /* Handling Case 2*/
> + if (riov[faulty_index].iov_len == PAGE_SIZE) {
> + cnt_sub(CNT_PAGES_WRITTEN, 1);
> + return 0;
> + }
> +
> + struct iovec dummy;
> + ssize_t bytes_read;
> + unsigned long offset = 0;
> + unsigned long final_read_cnt = 0;
Could you move the variable declarations to the beginning of the function, please?
> +
> + /* Handling Case 3-Part 3.2*/
> + offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE;
> +
> + dummy.iov_base = riov[faulty_index].iov_base + offset;
> + dummy.iov_len = riov[faulty_index].iov_len - offset;
> +
> + if (!partial_read_bytes)
> + cnt_sub(CNT_PAGES_WRITTEN, 1);
> +
> + while (dummy.iov_len) {
> +
> + bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);
Could you check errno please?
> +
> + if(bytes_read == -1) {
> + /* Handling faulty page read in faulty iov */
> + cnt_sub(CNT_PAGES_WRITTEN, 1);
> + dummy.iov_base += PAGE_SIZE;
> + dummy.iov_len -= PAGE_SIZE;
> + continue;
> + }
> +
> + /* If aux-iov can merge and expand or new entry required */
> + if (aux_iov[(*aux_len)-1].iov_base +
> + aux_iov[(*aux_len)-1].iov_len == dummy.iov_base)
> + aux_iov[(*aux_len)-1].iov_len += bytes_read;
> + else {
> + aux_iov[*aux_len].iov_base = dummy.iov_base;
> + aux_iov[*aux_len].iov_len = bytes_read;
> + (*aux_len) += 1;
> + }
> +
> + dummy.iov_base += bytes_read;
> + dummy.iov_len -= bytes_read;
> + bufvec->iov_base += bytes_read;
> + bufvec->iov_len -= bytes_read;
> + final_read_cnt += bytes_read;
> + }
> +
> + return final_read_cnt;
> +}
> +
> +/*
> + * This function will position start pointer to the latest
> + * successfully read iov in iovec. In case of partial read it
> + * returns partial_read_bytes, otherwise 0.
> + */
> +static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov,
> + unsigned long *index, struct iovec *aux_iov,
> + unsigned long *aux_len)
> +{
> + ssize_t processed_bytes = 0;
> + unsigned long partial_read_bytes = 0;
> +
> + /* correlating iovs with read bytes */
> + while (processed_bytes < bytes_read) {
> +
> + processed_bytes += riov[*index].iov_len;
> + aux_iov[*aux_len].iov_base = riov[*index].iov_base;
> + aux_iov[*aux_len].iov_len = riov[*index].iov_len;
> +
> + (*aux_len) += 1;
> + (*index) += 1;
> + }
> +
> + /* handling partially processed faulty iov*/
> + if (processed_bytes - bytes_read) {
> +
> + (*index) -= 1;
> +
> + partial_read_bytes = riov[*index].iov_len
> + - (processed_bytes - bytes_read);
> + aux_iov[*aux_len-1].iov_len = partial_read_bytes;
> + }
> +
> + return partial_read_bytes;
> +}
> +
> +/*
> + * This function iterates over complete ppb->iov entries and pass
> + * them to process_vm_readv syscall.
> + *
> + * Since process_vm_readv returns count of successfully read bytes.
> + * It does not point to iovec entry associated to last successful
> + * byte read. The correlation between bytes read and corresponding
> + * iovec is setup through analyze_iov function.
> + *
> + * If all iovecs are not processed in one go, it means there exists
> + * some faulty iov entry(memory mapping modified after it was grabbed)
> + * in iovec. process_vm_readv syscall stops at such faulty iov and
> + * skip processing further any entry in iovec. This is handled by
> + * handle_faulty_iov function.
> + */
> +static long fill_userbuf(int pid, struct page_pipe_buf *ppb,
> + struct iovec *bufvec,
> + struct iovec* aux_iov,
> + unsigned long *aux_len)
> +{
> + struct iovec *riov = ppb->iov;
> + ssize_t bytes_read;
> + unsigned long total_read = 0;
> + unsigned long start = 0;
> + unsigned long partial_read_bytes = 0;
> +
> + while (start < ppb->nr_segs) {
> +
> + bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
> + ppb->nr_segs - start, 0);
> +
> + if (bytes_read == -1) {
> + /* Handling Case 1*/
> + if (riov[start].iov_len == PAGE_SIZE) {
> + cnt_sub(CNT_PAGES_WRITTEN, 1);
> + start += 1;
> + continue;
> + } else if (errno == ESRCH) {
> + pr_debug("Target process PID:%d not found\n", pid);
> + return PR_UNAVIL;
Hmm, I'm not enjoying special return codes..
Could we do return -errno please?
While at it, could you zero out errno before process_vm_readv() and check
whether the resulting value is one you expect?
> + }
> + }
> +
> + partial_read_bytes = 0;
> +
> + if (bytes_read > 0) {
> + partial_read_bytes = analyze_iov(bytes_read, riov,
> + &start, aux_iov,
> + aux_len);
> + bufvec->iov_base += bytes_read;
> + bufvec->iov_len -= bytes_read;
> + total_read += bytes_read;
> + }
> +
> + /*
> + * If all iovs not processed in one go,
> + * it means some iov in between has failed.
> + */
> + if (start < ppb->nr_segs)
> + total_read += handle_faulty_iov(pid, riov, start, bufvec,
> + aux_iov, aux_len,
> + partial_read_bytes);
> + start += 1;
> +
> + }
> +
> + return total_read;
> +}
> +
> +/*
> + * This function is similar to page_xfer_dump_pages, instead it uses
> + * auxiliary_iov array for pagemap generation.
> + *
> + * The entries of ppb->iov may mismatch with actual process mappings
> + * present at time of pre-dump. Such entries need to be adjusted as per
> + * the pages read by process_vm_readv syscall. These adjusted entries
> + * along with unmodified entries are present in aux_iov array.
> + */
> +
> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer,
> + struct page_pipe *pp)
> +{
> + struct page_pipe_buf *ppb;
> + unsigned int cur_hole = 0, i;
> + unsigned long ret, bytes_read;
> + struct iovec bufvec;
> +
> + struct iovec aux_iov[PIPE_MAX_SIZE];
> + unsigned long aux_len;
> +
> + list_for_each_entry(ppb, &pp->bufs, l) {
> +
> + aux_len = 0;
> + bufvec.iov_len = sizeof(userbuf);
> + bufvec.iov_base = userbuf;
> +
> + bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len);
> +
> + if (bytes_read == PR_UNAVIL)
> + return -1;
> +
> + bufvec.iov_base = userbuf;
> + bufvec.iov_len = bytes_read;
> + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> + if (ret == -1 || ret != bytes_read) {
> + pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
Probably you want pr_perror()
> + return -1;
> + }
> +
> + /* generating pagemap */
> + for (i = 0; i < aux_len; i++) {
> +
> + struct iovec iov = aux_iov[i];
> + u32 flags;
> +
> + ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
> + if (ret)
> + return ret;
> +
> + BUG_ON(iov.iov_base < (void *)xfer->offset);
> + iov.iov_base -= xfer->offset;
> + pr_debug("\t p %p [%u]\n", iov.iov_base,
> + (unsigned int)(iov.iov_len / PAGE_SIZE));
> +
> + flags = ppb_xfer_flags(xfer, ppb);
> +
> + if (xfer->write_pagemap(xfer, &iov, flags))
> + return -1;
> + if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len))
> + return -1;
> + }
> + }
> +
> + return dump_holes(xfer, pp, &cur_hole, NULL);
> +}
> +
> +
> int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> {
> struct page_pipe_buf *ppb;
>
Thanks,
Dmitry
More information about the CRIU
mailing list