[CRIU] [PATCH 6/7] read mode pre-dump implementation
abhishek dubey
dubeyabhishek777 at gmail.com
Thu Aug 29 07:08:52 MSK 2019
On 29/08/19 3:14 AM, Dmitry Safonov wrote:
> On 8/25/19 2:58 AM, Abhishek Dubey wrote:
> [..]
>> +
>> +/*
>> + * Optimized pre-dump algorithm
>> + * ==============================
> [..]
>> + */
> Thanks for the nice long comment!
:)
>
>> +static char userbuf[PIPE_MAX_SIZE << 12];
> Is this a PAGE_SHIFT?
I wish to have a buffer of PIPE_MAX_SIZE pages, where each page is 4KB.
> It's probably not what you want on arm64, ppc64 where page_size could be
> different from 4Kb with Large Pages. Furthermore, you use it only in one
> function as a temporary buffer - could we move it from .data section
> into dynamic allocation with malloc()?
Is this a correct replacement — an architecture-independent one?
char* userbuf = malloc(PIPE_MAX_SIZE * PAGE_SIZE);
>
>> +
>> +unsigned long handle_faulty_iov(int pid, struct iovec* riov,
>> + unsigned long faulty_index,
>> + struct iovec *bufvec, struct iovec* aux_iov,
>> + unsigned long* aux_len,
>> + unsigned long partial_read_bytes)
>> +{
>> + /* Handling Case 2*/
>> + if (riov[faulty_index].iov_len == PAGE_SIZE) {
>> + cnt_sub(CNT_PAGES_WRITTEN, 1);
>> + return 0;
>> + }
>> +
>> + struct iovec dummy;
>> + ssize_t bytes_read;
>> + unsigned long offset = 0;
>> + unsigned long final_read_cnt = 0;
> Could you move variable declarations in the function begin, please?
Ok, I will do that.
>> +
>> + /* Handling Case 3-Part 3.2*/
>> + offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE;
>> +
>> + dummy.iov_base = riov[faulty_index].iov_base + offset;
>> + dummy.iov_len = riov[faulty_index].iov_len - offset;
>> +
>> + if (!partial_read_bytes)
>> + cnt_sub(CNT_PAGES_WRITTEN, 1);
>> +
>> + while (dummy.iov_len) {
>> +
>> + bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);
> Could you check errno please?
In the very first patch I had that check. In the review it was not found
very useful, so I omitted it.
@Andrei @Xemul : how to go about it?
>
>> +
>> + if(bytes_read == -1) {
>> + /* Handling faulty page read in faulty iov */
>> + cnt_sub(CNT_PAGES_WRITTEN, 1);
>> + dummy.iov_base += PAGE_SIZE;
>> + dummy.iov_len -= PAGE_SIZE;
>> + continue;
>> + }
>> +
>> + /* If aux-iov can merge and expand or new entry required */
>> + if (aux_iov[(*aux_len)-1].iov_base +
>> + aux_iov[(*aux_len)-1].iov_len == dummy.iov_base)
>> + aux_iov[(*aux_len)-1].iov_len += bytes_read;
>> + else {
>> + aux_iov[*aux_len].iov_base = dummy.iov_base;
>> + aux_iov[*aux_len].iov_len = bytes_read;
>> + (*aux_len) += 1;
>> + }
>> +
>> + dummy.iov_base += bytes_read;
>> + dummy.iov_len -= bytes_read;
>> + bufvec->iov_base += bytes_read;
>> + bufvec->iov_len -= bytes_read;
>> + final_read_cnt += bytes_read;
>> + }
>> +
>> + return final_read_cnt;
>> +}
>> +
>> +/*
>> + * This function will position start pointer to the latest
>> + * successfully read iov in iovec. In case of partial read it
>> + * returns partial_read_bytes, otherwise 0.
>> + */
>> +static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov,
>> + unsigned long *index, struct iovec *aux_iov,
>> + unsigned long *aux_len)
>> +{
>> + ssize_t processed_bytes = 0;
>> + unsigned long partial_read_bytes = 0;
>> +
>> + /* correlating iovs with read bytes */
>> + while (processed_bytes < bytes_read) {
>> +
>> + processed_bytes += riov[*index].iov_len;
>> + aux_iov[*aux_len].iov_base = riov[*index].iov_base;
>> + aux_iov[*aux_len].iov_len = riov[*index].iov_len;
>> +
>> + (*aux_len) += 1;
>> + (*index) += 1;
>> + }
>> +
>> + /* handling partially processed faulty iov*/
>> + if (processed_bytes - bytes_read) {
>> +
>> + (*index) -= 1;
>> +
>> + partial_read_bytes = riov[*index].iov_len
>> + - (processed_bytes - bytes_read);
>> + aux_iov[*aux_len-1].iov_len = partial_read_bytes;
>> + }
>> +
>> + return partial_read_bytes;
>> +}
>> +
>> +/*
>> + * This function iterates over complete ppb->iov entries and pass
>> + * them to process_vm_readv syscall.
>> + *
>> + * Since process_vm_readv returns count of successfully read bytes.
>> + * It does not point to iovec entry associated to last successful
>> + * byte read. The correlation between bytes read and corresponding
>> + * iovec is setup through analyze_iov function.
>> + *
>> + * If all iovecs are not processed in one go, it means there exists
>> + * some faulty iov entry(memory mapping modified after it was grabbed)
>> + * in iovec. process_vm_readv syscall stops at such faulty iov and
>> + * skip processing further any entry in iovec. This is handled by
>> + * handle_faulty_iov function.
>> + */
>> +static long fill_userbuf(int pid, struct page_pipe_buf *ppb,
>> + struct iovec *bufvec,
>> + struct iovec* aux_iov,
>> + unsigned long *aux_len)
>> +{
>> + struct iovec *riov = ppb->iov;
>> + ssize_t bytes_read;
>> + unsigned long total_read = 0;
>> + unsigned long start = 0;
>> + unsigned long partial_read_bytes = 0;
>> +
>> + while (start < ppb->nr_segs) {
>> +
>> + bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
>> + ppb->nr_segs - start, 0);
>> +
>> + if (bytes_read == -1) {
>> + /* Handling Case 1*/
>> + if (riov[start].iov_len == PAGE_SIZE) {
>> + cnt_sub(CNT_PAGES_WRITTEN, 1);
>> + start += 1;
>> + continue;
>> + } else if (errno == ESRCH) {
>> + pr_debug("Target process PID:%d not found\n", pid);
>> + return PR_UNAVIL;
> Hmm, I'm not enjoying special return codes..
> Could we do return -errno please?
> While at it, could you zerofy errno before process_vm_readv() and check
> if it's expected?
Sure! I will do it.
>
>> + }
>> + }
>> +
>> + partial_read_bytes = 0;
>> +
>> + if (bytes_read > 0) {
>> + partial_read_bytes = analyze_iov(bytes_read, riov,
>> + &start, aux_iov,
>> + aux_len);
>> + bufvec->iov_base += bytes_read;
>> + bufvec->iov_len -= bytes_read;
>> + total_read += bytes_read;
>> + }
>> +
>> + /*
>> + * If all iovs not processed in one go,
>> + * it means some iov in between has failed.
>> + */
>> + if (start < ppb->nr_segs)
>> + total_read += handle_faulty_iov(pid, riov, start, bufvec,
>> + aux_iov, aux_len,
>> + partial_read_bytes);
>> + start += 1;
>> +
>> + }
>> +
>> + return total_read;
>> +}
>> +
>> +/*
>> + * This function is similar to page_xfer_dump_pages, instead it uses
>> + * auxiliary_iov array for pagemap generation.
>> + *
>> + * The entries of ppb->iov may mismatch with actual process mappings
>> + * present at time of pre-dump. Such entries need to be adjusted as per
>> + * the pages read by process_vm_readv syscall. These adjusted entries
>> + * along with unmodified entries are present in aux_iov array.
>> + */
>> +
>> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer,
>> + struct page_pipe *pp)
>> +{
>> + struct page_pipe_buf *ppb;
>> + unsigned int cur_hole = 0, i;
>> + unsigned long ret, bytes_read;
>> + struct iovec bufvec;
>> +
>> + struct iovec aux_iov[PIPE_MAX_SIZE];
>> + unsigned long aux_len;
>> +
>> + list_for_each_entry(ppb, &pp->bufs, l) {
>> +
>> + aux_len = 0;
>> + bufvec.iov_len = sizeof(userbuf);
>> + bufvec.iov_base = userbuf;
>> +
>> + bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len);
>> +
>> + if (bytes_read == PR_UNAVIL)
>> + return -1;
>> +
>> + bufvec.iov_base = userbuf;
>> + bufvec.iov_len = bytes_read;
>> + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> + if (ret == -1 || ret != bytes_read) {
>> + pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
> Probably you want pr_error()
Yes, maybe. Other splicing failures also had pr_error().
What is the difference between the two?
>
>> + return -1;
>> + }
>> +
>> + /* generating pagemap */
>> + for (i = 0; i < aux_len; i++) {
>> +
>> + struct iovec iov = aux_iov[i];
>> + u32 flags;
>> +
>> + ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
>> + if (ret)
>> + return ret;
>> +
>> + BUG_ON(iov.iov_base < (void *)xfer->offset);
>> + iov.iov_base -= xfer->offset;
>> + pr_debug("\t p %p [%u]\n", iov.iov_base,
>> + (unsigned int)(iov.iov_len / PAGE_SIZE));
>> +
>> + flags = ppb_xfer_flags(xfer, ppb);
>> +
>> + if (xfer->write_pagemap(xfer, &iov, flags))
>> + return -1;
>> + if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len))
>> + return -1;
>> + }
>> + }
>> +
>> + return dump_holes(xfer, pp, &cur_hole, NULL);
>> +}
>> +
>> +
>> int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>> {
>> struct page_pipe_buf *ppb;
>>
> Thanks,
> Dmitry
-Abhishek
More information about the CRIU
mailing list