[CRIU] [PATCH 6/7] read mode pre-dump implementation

Thu Aug 29 07:08:52 MSK 2019

On 29/08/19 3:14 AM, Dmitry Safonov wrote:
> On 8/25/19 2:58 AM, Abhishek Dubey wrote:
> [..]
>> +
>> +/*
>> + * Optimized pre-dump algorithm
>> + * ==============================
> [..]
>> + */
> Thanks for the nice long comment!
:)
>
>> +static char userbuf[PIPE_MAX_SIZE << 12];
> Is this a PAGE_SHIFT?

I wish to have a buffer of PIPE_MAX_SIZE pages, where each page is 4KB.

> It probably not what you want on arm64, ppc64 where page_size could be
> different from 4Kb with Large Pages. Furthermore, you use it only in one
> function as a temporary buffer - could we move it from .data section
> into dynamic allocation with malloc()?

is this a correct replacement? architecture independent one?

char* userbuf = malloc(PIPE_MAX_SIZE * PAGE_SIZE);

>
>> +
>> +unsigned long handle_faulty_iov(int pid, struct iovec* riov,
>> +                                unsigned long faulty_index,
>> +                                struct iovec *bufvec, struct iovec* aux_iov,
>> +                                unsigned long* aux_len,
>> +                                unsigned long partial_read_bytes)
>> +{
>> +	/* Handling Case 2*/
>> +	if (riov[faulty_index].iov_len == PAGE_SIZE) {
>> +		cnt_sub(CNT_PAGES_WRITTEN, 1);
>> +		return 0;
>> +	}
>> +
>> +	struct iovec dummy;
>> +	ssize_t bytes_read;
>> +	unsigned long offset = 0;
>> +	unsigned long final_read_cnt = 0;
> Could you move variable declarations in the function begin, please?
Ok, I will do that.
>> +
>> +	/* Handling Case 3-Part 3.2*/
>> +	offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE;
>> +
>> +	dummy.iov_base = riov[faulty_index].iov_base + offset;
>> +	dummy.iov_len = riov[faulty_index].iov_len - offset;
>> +
>> +	if (!partial_read_bytes)
>> +		cnt_sub(CNT_PAGES_WRITTEN, 1);
>> +
>> +	while (dummy.iov_len) {
>> +
>> +		bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);
> Could you check errno please?

In the very first patch I had that check. In the review it was not found 
very useful, so omitted that.

@Andrei @Xemul : how to go about it?

>
>> +
>> +		if(bytes_read == -1) {
>> +			/* Handling faulty page read in faulty iov */
>> +			cnt_sub(CNT_PAGES_WRITTEN, 1);
>> +			dummy.iov_base += PAGE_SIZE;
>> +			dummy.iov_len -= PAGE_SIZE;
>> +			continue;
>> +		}
>> +
>> +		/* If aux-iov can merge and expand or new entry required */
>> +		if (aux_iov[(*aux_len)-1].iov_base +
>> +			aux_iov[(*aux_len)-1].iov_len == dummy.iov_base)
>> +			aux_iov[(*aux_len)-1].iov_len += bytes_read;
>> +		else {
>> +			aux_iov[*aux_len].iov_base = dummy.iov_base;
>> +			aux_iov[*aux_len].iov_len = bytes_read;
>> +			(*aux_len) += 1;
>> +		}
>> +
>> +		dummy.iov_base += bytes_read;
>> +		dummy.iov_len -= bytes_read;
>> +		bufvec->iov_base += bytes_read;
>> +		bufvec->iov_len -= bytes_read;
>> +		final_read_cnt += bytes_read;
>> +	}
>> +
>> +	return final_read_cnt;
>> +}
>> +
>> +/*
>> + * This function will position start pointer to the latest
>> + * successfully read iov in iovec. In case of partial read it
>> + * returns partial_read_bytes, otherwise 0.
>> + */
>> +static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov,
>> +                                 unsigned long *index, struct iovec *aux_iov,
>> +                                 unsigned long *aux_len)
>> +{
>> +	ssize_t processed_bytes = 0;
>> +	unsigned long partial_read_bytes = 0;
>> +
>> +	/* correlating iovs with read bytes */
>> +	while (processed_bytes < bytes_read) {
>> +
>> +		processed_bytes += riov[*index].iov_len;
>> +		aux_iov[*aux_len].iov_base = riov[*index].iov_base;
>> +		aux_iov[*aux_len].iov_len = riov[*index].iov_len;
>> +
>> +		(*aux_len) += 1;
>> +		(*index) += 1;
>> +	}
>> +
>> +	/* handling partially processed faulty iov*/
>> +	if (processed_bytes - bytes_read) {
>> +
>> +		(*index) -= 1;
>> +
>> +		partial_read_bytes = riov[*index].iov_len
>> +					- (processed_bytes - bytes_read);
>> +		aux_iov[*aux_len-1].iov_len = partial_read_bytes;
>> +	}
>> +
>> +	return partial_read_bytes;
>> +}
>> +
>> +/*
>> + * This function iterates over complete ppb->iov entries and pass
>> + * them to process_vm_readv syscall.
>> + *
>> + * Since process_vm_readv returns count of successfully read bytes.
>> + * It does not point to iovec entry associated to last successful
>> + * byte read. The correlation between bytes read and corresponding
>> + * iovec is setup through analyze_iov function.
>> + *
>> + * If all iovecs are not processed in one go, it means there exists
>> + * some faulty iov entry(memory mapping modified after it was grabbed)
>> + * in iovec. process_vm_readv syscall stops at such faulty iov and
>> + * skip processing further any entry in iovec. This is handled by
>> + * handle_faulty_iov function.
>> + */
>> +static long fill_userbuf(int pid, struct page_pipe_buf *ppb,
>> +                                  struct iovec *bufvec,
>> +                                  struct iovec* aux_iov,
>> +                                  unsigned long *aux_len)
>> +{
>> +	struct iovec *riov = ppb->iov;
>> +	ssize_t bytes_read;
>> +	unsigned long total_read = 0;
>> +	unsigned long start = 0;
>> +	unsigned long partial_read_bytes = 0;
>> +
>> +	while (start < ppb->nr_segs) {
>> +
>> +		bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
>> +					           ppb->nr_segs - start, 0);
>> +
>> +		if (bytes_read == -1) {
>> +			/* Handling Case 1*/
>> +			if (riov[start].iov_len == PAGE_SIZE) {
>> +				cnt_sub(CNT_PAGES_WRITTEN, 1);
>> +				start += 1;
>> +				continue;
>> +			} else if (errno == ESRCH) {
>> +				pr_debug("Target process PID:%d not found\n", pid);
>> +				return PR_UNAVIL;
> Hmm, I'm not enjoying special return codes..
> Could we do return -errno please?
> While at it, could you zerofy errno before process_vm_readv() and check
> if it's expected?
Sure! I will do it.
>
>> +			}
>> +		}
>> +
>> +		partial_read_bytes = 0;
>> +
>> +		if (bytes_read > 0) {
>> +			partial_read_bytes = analyze_iov(bytes_read, riov,
>> +							 &start, aux_iov,
>> +							 aux_len);
>> +			bufvec->iov_base += bytes_read;
>> +			bufvec->iov_len -= bytes_read;
>> +			total_read += bytes_read;
>> +		}
>> +
>> +		/*
>> +		 * If all iovs not processed in one go,
>> +		 * it means some iov in between has failed.
>> +		 */
>> +		if (start < ppb->nr_segs)
>> +			total_read += handle_faulty_iov(pid, riov, start, bufvec,
>> +							aux_iov, aux_len,
>> +							partial_read_bytes);
>> +		start += 1;
>> +
>> +	}
>> +
>> +	return total_read;
>> +}
>> +
>> +/*
>> + * This function is similar to page_xfer_dump_pages, instead it uses
>> + * auxiliary_iov array for pagemap generation.
>> + *
>> + * The entries of ppb->iov may mismatch with actual process mappings
>> + * present at time of pre-dump. Such entries need to be adjusted as per
>> + * the pages read by process_vm_readv syscall. These adjusted entries
>> + * along with unmodified entries are present in aux_iov array.
>> + */
>> +
>> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer,
>> +                                     struct page_pipe *pp)
>> +{
>> +	struct page_pipe_buf *ppb;
>> +	unsigned int cur_hole = 0, i;
>> +	unsigned long ret, bytes_read;
>> +	struct iovec bufvec;
>> +
>> +	struct iovec aux_iov[PIPE_MAX_SIZE];
>> +	unsigned long aux_len;
>> +
>> +	list_for_each_entry(ppb, &pp->bufs, l) {
>> +
>> +		aux_len = 0;
>> +		bufvec.iov_len = sizeof(userbuf);
>> +		bufvec.iov_base = userbuf;
>> +
>> +		bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len);
>> +
>> +		if (bytes_read == PR_UNAVIL)
>> +			return -1;
>> +
>> +		bufvec.iov_base = userbuf;
>> +		bufvec.iov_len = bytes_read;
>> +		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +		if (ret == -1 || ret != bytes_read) {
>> +			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
> Probably you want pr_error()

Yes, maybe. Other splicing failure also had pr_error().

What is difference between two?

>
>> +			return -1;
>> +		}
>> +
>> +		/* generating pagemap */
>> +		for (i = 0; i < aux_len; i++) {
>> +
>> +			struct iovec iov = aux_iov[i];
>> +			u32 flags;
>> +
>> +			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
>> +			if (ret)
>> +				return ret;
>> +
>> +			BUG_ON(iov.iov_base < (void *)xfer->offset);
>> +			iov.iov_base -= xfer->offset;
>> +			pr_debug("\t p %p [%u]\n", iov.iov_base,
>> +				(unsigned int)(iov.iov_len / PAGE_SIZE));
>> +
>> +			flags = ppb_xfer_flags(xfer, ppb);
>> +
>> +			if (xfer->write_pagemap(xfer, &iov, flags))
>> +				return -1;
>> +			if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len))
>> +				return -1;
>> +		}
>> +        }
>> +
>> +	return dump_holes(xfer, pp, &cur_hole, NULL);
>> +}
>> +
>> +
>>   int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>>   {
>>   	struct page_pipe_buf *ppb;
>>
> Thanks,
>            Dmitry
-Abhishek