[CRIU] [PATCH 6/7] read mode pre-dump implementation

Dmitry Safonov 0x7f454c46 at gmail.com
Thu Aug 29 00:44:03 MSK 2019


On 8/25/19 2:58 AM, Abhishek Dubey wrote:
[..]
> +
> +/*
> + * Optimized pre-dump algorithm
> + * ==============================
[..]
> + */

Thanks for the nice long comment!

> +static char userbuf[PIPE_MAX_SIZE << 12];

Is this meant to be PAGE_SHIFT?
If so, it's probably not what you want on arm64 or ppc64, where the
page size can differ from 4Kb (64Kb base pages, for instance).
Furthermore, you use the buffer only in one function, as temporary
storage - could we move it out of the .data section and allocate it
dynamically with malloc()?
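Something like this, maybe (just a sketch; I'm assuming PAGE_SIZE here
expands to the runtime page size and that xmalloc()/xfree() are fine to
use on this path; alloc_userbuf() is a made-up helper name):

	/* instead of: static char userbuf[PIPE_MAX_SIZE << 12]; */
	static char *userbuf;

	static int alloc_userbuf(void)
	{
		/* PIPE_MAX_SIZE pages of whatever size the arch has */
		userbuf = xmalloc(PIPE_MAX_SIZE * PAGE_SIZE);
		return userbuf ? 0 : -1;
	}

and a matching xfree(userbuf) once the pre-dump is done.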

> +
> +unsigned long handle_faulty_iov(int pid, struct iovec* riov,
> +                                unsigned long faulty_index,
> +                                struct iovec *bufvec, struct iovec* aux_iov,
> +                                unsigned long* aux_len,
> +                                unsigned long partial_read_bytes)
> +{
> +	/* Handling Case 2*/
> +	if (riov[faulty_index].iov_len == PAGE_SIZE) {
> +		cnt_sub(CNT_PAGES_WRITTEN, 1);
> +		return 0;
> +	}
> +
> +	struct iovec dummy;
> +	ssize_t bytes_read;
> +	unsigned long offset = 0;
> +	unsigned long final_read_cnt = 0;

Could you move the variable declarations to the beginning of the
function, please?
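I.e., keeping the C89-style declaration block at the top:

	unsigned long handle_faulty_iov(...)
	{
		struct iovec dummy;
		ssize_t bytes_read;
		unsigned long offset = 0;
		unsigned long final_read_cnt = 0;

		/* Handling Case 2 */
		if (riov[faulty_index].iov_len == PAGE_SIZE) {
			cnt_sub(CNT_PAGES_WRITTEN, 1);
			return 0;
		}
		...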

> +
> +	/* Handling Case 3-Part 3.2*/
> +	offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE;
> +
> +	dummy.iov_base = riov[faulty_index].iov_base + offset;
> +	dummy.iov_len = riov[faulty_index].iov_len - offset;
> +
> +	if (!partial_read_bytes)
> +		cnt_sub(CNT_PAGES_WRITTEN, 1);
> +
> +	while (dummy.iov_len) {
> +
> +		bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);

Could you check errno here, please? A -1 alone doesn't tell us why the
read failed.
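Something like this, perhaps (assuming EFAULT is the only failure we
actually want to skip over here; the function would need a signed
return type to propagate -errno):

	errno = 0;
	bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);
	if (bytes_read == -1) {
		if (errno != EFAULT)
			return -errno; /* unexpected error - don't skip */

		/* the page went away - account for it and move on */
		cnt_sub(CNT_PAGES_WRITTEN, 1);
		dummy.iov_base += PAGE_SIZE;
		dummy.iov_len -= PAGE_SIZE;
		continue;
	}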

> +
> +		if(bytes_read == -1) {
> +			/* Handling faulty page read in faulty iov */
> +			cnt_sub(CNT_PAGES_WRITTEN, 1);
> +			dummy.iov_base += PAGE_SIZE;
> +			dummy.iov_len -= PAGE_SIZE;
> +			continue;
> +		}
> +
> +		/* If aux-iov can merge and expand or new entry required */
> +		if (aux_iov[(*aux_len)-1].iov_base +
> +			aux_iov[(*aux_len)-1].iov_len == dummy.iov_base)
> +			aux_iov[(*aux_len)-1].iov_len += bytes_read;
> +		else {
> +			aux_iov[*aux_len].iov_base = dummy.iov_base;
> +			aux_iov[*aux_len].iov_len = bytes_read;
> +			(*aux_len) += 1;
> +		}
> +
> +		dummy.iov_base += bytes_read;
> +		dummy.iov_len -= bytes_read;
> +		bufvec->iov_base += bytes_read;
> +		bufvec->iov_len -= bytes_read;
> +		final_read_cnt += bytes_read;
> +	}
> +
> +	return final_read_cnt;
> +}
> +
> +/*
> + * This function will position start pointer to the latest
> + * successfully read iov in iovec. In case of partial read it
> + * returns partial_read_bytes, otherwise 0.
> + */
> +static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov,
> +                                 unsigned long *index, struct iovec *aux_iov,
> +                                 unsigned long *aux_len)
> +{
> +	ssize_t processed_bytes = 0;
> +	unsigned long partial_read_bytes = 0;
> +
> +	/* correlating iovs with read bytes */
> +	while (processed_bytes < bytes_read) {
> +
> +		processed_bytes += riov[*index].iov_len;
> +		aux_iov[*aux_len].iov_base = riov[*index].iov_base;
> +		aux_iov[*aux_len].iov_len = riov[*index].iov_len;
> +
> +		(*aux_len) += 1;
> +		(*index) += 1;
> +	}
> +
> +	/* handling partially processed faulty iov*/
> +	if (processed_bytes - bytes_read) {
> +
> +		(*index) -= 1;
> +
> +		partial_read_bytes = riov[*index].iov_len
> +					- (processed_bytes - bytes_read);
> +		aux_iov[*aux_len-1].iov_len = partial_read_bytes;
> +	}
> +
> +	return partial_read_bytes;
> +}
> +
> +/*
> + * This function iterates over complete ppb->iov entries and pass
> + * them to process_vm_readv syscall.
> + *
> + * Since process_vm_readv returns count of successfully read bytes.
> + * It does not point to iovec entry associated to last successful
> + * byte read. The correlation between bytes read and corresponding
> + * iovec is setup through analyze_iov function.
> + *
> + * If all iovecs are not processed in one go, it means there exists
> + * some faulty iov entry(memory mapping modified after it was grabbed)
> + * in iovec. process_vm_readv syscall stops at such faulty iov and
> + * skip processing further any entry in iovec. This is handled by
> + * handle_faulty_iov function.
> + */
> +static long fill_userbuf(int pid, struct page_pipe_buf *ppb,
> +                                  struct iovec *bufvec,
> +                                  struct iovec* aux_iov,
> +                                  unsigned long *aux_len)
> +{
> +	struct iovec *riov = ppb->iov;
> +	ssize_t bytes_read;
> +	unsigned long total_read = 0;
> +	unsigned long start = 0;
> +	unsigned long partial_read_bytes = 0;
> +
> +	while (start < ppb->nr_segs) {
> +
> +		bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
> +					           ppb->nr_segs - start, 0);
> +
> +		if (bytes_read == -1) {
> +			/* Handling Case 1*/
> +			if (riov[start].iov_len == PAGE_SIZE) {
> +				cnt_sub(CNT_PAGES_WRITTEN, 1);
> +				start += 1;
> +				continue;
> +			} else if (errno == ESRCH) {
> +				pr_debug("Target process PID:%d not found\n", pid);
> +				return PR_UNAVIL;

Hmm, I'm not a fan of special return codes...
Could we return -errno instead, please?
While at it, could you zero errno before process_vm_readv() and check
that the value you get back is one you actually expect?
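E.g. (a sketch only; I'm treating EFAULT as the expected "mapping has
changed" case and failing hard on anything else - bytes_read and the
caller's variable would have to be signed for this to work):

	errno = 0;
	bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
				      ppb->nr_segs - start, 0);
	if (bytes_read == -1) {
		if (errno == ESRCH) {
			pr_debug("Target process PID:%d not found\n", pid);
			return -ESRCH;
		}
		if (errno != EFAULT)
			return -errno;
		/* Handling Case 1 */
		if (riov[start].iov_len == PAGE_SIZE) {
			cnt_sub(CNT_PAGES_WRITTEN, 1);
			start += 1;
			continue;
		}
	}

Then page_xfer_predump_pages() can check for bytes_read < 0 instead of
comparing against the PR_UNAVIL magic value.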

> +			}
> +		}
> +
> +		partial_read_bytes = 0;
> +
> +		if (bytes_read > 0) {
> +			partial_read_bytes = analyze_iov(bytes_read, riov,
> +							 &start, aux_iov,
> +							 aux_len);
> +			bufvec->iov_base += bytes_read;
> +			bufvec->iov_len -= bytes_read;
> +			total_read += bytes_read;
> +		}
> +
> +		/*
> +		 * If all iovs not processed in one go,
> +		 * it means some iov in between has failed.
> +		 */
> +		if (start < ppb->nr_segs)
> +			total_read += handle_faulty_iov(pid, riov, start, bufvec,
> +							aux_iov, aux_len,
> +							partial_read_bytes);
> +		start += 1;
> +
> +	}
> +
> +	return total_read;
> +}
> +
> +/*
> + * This function is similar to page_xfer_dump_pages, instead it uses
> + * auxiliary_iov array for pagemap generation.
> + *
> + * The entries of ppb->iov may mismatch with actual process mappings
> + * present at time of pre-dump. Such entries need to be adjusted as per
> + * the pages read by process_vm_readv syscall. These adjusted entries
> + * along with unmodified entries are present in aux_iov array.
> + */
> +
> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer,
> +                                     struct page_pipe *pp)
> +{
> +	struct page_pipe_buf *ppb;
> +	unsigned int cur_hole = 0, i;
> +	unsigned long ret, bytes_read;
> +	struct iovec bufvec;
> +
> +	struct iovec aux_iov[PIPE_MAX_SIZE];
> +	unsigned long aux_len;
> +
> +	list_for_each_entry(ppb, &pp->bufs, l) {
> +
> +		aux_len = 0;
> +		bufvec.iov_len = sizeof(userbuf);
> +		bufvec.iov_base = userbuf;
> +
> +		bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len);
> +
> +		if (bytes_read == PR_UNAVIL)
> +			return -1;
> +
> +		bufvec.iov_base = userbuf;
> +		bufvec.iov_len = bytes_read;
> +		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if (ret == -1 || ret != bytes_read) {
> +			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);

Probably you want pr_perror() here, so that errno makes it into the log.
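I.e. something along these lines (errno is only meaningful in the first
branch, hence the split):

	if (ret == -1) {
		pr_perror("vmsplice: Failed to splice user buffer to pipe");
		return -1;
	}
	if (ret != bytes_read) {
		pr_err("vmsplice: spliced %lu of %lu bytes\n", ret, bytes_read);
		return -1;
	}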

> +			return -1;
> +		}
> +
> +		/* generating pagemap */
> +		for (i = 0; i < aux_len; i++) {
> +
> +			struct iovec iov = aux_iov[i];
> +			u32 flags;
> +
> +			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
> +			if (ret)
> +				return ret;
> +
> +			BUG_ON(iov.iov_base < (void *)xfer->offset);
> +			iov.iov_base -= xfer->offset;
> +			pr_debug("\t p %p [%u]\n", iov.iov_base,
> +				(unsigned int)(iov.iov_len / PAGE_SIZE));
> +
> +			flags = ppb_xfer_flags(xfer, ppb);
> +
> +			if (xfer->write_pagemap(xfer, &iov, flags))
> +				return -1;
> +			if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len))
> +				return -1;
> +		}
> +        }
> +
> +	return dump_holes(xfer, pp, &cur_hole, NULL);
> +}
> +
> +
>  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  {
>  	struct page_pipe_buf *ppb;
> 

Thanks,
          Dmitry

