[CRIU] [PATCH v2 3/3] aio: Restore aio ring content
Kirill Tkhai
ktkhai at virtuozzo.com
Thu Mar 17 09:08:45 PDT 2016
On 17.03.2016 17:32, Pavel Emelyanov wrote:
> On 03/17/2016 05:26 PM, Kirill Tkhai wrote:
>> On 17.03.2016 16:19, Pavel Emelyanov wrote:
>>>
>>>> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>>>> static inline bool vma_entry_is_private(VmaEntry *entry,
>>>> unsigned long task_size)
>>>> {
>>>> - return vma_entry_is(entry, VMA_AREA_REGULAR) &&
>>>> + return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
>>>> (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
>>>> vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
>>>> - (entry->end <= task_size);
>>>> + (entry->end <= task_size)) ||
>>>> + vma_entry_is(entry, VMA_AREA_AIORING);
>>>
>>> I'm not sure this is safe. How would pre-dumps act on rings?
>>
>> Could you please explain what kind of problems are possible here?
>> I don't see how a memory pre-dump comes into play here.
>
> The vma_entry_is_private() check is too generic. E.g. such vmas are being
> soft-dirty-tracked. Do we want the same for AIO rings? I bet we don't :)
To userspace, an AIO ring buffer looks like anonymous memory. There is no difference
between them: it is writable and modifiable. So if we track anonymous memory,
we have to track AIO ring buffers too.
>>>> }
>>>>
>>>> static inline bool vma_area_is_private(struct vma_area *vma,
>>>> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
>>>> index 1df3e71..d82518e 100644
>>>> --- a/criu/pie/parasite.c
>>>> +++ b/criu/pie/parasite.c
>>>> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>>>> return -1;
>>>> }
>>>>
>>>> - /*
>>>> - * XXX what else can we do if there are requests
>>>> - * in the ring?
>>>> - */
>>>> - if (ring->head != ring->tail) {
>>>> - pr_err("Pending AIO requests in ring #%d\n", i);
>>>> - return -1;
>>>> - }
>>>> + /* XXX: wait aio completion */
>>>>
>>>> args->ring[i].max_reqs = ring->nr;
>>>> }
>>>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>>>> index f7bde75..d19f4dc 100644
>>>> --- a/criu/pie/restorer.c
>>>> +++ b/criu/pie/restorer.c
>>>> @@ -3,6 +3,7 @@
>>>>
>>>> #include <linux/securebits.h>
>>>> #include <linux/capability.h>
>>>> +#include <linux/aio_abi.h>
>>>> #include <sys/types.h>
>>>> #include <sys/mman.h>
>>>> #include <sys/stat.h>
>>>> @@ -546,6 +547,120 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>>>> return addr;
>>>> }
>>>>
>>>> +/*
>>>> + * This restores aio ring header, content, head and in-kernel position
>>>> + * of tail. To set tail, we write to /dev/null and use the fact this
>>>> + * operation is synchronous for the device. Also, we unmap temporary
>>>> + * anonymous area, used to store content of ring buffer during restore
>>>> + * and mapped in map_private_vma().
>>>> + */
>>>> +static int restore_aio_ring(struct rst_aio_ring *raio)
>>>> +{
>>>> + struct aio_ring *ring = (void *)raio->addr;
>>>> + unsigned head = ring->head;
>>>> + unsigned tail = ring->tail;
>>>> + struct iocb *iocb, **iocbp;
>>>> + unsigned long ctx = 0;
>>>> + int i, count, fd, ret;
>>>> + char buf[1];
>>>> +
>>>> + ret = sys_io_setup(raio->nr_req, &ctx);
>>>> + if (ret < 0) {
>>>> + pr_err("Ring setup failed with %d\n", ret);
>>>> + return -1;
>>>> + }
>>>> +
>>>> + if (tail == 0 && head == 0)
>>>> + goto populate;
>>>
>>> if (tail == head) ?
>>
>> No. If tail is not zero, we should move it.
>> If tail == 0 and head != 0, we have nr_req-1-head completed
>> events in the ring, so we should move tail nr_req times (the full
>> buffer size). In this case the number of completed events in the
>> kernel will be set right.
>
> OK, add this as code comment.
>
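For the comment, roughly this worked example (made-up numbers, and this
is my reading of the code above): say ring->nr = 128 and the dumped state
is head = 100, tail = 5, i.e. the ring has wrapped. We submit
count = ring->nr - 1 = 127 requests first (writing the saved head into
the ring part-way through, so the kernel sees free space), then
tail + 1 = 6 more, 133 in total. 133 mod 128 = 5, so the kernel's
internal tail lands exactly on the saved value; the user-visible head,
tail and events are then restored by the copy below. In the non-wrapped
case, say head = 3 and tail = 10, we submit count = tail = 10 requests
and the second io_submit() is skipped.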
>>>> +
>>>> + fd = sys_open("/dev/null", O_WRONLY, 0);
>>>> + if (fd < 0) {
>>>> + pr_err("Can't open /dev/null for aio\n");
>>>> + return -1;
>>>> + }
>>>> +
>>>> + if (tail >= head)
>>>> + count = tail;
>>>
>>> count = tail - head ?
>>
>> No. We can't treat tail and head as just a difference, because
>> a user task may cache these values. We have to restore them exactly
>> as they were before. If we move tail by tail-head only, it will
>> be wrong.
>>
>>>> + else
>>>> + count = ring->nr - 1;
>>>
>>> count = ring->nr - (tail - head) ?
>>
>> No. That would be a number greater than ring->nr. We can't submit more reqs
>> than were allocated in io_setup().
>
> OK, so what does the 'count' variable contain then?
The number of allocated struct iocb, i.e. the maximum number of requests we may submit in one go.
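Or, spelling the same thing out (just a restatement of the hunk above,
not new code):

	/*
	 * Size of the temporary iocb array: the largest number of requests
	 * we may need in flight at once, never more than io_setup() allocated.
	 */
	count = (tail >= head) ? tail : ring->nr - 1;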
>>>> +
>>>> + iocb = (void *)sys_mmap(NULL, count * sizeof(struct iocb), PROT_READ|PROT_WRITE,
>>>> + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>>> + iocbp = (void *)sys_mmap(NULL, count * sizeof(struct iocb *), PROT_READ|PROT_WRITE,
>>>> + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>>
>>> Use singe mmap for all the memory you need.
>>
>> OK
>>
>>>> + if (iocb == MAP_FAILED || iocbp == MAP_FAILED) {
>>>> + pr_err("Can't mmap aio tmp buffer\n");
>>>> + return -1;
>>>> + }
>>>> +
>>>> + for (i = 0; i < count; i++) {
>>>> + iocbp[i] = &iocb[i];
>>>> + iocb[i].aio_fildes = fd;
>>>> + iocb[i].aio_buf = (unsigned long)buf;
>>>> + iocb[i].aio_nbytes = 1;
>>>> + iocb[i].aio_lio_opcode = IOCB_CMD_PWRITE;
>>>
>>> Only writes? Are there asynchronous reads?
>>
>> No. It's just to move tail and head, so it makes no difference
>> which operation is used for that. The statuses are restored below.
>
> I'd use reads then...
Reading from /dev/null is slower, because it populates the user buffer. A write just throws the data away.
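If it helps, here is a rough standalone sketch (not CRIU code, untested
here) of the mechanism this relies on: the context id io_setup() returns
is the user-space address of the ring, a submitted write to /dev/null
moves the tail, and io_getevents() moves the head. struct aio_ring is
copied by hand because the kernel does not export it via UAPI:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/* In-kernel ring layout (fs/aio.c); not part of the UAPI headers. */
struct aio_ring {
	unsigned id, nr, head, tail, magic;
	unsigned compat_features, incompat_features, header_length;
	struct io_event io_events[];
};

int main(void)
{
	aio_context_t ctx = 0;
	struct aio_ring *ring;
	struct iocb cb = {0}, *cbp = &cb;
	struct io_event ev;
	char c = 0;
	int fd;

	if (syscall(SYS_io_setup, 128, &ctx))
		return 1;
	ring = (struct aio_ring *)ctx;	/* ctx is the mapped ring address */
	printf("fresh ring:      nr=%u head=%u tail=%u\n",
	       ring->nr, ring->head, ring->tail);

	fd = open("/dev/null", O_WRONLY);
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PWRITE;
	cb.aio_buf = (unsigned long)&c;
	cb.aio_nbytes = 1;

	/* Completes synchronously for /dev/null, so tail moves right away */
	if (syscall(SYS_io_submit, ctx, 1, &cbp) != 1)
		return 1;
	printf("after submit:    head=%u tail=%u\n", ring->head, ring->tail);

	/* Reaping the event moves head, which is what the direct write of
	 * ring->head in the loop above stands in for. */
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	printf("after getevents: head=%u tail=%u\n", ring->head, ring->tail);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}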
>>>> + }
>>>> +
>>>> + i = count;
>>>> + do {
>>>> + ret = sys_io_submit(ctx, i, iocbp);
>>>> + if (ret < 0) {
>>>> + pr_err("Can't submit %d aio iocbs: ret=%d\n", i, ret);
>>>> + return -1;
>>>> + }
>>>> + i -= ret;
>>>> +
>>>> + if (count - i > head)
>>>> + /*
>>>> + * Though count is less than maximum available reqs, kernel's
>>>> + * get_reqs_available() takes only a number of reqs that is
>>>> + * a multiple of kioctx::req_batch. So, set head to free space
>>>> + * for the next io_submit().
>>>> + *
>>>> + * Direct set of head is equal to sys_io_getevents() call. See
>>>> + * kernel for the details.
>>>> + */
>>>> + ((struct aio_ring *)ctx)->head = head;
>>>> + } while (i);
>>>> +
>>>> + if (tail < head) {
>>>> + ret = sys_io_submit(ctx, tail + 1, iocbp);
>>>
>>> Why?!
>>
>> You can't submit more than nr_req at once.
>
> But you do it in a loop above anyway.
Yes, because it's impossible to submit nr_req requests at once from a single CPU on newer kernels.
It can only submit a number of requests that is a multiple of kioctx::req_batch. If nr_req
is not such a multiple, the leftover reqs are unavailable.
To work around this, we advance head earlier.
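Purely for illustration, with made-up numbers: if nr_req is 100 and the
kernel hands out request slots in batches of, say, 32, only 96 slots can
be taken from a fresh ring and the last 4 stay unavailable until head
moves. That is why the loop writes the saved head into the ring as soon
as enough requests have gone through: it acts like io_getevents() and
frees slots for the next io_submit().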
>> So we need to advance head beforehand,
>> and this action acts like an io_getevents() call. We set head manually because it's
>> faster. After head is set, the kernel sees there is available space in the buffer, and
>> the second io_submit() works correctly.
>>
>>>> + if (ret != tail + 1) {
>>>> + pr_err("Can't submit %d aio iocbs more, ret=%d\n", tail + 1, ret);
>>>> + return -1;
>>>> + }
>>>> + }
>>>> +
>>>> + sys_munmap(iocb, count * sizeof(struct iocb));
>>>> + sys_munmap(iocbp, count * sizeof(struct iocb *));
>>>> + sys_close(fd);
>>>> +populate:
>>>> + count = raio->len/sizeof(unsigned long);
>>>> + for (i = 0; i < count; i++)
>>>> + ((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
>>>
>>> Don't copy the whole rings, we only need the events.
>>
>> OK. We may skip the aio_ring fields up to tail. But it may still be
>> useful to restore them, starting from aio_ring::magic.
>
> I'd say we only need to restore head, tail and events. And do it _explicitly_
> rather than with memcpy-s over parts of the structure.
OK, and if something changes in the kernel in the future, we'll update criu.
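A rough sketch of how the explicit variant could look (field names per
the in-kernel struct aio_ring; just a suggestion, not what the current
patch does):

	struct aio_ring *dst = (struct aio_ring *)ctx;
	unsigned i;

	dst->head = ring->head;
	dst->tail = ring->tail;
	/* Copy only the completed-but-unreaped events, handling wrap-around */
	for (i = ring->head; i != ring->tail; i = (i + 1) % ring->nr)
		dst->io_events[i] = ring->io_events[i];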
>>>> +
>>>> + /* Unmap temporary anonymous area */
>>>> + sys_munmap(ring, raio->len);
>>>
>>> Presumably this is not required, sys_mremap() unmaps the target.
>>
>> OK
>>
>>>> +
>>>> + /*
>>>> + * If we failed to get the proper nr_req right and
>>>> + * created smaller or larger ring, then this remap
>>>> + * will (should) fail, since AIO rings has immutable
>>>> + * size.
>>>> + *
>>>> + * This is not great, but anyway better than putting
>>>> + * a ring of wrong size into correct place.
>>>> + */
>>>> + ctx = sys_mremap(ctx, raio->len, raio->len,
>>>> + MREMAP_FIXED | MREMAP_MAYMOVE,
>>>> + raio->addr);
>>>> + if (ctx != raio->addr) {
>>>> + pr_err("Ring remap failed with %ld\n", ctx);
>>>> + return -1;
>>>> + }
>>>> + return 0;
>>>> +}
>>>> +
>>>> static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
>>>> {
>>>> int aux, ret;
>>>> @@ -999,56 +1114,9 @@ long __export_restore_task(struct task_restore_args *args)
>>>> * Now when all VMAs are in their places time to set
>>>> * up AIO rings.
>>>> */
>>>> -
>>>> - for (i = 0; i < args->rings_n; i++) {
>>>> - struct rst_aio_ring *raio = &args->rings[i];
>>>> - unsigned long ctx = 0;
>>>> - int ret;
>>>> -
>>>> - ret = sys_io_setup(raio->nr_req, &ctx);
>>>> - if (ret < 0) {
>>>> - pr_err("Ring setup failed with %d\n", ret);
>>>> + for (i = 0; i < args->rings_n; i++)
>>>> + if (restore_aio_ring(&args->rings[i]) < 0)
>>>> goto core_restore_end;
>>>
>>> For such heavy patching, introduce an intermediate patch that just moves
>>> the needed code into a helper w/o any changes.
>>
>> OK
>>
>>>> - }
>>>> -
>>>> - if (ctx == raio->addr) /* Lucky bastards we are! */
>>>> - continue;
>>>> -
>>>> - /*
>>>> - * If we failed to get the proper nr_req right and
>>>> - * created smaller or larger ring, then this remap
>>>> - * will (should) fail, since AIO rings has immutable
>>>> - * size.
>>>> - *
>>>> - * This is not great, but anyway better than putting
>>>> - * a ring of wrong size into correct place.
>>>> - */
>>>> -
>>>> - ctx = sys_mremap(ctx, raio->len, raio->len,
>>>> - MREMAP_FIXED | MREMAP_MAYMOVE,
>>>> - raio->addr);
>>>> - if (ctx != raio->addr) {
>>>> - pr_err("Ring remap failed with %ld\n", ctx);
>>>> - goto core_restore_end;
>>>> - }
>>>> -
>>>> - /*
>>>> - * Now check that kernel not just remapped the
>>>> - * ring into new place, but updated the internal
>>>> - * context state respectively.
>>>> - */
>>>> -
>>>> - ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
>>>> - if (ret != 0) {
>>>> - if (ret < 0)
>>>> - pr_err("Kernel doesn't remap AIO rings\n");
>>>> - else
>>>> - pr_err("AIO context screwed up\n");
>>>> -
>>>> - goto core_restore_end;
>>>> - }
>>>> - }
>>>> -
>>>> /*
>>>> * Finally restore madivse() bits
>>>> */
>>>>