[CRIU] [PATCH v2 3/3] aio: Restore aio ring content

Kirill Tkhai ktkhai at virtuozzo.com
Thu Mar 17 07:26:01 PDT 2016


On 17.03.2016 16:19, Pavel Emelyanov wrote:
> 
>> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>>  static inline bool vma_entry_is_private(VmaEntry *entry,
>>  					unsigned long task_size)
>>  {
>> -	return vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>> +	return (vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>>  		(vma_entry_is(entry, VMA_ANON_PRIVATE)	||
>>  		 vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
>> -		 (entry->end <= task_size);
>> +		 (entry->end <= task_size)) ||
>> +		vma_entry_is(entry, VMA_AREA_AIORING);
> 
> I'm not sure this is safe. How would pre-dumps act on rings?

Could you please explain what kind of problems are possible here?
I don't see a memory predump.
 
>>  }
>>  
>>  static inline bool vma_area_is_private(struct vma_area *vma,
>> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
>> index 1df3e71..d82518e 100644
>> --- a/criu/pie/parasite.c
>> +++ b/criu/pie/parasite.c
>> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>>  			return -1;
>>  		}
>>  
>> -		/*
>> -		 * XXX what else can we do if there are requests
>> -		 * in the ring?
>> -		 */
>> -		if (ring->head != ring->tail) {
>> -			pr_err("Pending AIO requests in ring #%d\n", i);
>> -			return -1;
>> -		}
>> +		/* XXX: wait aio completion */
>>  
>>  		args->ring[i].max_reqs = ring->nr;
>>  	}
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index f7bde75..d19f4dc 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -3,6 +3,7 @@
>>  
>>  #include <linux/securebits.h>
>>  #include <linux/capability.h>
>> +#include <linux/aio_abi.h>
>>  #include <sys/types.h>
>>  #include <sys/mman.h>
>>  #include <sys/stat.h>
>> @@ -546,6 +547,120 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>>  	return addr;
>>  }
>>  
>> +/*
>> + * This restores aio ring header, content, head and in-kernel position
>> + * of tail. To set tail, we write to /dev/null and use the fact this
>> + * operation is synchronious for the device. Also, we unmap temporary
>> + * anonymous area, used to store content of ring buffer during restore
>> + * and mapped in map_private_vma().
>> + */
>> +static int restore_aio_ring(struct rst_aio_ring *raio)
>> +{
>> +	struct aio_ring *ring = (void *)raio->addr;
>> +	unsigned head = ring->head;
>> +	unsigned tail = ring->tail;
>> +	struct iocb *iocb, **iocbp;
>> +	unsigned long ctx = 0;
>> +	int i, count, fd, ret;
>> +	char buf[1];
>> +
>> +	ret = sys_io_setup(raio->nr_req, &ctx);
>> +	if (ret < 0) {
>> +		pr_err("Ring setup failed with %d\n", ret);
>> +		return -1;
>> +	}
>> +
>> +	if (tail == 0 && head == 0)
>> +		goto populate;
> 
> if (tail == head) ?

No. If tail is not zero, we should move it.
If tail == 0 and head != 0, we have nr_req-1-head completed
events in the ring, so we should move tail nr_req times (the full
buffer size). In this case, the number of completed events in the
kernel will be set correctly.
 
>> +
>> +	fd = sys_open("/dev/null", O_WRONLY, 0);
>> +	if (fd < 0) {
>> +		pr_err("Can't open /dev/null for aio\n");
>> +		return -1;
>> +	}
>> +
>> +	if (tail >= head)
>> +		count = tail;
> 
> count = tail - head ?

No. We can't treat tail and head as just a difference, because
a user task may cache these values. We have to restore them exactly
as they were before. If we only move tail by tail-head, it will
be wrong.

>> +	else
>> +		count = ring->nr - 1;
> 
> count = ring->nr - (tail - head) ?

No. It will be a number greater than ring->nr. We can't submit more reqs
than allocated in io_setup().

>> +
>> +	iocb = (void *)sys_mmap(NULL, count * sizeof(struct iocb), PROT_READ|PROT_WRITE,
>> +				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>> +	iocbp = (void *)sys_mmap(NULL, count * sizeof(struct iocb *), PROT_READ|PROT_WRITE,
>> +				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> 
> Use singe mmap for all the memory you need.

OK

>> +	if (iocb == MAP_FAILED || iocbp == MAP_FAILED) {
>> +		pr_err("Can't mmap aio tmp buffer\n");
>> +		return -1;
>> +	}
>> +
>> +	for (i = 0; i < count; i++) {
>> +		iocbp[i] = &iocb[i];
>> +		iocb[i].aio_fildes = fd;
>> +		iocb[i].aio_buf = (unsigned long)buf;
>> +		iocb[i].aio_nbytes = 1;
>> +		iocb[i].aio_lio_opcode = IOCB_CMD_PWRITE;
> 
> Only writes? Are there asynchronous reads?

No. It's just to move tail and head, so it makes no difference
which operation is used for that. The statuses are restored below.

>> +	}
>> +
>> +	i = count;
>> +	do {
>> +		ret = sys_io_submit(ctx, i, iocbp);
>> +		if (ret < 0) {
>> +			pr_err("Can't submit %d aio iocbs: ret=%d\n", i, ret);
>> +			return -1;
>> +		}
>> +		i -= ret;
>> +
>> +		if (count - i > head)
>> +		 /*
>> +		  * Though count is less than maximum available reqs, kernel's
>> +		  * get_reqs_available() takes only a number of reqs, which is
>> +		  * aliquot to kioctx::req_batch. So, set head to free a space
>> +		  * for next io_submit().
>> +		  *
>> +		  * Direct set of head is equal to sys_io_getevents() call. See
>> +		  * kernel for the details.
>> +		  */
>> +			((struct aio_ring *)ctx)->head = head;
>> +	} while (i);
>> +
>> +	if (tail < head) {
>> +		ret = sys_io_submit(ctx, tail + 1, iocbp);
> 
> Why?!

You can't submit more than nr_req requests at once. So, we need to advance head
first, and this action acts like an io_getevents() call. We set head manually because
it's faster. After head is set, the kernel sees that we have available space in the
buffer, and the second io_submit() works correctly.

>> +		if (ret != tail + 1) {
>> +			pr_err("Can't submit %d aio iocbs more, ret=%d\n", tail + 1, ret);
>> +			return -1;
>> +		}
>> +	}
>> +
>> +	sys_munmap(iocb, count * sizeof(struct iocb));
>> +	sys_munmap(iocbp, count * sizeof(struct iocb *));
>> +	sys_close(fd);
>> +populate:
>> +	count = raio->len/sizeof(unsigned long);
>> +	for (i = 0; i < count; i++)
>> +		((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
> 
> Don't copy the whole rings, we only need the events.

OK. We may skip aio_ring fields up to tail. But anyway it may be
useful to restore them from aio_ring::magic.

>> +
>> +	/* Unmap temporary anonymous area */
>> +	sys_munmap(ring, raio->len);
> 
> Presumably this is not required, sys_mremap() unmaps the target.

OK

>> +
>> +	/*
>> +	 * If we failed to get the proper nr_req right and
>> +	 * created smaller or larger ring, then this remap
>> +	 * will (should) fail, since AIO rings has immutable
>> +	 * size.
>> +	 *
>> +	 * This is not great, but anyway better than putting
>> +	 * a ring of wrong size into correct place.
>> +	 */
>> +	ctx = sys_mremap(ctx, raio->len, raio->len,
>> +				MREMAP_FIXED | MREMAP_MAYMOVE,
>> +				raio->addr);
>> +	if (ctx != raio->addr) {
>> +		pr_err("Ring remap failed with %ld\n", ctx);
>> +		return -1;
>> +	}
>> +	return 0;
>> +}
>> +
>>  static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
>>  {
>>  	int aux, ret;
>> @@ -999,56 +1114,9 @@ long __export_restore_task(struct task_restore_args *args)
>>  	 * Now when all VMAs are in their places time to set
>>  	 * up AIO rings.
>>  	 */
>> -
>> -	for (i = 0; i < args->rings_n; i++) {
>> -		struct rst_aio_ring *raio = &args->rings[i];
>> -		unsigned long ctx = 0;
>> -		int ret;
>> -
>> -		ret = sys_io_setup(raio->nr_req, &ctx);
>> -		if (ret < 0) {
>> -			pr_err("Ring setup failed with %d\n", ret);
>> +	for (i = 0; i < args->rings_n; i++)
>> +		if (restore_aio_ring(&args->rings[i]) < 0)
>>  			goto core_restore_end;
> 
> For such huge patching introduce the intermediate patch that just moves
> the needed code into a helper w/o any changes.

OK

>> -		}
>> -
>> -		if (ctx == raio->addr) /* Lucky bastards we are! */
>> -			continue;
>> -
>> -		/*
>> -		 * If we failed to get the proper nr_req right and
>> -		 * created smaller or larger ring, then this remap
>> -		 * will (should) fail, since AIO rings has immutable
>> -		 * size.
>> -		 *
>> -		 * This is not great, but anyway better than putting
>> -		 * a ring of wrong size into correct place.
>> -		 */
>> -
>> -		ctx = sys_mremap(ctx, raio->len, raio->len,
>> -					MREMAP_FIXED | MREMAP_MAYMOVE,
>> -					raio->addr);
>> -		if (ctx != raio->addr) {
>> -			pr_err("Ring remap failed with %ld\n", ctx);
>> -			goto core_restore_end;
>> -		}
>> -
>> -		/*
>> -		 * Now check that kernel not just remapped the
>> -		 * ring into new place, but updated the internal
>> -		 * context state respectively.
>> -		 */
>> -
>> -		ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
>> -		if (ret != 0) {
>> -			if (ret < 0)
>> -				pr_err("Kernel doesn't remap AIO rings\n");
>> -			else
>> -				pr_err("AIO context screwed up\n");
>> -
>> -			goto core_restore_end;
>> -		}
>> -	}
>> -
>>  	/*
>>  	 * Finally restore madivse() bits
>>  	 */
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU at openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
>> .
>>
> 


More information about the CRIU mailing list