[CRIU] [PATCH v2 3/3] aio: Restore aio ring content

Pavel Emelyanov xemul at virtuozzo.com
Mon Mar 14 07:56:12 PDT 2016


On 03/14/2016 02:49 PM, Kirill Tkhai wrote:
> 1) Dump/restore the mmapped aio ring like any other private vma entry.
> 2) Create an io context, set head and tail using writes to /dev/null.
> 3) Copy the aio ring restored in (1) into the one created in (2).
> 4) Unmap the temporary ring (1).
> 5) Remap (2) to the address of (1).

I need a better description of the supported states. There are pending
requests and completed requests, and criu tries not to c/r anything.

What changes with this patch?

> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
>  criu/arch/ppc64/syscalls/syscall-ppc64.tbl |    1 
>  criu/arch/x86/syscalls/syscall_32.tbl      |    1 
>  criu/arch/x86/syscalls/syscall_64.tbl      |    1 
>  criu/cr-restore.c                          |    6 +
>  criu/include/syscall-types.h               |    1 
>  criu/include/vma.h                         |    5 +
>  criu/pie/parasite.c                        |    9 --
>  criu/pie/restorer.c                        |  166 ++++++++++++++++++++--------
>  8 files changed, 130 insertions(+), 60 deletions(-)
> 
> diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
> index 3319379..e71a1ad 100644
> --- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
> +++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
> @@ -102,4 +102,5 @@ __NR_seccomp		358		sys_seccomp		(unsigned int op, unsigned int flags, const char
>  __NR_memfd_create	360		sys_memfd_create	(const char *name, unsigned int flags)
>  __NR_io_setup		227		sys_io_setup		(unsigned nr_events, aio_context_t *ctx_idp)
>  __NR_io_getevents	229		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
> +__NR_io_submit		230		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>  __NR_ipc		117		sys_ipc			(unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
> diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
> index c527122..2b61530 100644
> --- a/criu/arch/x86/syscalls/syscall_32.tbl
> +++ b/criu/arch/x86/syscalls/syscall_32.tbl
> @@ -66,6 +66,7 @@ __NR_set_thread_area	243		sys_set_thread_area	(user_desc_t *info)
>  __NR_get_thread_area	244		sys_get_thread_area	(user_desc_t *info)
>  __NR_io_setup		245		sys_io_setup		(unsigned nr_reqs, aio_context_t *ctx32p)
>  __NR_io_getevents	247		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
> +__NR_io_submit		248		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>  __NR_exit_group		252		sys_exit_group		(int error_code)
>  __NR_set_tid_address	258		sys_set_tid_address	(int *tid_addr)
>  __NR_timer_create	259		sys_timer_create	(clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
> diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
> index 5c32d4c..e01ea8f 100644
> --- a/criu/arch/x86/syscalls/syscall_64.tbl
> +++ b/criu/arch/x86/syscalls/syscall_64.tbl
> @@ -74,6 +74,7 @@ __NR_futex			202		sys_futex		(u32 *uaddr, int op, u32 val, struct timespec *utim
>  __NR_set_thread_area		205		sys_set_thread_area	(user_desc_t *info)
>  __NR_io_setup			206		sys_io_setup		(unsigned nr_events, aio_context_t *ctx)
>  __NR_io_getevents		208		sys_io_getevents	(aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
> +__NR_io_submit			209		sys_io_submit		(aio_context_t ctx, long nr, struct iocb **iocbpp)
>  __NR_get_thread_area		211		sys_get_thread_area	(user_desc_t *info)
>  __NR_set_tid_address		218		sys_set_tid_address	(int *tid_addr)
>  __NR_restart_syscall		219		sys_restart_syscall	(void)
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 30ddff9..922fa14 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>  
>  	size = vma_entry_len(vma->e);
>  	if (paddr == NULL) {
> +		int flag = 0;
>  		/*
>  		 * The respective memory area was NOT found in the parent.
>  		 * Map a new one.
> @@ -323,9 +324,12 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>  		pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
>  			vma->e->start, vma->e->end, vma->e->pgoff);
>  
> +		if (vma_entry_is(vma->e, VMA_AREA_AIORING))
> +			flag |= MAP_ANONYMOUS;
> +
>  		addr = mmap(*tgt_addr, size,
>  				vma->e->prot | PROT_WRITE,
> -				vma->e->flags | MAP_FIXED,
> +				vma->e->flags | MAP_FIXED | flag,
>  				vma->e->fd, vma->e->pgoff);
>  
>  		if (addr == MAP_FAILED) {
> diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
> index b056f6d..5b4e1aa 100644
> --- a/criu/include/syscall-types.h
> +++ b/criu/include/syscall-types.h
> @@ -31,6 +31,7 @@ struct rusage;
>  struct file_handle;
>  struct robust_list_head;
>  struct io_event;
> +struct iocb;
>  struct timespec;
>  
>  typedef unsigned long aio_context_t;
> diff --git a/criu/include/vma.h b/criu/include/vma.h
> index 247c5a3..ce4d5f7 100644
> --- a/criu/include/vma.h
> +++ b/criu/include/vma.h
> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>  static inline bool vma_entry_is_private(VmaEntry *entry,
>  					unsigned long task_size)
>  {
> -	return vma_entry_is(entry, VMA_AREA_REGULAR)	&&
> +	return (vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>  		(vma_entry_is(entry, VMA_ANON_PRIVATE)	||
>  		 vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
> -		 (entry->end <= task_size);
> +		 (entry->end <= task_size)) ||
> +		vma_entry_is(entry, VMA_AREA_AIORING);
>  }
>  
>  static inline bool vma_area_is_private(struct vma_area *vma,
> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
> index 1df3e71..d82518e 100644
> --- a/criu/pie/parasite.c
> +++ b/criu/pie/parasite.c
> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>  			return -1;
>  		}
>  
> -		/*
> -		 * XXX what else can we do if there are requests
> -		 * in the ring?
> -		 */
> -		if (ring->head != ring->tail) {
> -			pr_err("Pending AIO requests in ring #%d\n", i);
> -			return -1;
> -		}
> +		/* XXX: wait aio completion */
>  
>  		args->ring[i].max_reqs = ring->nr;
>  	}
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index f7bde75..d19f4dc 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -3,6 +3,7 @@
>  
>  #include <linux/securebits.h>
>  #include <linux/capability.h>
> +#include <linux/aio_abi.h>
>  #include <sys/types.h>
>  #include <sys/mman.h>
>  #include <sys/stat.h>
> @@ -546,6 +547,120 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>  	return addr;
>  }
>  
> +/*
> + * This restores aio ring header, content, head and in-kernel position
> + * of tail. To set tail, we write to /dev/null and use the fact this
> + * operation is synchronous for the device. Also, we unmap temporary
> + * anonymous area, used to store content of ring buffer during restore
> + * and mapped in map_private_vma().
> + */
> +static int restore_aio_ring(struct rst_aio_ring *raio)
> +{
> +	struct aio_ring *ring = (void *)raio->addr;
> +	unsigned head = ring->head;
> +	unsigned tail = ring->tail;
> +	struct iocb *iocb, **iocbp;
> +	unsigned long ctx = 0;
> +	int i, count, fd, ret;
> +	char buf[1];
> +
> +	ret = sys_io_setup(raio->nr_req, &ctx);
> +	if (ret < 0) {
> +		pr_err("Ring setup failed with %d\n", ret);
> +		return -1;
> +	}
> +
> +	if (tail == 0 && head == 0)
> +		goto populate;
> +
> +	fd = sys_open("/dev/null", O_WRONLY, 0);
> +	if (fd < 0) {
> +		pr_err("Can't open /dev/null for aio\n");
> +		return -1;
> +	}
> +
> +	if (tail >= head)
> +		count = tail;
> +	else
> +		count = ring->nr - 1;
> +
> +	iocb = (void *)sys_mmap(NULL, count * sizeof(struct iocb), PROT_READ|PROT_WRITE,
> +				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> +	iocbp = (void *)sys_mmap(NULL, count * sizeof(struct iocb *), PROT_READ|PROT_WRITE,
> +				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> +	if (iocb == MAP_FAILED || iocbp == MAP_FAILED) {
> +		pr_err("Can't mmap aio tmp buffer\n");
> +		return -1;
> +	}
> +
> +	for (i = 0; i < count; i++) {
> +		iocbp[i] = &iocb[i];
> +		iocb[i].aio_fildes = fd;
> +		iocb[i].aio_buf = (unsigned long)buf;
> +		iocb[i].aio_nbytes = 1;
> +		iocb[i].aio_lio_opcode = IOCB_CMD_PWRITE;
> +	}
> +
> +	i = count;
> +	do {
> +		ret = sys_io_submit(ctx, i, iocbp);
> +		if (ret < 0) {
> +			pr_err("Can't submit %d aio iocbs: ret=%d\n", i, ret);
> +			return -1;
> +		}
> +		i -= ret;
> +
> +		if (count - i > head)
> +		 /*
> +		  * Though count is less than maximum available reqs, kernel's
> +		  * get_reqs_available() takes only a number of reqs, which is
> +		  * a multiple of kioctx::req_batch. So, set head to free a space
> +		  * for next io_submit().
> +		  *
> +		  * Direct set of head is equal to sys_io_getevents() call. See
> +		  * kernel for the details.
> +		  */
> +			((struct aio_ring *)ctx)->head = head;
> +	} while (i);
> +
> +	if (tail < head) {
> +		ret = sys_io_submit(ctx, tail + 1, iocbp);
> +		if (ret != tail + 1) {
> +			pr_err("Can't submit %d aio iocbs more, ret=%d\n", tail + 1, ret);
> +			return -1;
> +		}
> +	}
> +
> +	sys_munmap(iocb, count * sizeof(struct iocb));
> +	sys_munmap(iocbp, count * sizeof(struct iocb *));
> +	sys_close(fd);
> +populate:
> +	count = raio->len/sizeof(unsigned long);
> +	for (i = 0; i < count; i++)
> +		((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
> +
> +	/* Unmap temporary anonymous area */
> +	sys_munmap(ring, raio->len);
> +
> +	/*
> +	 * If we failed to get the proper nr_req right and
> +	 * created smaller or larger ring, then this remap
> +	 * will (should) fail, since AIO rings has immutable
> +	 * size.
> +	 *
> +	 * This is not great, but anyway better than putting
> +	 * a ring of wrong size into correct place.
> +	 */
> +	ctx = sys_mremap(ctx, raio->len, raio->len,
> +				MREMAP_FIXED | MREMAP_MAYMOVE,
> +				raio->addr);
> +	if (ctx != raio->addr) {
> +		pr_err("Ring remap failed with %ld\n", ctx);
> +		return -1;
> +	}
> +	return 0;
> +}
> +
>  static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
>  {
>  	int aux, ret;
> @@ -999,56 +1114,9 @@ long __export_restore_task(struct task_restore_args *args)
>  	 * Now when all VMAs are in their places time to set
>  	 * up AIO rings.
>  	 */
> -
> -	for (i = 0; i < args->rings_n; i++) {
> -		struct rst_aio_ring *raio = &args->rings[i];
> -		unsigned long ctx = 0;
> -		int ret;
> -
> -		ret = sys_io_setup(raio->nr_req, &ctx);
> -		if (ret < 0) {
> -			pr_err("Ring setup failed with %d\n", ret);
> +	for (i = 0; i < args->rings_n; i++)
> +		if (restore_aio_ring(&args->rings[i]) < 0)
>  			goto core_restore_end;
> -		}
> -
> -		if (ctx == raio->addr) /* Lucky bastards we are! */
> -			continue;
> -
> -		/*
> -		 * If we failed to get the proper nr_req right and
> -		 * created smaller or larger ring, then this remap
> -		 * will (should) fail, since AIO rings has immutable
> -		 * size.
> -		 *
> -		 * This is not great, but anyway better than putting
> -		 * a ring of wrong size into correct place.
> -		 */
> -
> -		ctx = sys_mremap(ctx, raio->len, raio->len,
> -					MREMAP_FIXED | MREMAP_MAYMOVE,
> -					raio->addr);
> -		if (ctx != raio->addr) {
> -			pr_err("Ring remap failed with %ld\n", ctx);
> -			goto core_restore_end;
> -		}
> -
> -		/*
> -		 * Now check that kernel not just remapped the
> -		 * ring into new place, but updated the internal
> -		 * context state respectively.
> -		 */
> -
> -		ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
> -		if (ret != 0) {
> -			if (ret < 0)
> -				pr_err("Kernel doesn't remap AIO rings\n");
> -			else
> -				pr_err("AIO context screwed up\n");
> -
> -			goto core_restore_end;
> -		}
> -	}
> -
>  	/*
>  	 * Finally restore madivse() bits
>  	 */
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
> .
> 



More information about the CRIU mailing list