[CRIU] [PATCH v6 4/5] aio: Restore aio ring content

Kirill Tkhai ktkhai at virtuozzo.com
Fri Mar 25 06:32:29 PDT 2016



On 25.03.2016 14:58, Pavel Emelyanov wrote:
> On 03/23/2016 06:00 PM, Kirill Tkhai wrote:
>> 1)Dump/restore mmaped aio ring like any other private vma entry,
>>   with the only exception we do not predump it.
>> 2)Create io context, set head and tail using write to /dev/null.
>> 3)Copy aio ring restored in (1) to created in (2).
>> 4)Remap (2) to address of (1).
>>
>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>> ---
>>  criu/arch/ppc64/syscalls/syscall-ppc64.tbl |    1 
>>  criu/arch/x86/syscalls/syscall_32.tbl      |    1 
>>  criu/arch/x86/syscalls/syscall_64.tbl      |    1 
>>  criu/cr-restore.c                          |   10 ++-
>>  criu/include/syscall-types.h               |    1 
>>  criu/include/vma.h                         |    5 +
>>  criu/mem.c                                 |    8 ++
>>  criu/pie/parasite.c                        |    9 --
>>  criu/pie/restorer.c                        |  112 +++++++++++++++++++++++-----
>>  9 files changed, 117 insertions(+), 31 deletions(-)
>>
>> diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>> index 3319379..e71a1ad 100644
>> --- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>> +++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>> @@ -102,4 +102,5 @@ __NR_seccomp		358		sys_seccomp		(unsigned int op, unsigned int flags, const char
>>  __NR_memfd_create	360		sys_memfd_create	(const char *name, unsigned int flags)
>>  __NR_io_setup		227		sys_io_setup		(unsigned nr_events, aio_context_t *ctx_idp)
>>  __NR_io_getevents	229		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
>> +__NR_io_submit		230		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>>  __NR_ipc		117		sys_ipc			(unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
>> diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
>> index c527122..2b61530 100644
>> --- a/criu/arch/x86/syscalls/syscall_32.tbl
>> +++ b/criu/arch/x86/syscalls/syscall_32.tbl
>> @@ -66,6 +66,7 @@ __NR_set_thread_area	243		sys_set_thread_area	(user_desc_t *info)
>>  __NR_get_thread_area	244		sys_get_thread_area	(user_desc_t *info)
>>  __NR_io_setup		245		sys_io_setup		(unsigned nr_reqs, aio_context_t *ctx32p)
>>  __NR_io_getevents	247		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
>> +__NR_io_submit		248		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>>  __NR_exit_group		252		sys_exit_group		(int error_code)
>>  __NR_set_tid_address	258		sys_set_tid_address	(int *tid_addr)
>>  __NR_timer_create	259		sys_timer_create	(clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
>> diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
>> index 5c32d4c..e01ea8f 100644
>> --- a/criu/arch/x86/syscalls/syscall_64.tbl
>> +++ b/criu/arch/x86/syscalls/syscall_64.tbl
>> @@ -74,6 +74,7 @@ __NR_futex			202		sys_futex		(u32 *uaddr, int op, u32 val, struct timespec *utim
>>  __NR_set_thread_area		205		sys_set_thread_area	(user_desc_t *info)
>>  __NR_io_setup			206		sys_io_setup		(unsigned nr_events, aio_context_t *ctx)
>>  __NR_io_getevents		208		sys_io_getevents	(aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
>> +__NR_io_submit			209		sys_io_submit		(aio_context_t ctx, long nr, struct iocb **iocbpp)
>>  __NR_get_thread_area		211		sys_get_thread_area	(user_desc_t *info)
>>  __NR_set_tid_address		218		sys_set_tid_address	(int *tid_addr)
>>  __NR_restart_syscall		219		sys_restart_syscall	(void)
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 0c6a828..a40552e 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>>  
>>  	size = vma_entry_len(vma->e);
>>  	if (paddr == NULL) {
>> +		int flag = 0;
>>  		/*
>>  		 * The respective memory area was NOT found in the parent.
>>  		 * Map a new one.
>> @@ -323,9 +324,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>>  		pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
>>  			vma->e->start, vma->e->end, vma->e->pgoff);
>>  
>> +		/*
>> +		 * Restore AIO ring buffer content to temporary anonymous area.
>> +		 * This will be placed in io_setup'ed AIO in restore_aio_ring().
>> +		 */
>> +		if (vma_entry_is(vma->e, VMA_AREA_AIORING))
>> +			flag |= MAP_ANONYMOUS;
>> +
>>  		addr = mmap(*tgt_addr, size,
>>  				vma->e->prot | PROT_WRITE,
>> -				vma->e->flags | MAP_FIXED,
>> +				vma->e->flags | MAP_FIXED | flag,
>>  				vma->e->fd, vma->e->pgoff);
>>  
>>  		if (addr == MAP_FAILED) {
>> diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
>> index fdfed9a..75fb353 100644
>> --- a/criu/include/syscall-types.h
>> +++ b/criu/include/syscall-types.h
>> @@ -31,6 +31,7 @@ struct rusage;
>>  struct file_handle;
>>  struct robust_list_head;
>>  struct io_event;
>> +struct iocb;
>>  struct timespec;
>>  
>>  typedef unsigned long aio_context_t;
>> diff --git a/criu/include/vma.h b/criu/include/vma.h
>> index 247c5a3..ce4d5f7 100644
>> --- a/criu/include/vma.h
>> +++ b/criu/include/vma.h
>> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>>  static inline bool vma_entry_is_private(VmaEntry *entry,
>>  					unsigned long task_size)
>>  {
>> -	return vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>> +	return (vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>>  		(vma_entry_is(entry, VMA_ANON_PRIVATE)	||
>>  		 vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
>> -		 (entry->end <= task_size);
>> +		 (entry->end <= task_size)) ||
>> +		vma_entry_is(entry, VMA_AREA_AIORING);
>>  }
>>  
>>  static inline bool vma_area_is_private(struct vma_area *vma,
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 1379f09..0513823 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -189,6 +189,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>>  	list_for_each_entry(vma, &vma_area_list->h, list) {
>>  		if (!vma_area_is_private(vma, kdat.task_size))
>>  			continue;
>> +		/*
>> +		 * Kernel write to aio ring is not soft-dirty tracked,
>> +		 * so we ignore them at pre-dump.
>> +		 */
>> +		if (vma_entry_is(vma->e, VMA_AREA_AIORING) && pp_ret)
>> +			continue;
>>  		if (vma->e->prot & PROT_READ)
>>  			continue;
>>  
>> @@ -302,6 +308,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
>>  
>>  		if (!vma_area_is_private(vma_area, kdat.task_size))
>>  			continue;
>> +		if (vma_entry_is(vma_area->e, VMA_AREA_AIORING) && pp_ret)
>> +			continue;
>>  
>>  		map = pmc_get_map(&pmc, vma_area);
>>  		if (!map)
>> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
>> index 1df3e71..d82518e 100644
>> --- a/criu/pie/parasite.c
>> +++ b/criu/pie/parasite.c
>> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>>  			return -1;
>>  		}
>>  
>> -		/*
>> -		 * XXX what else can we do if there are requests
>> -		 * in the ring?
>> -		 */
>> -		if (ring->head != ring->tail) {
>> -			pr_err("Pending AIO requests in ring #%d\n", i);
>> -			return -1;
>> -		}
>> +		/* XXX: wait aio completion */
>>  
>>  		args->ring[i].max_reqs = ring->nr;
>>  	}
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index b28e2f8..c655705 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -3,6 +3,7 @@
>>  
>>  #include <linux/securebits.h>
>>  #include <linux/capability.h>
>> +#include <linux/aio_abi.h>
>>  #include <sys/types.h>
>>  #include <sys/mman.h>
>>  #include <sys/stat.h>
>> @@ -16,6 +17,7 @@
>>  #include <signal.h>
>>  
>>  #include "compiler.h"
>> +#include "asm/string.h"
>>  #include "asm/types.h"
>>  #include "syscall.h"
>>  #include "config.h"
>> @@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>>  	return addr;
>>  }
>>  
>> +/*
>> + * This restores aio ring header, content, head and in-kernel position
>> + * of tail. To set tail, we write to /dev/null and use the fact this
>> + * operation is synchronious for the device. Also, we unmap temporary
>> + * anonymous area, used to store content of ring buffer during restore
>> + * and mapped in map_private_vma().
>> + */
>>  static int restore_aio_ring(struct rst_aio_ring *raio)
>>  {
>> +	struct aio_ring *ring = (void *)raio->addr;
>> +	int i, maxr, count, fd, ret;
>> +	unsigned head = ring->head;
>> +	unsigned tail = ring->tail;
> 
> OK, now about some sanity. The data we get here (head, tail and raio->len
> at the very end) come from raw memory dumps, don't they?
> 
> What if there's some trash and we go a screw memory up?

There weren't any checks of raio->len before my patchset, so I followed this way.
Which suggestion do you have to do here?

head and tail are taken by kernel of modulo of nr_events. If they are too big,
then we just submit many extra requests, but this won't corrupt a memory.
Of course, we may compare them with raio->len. Do we have to?

>> +	struct iocb *iocb, **iocbp;
>>  	unsigned long ctx = 0;
>> -	int ret;
>> +	unsigned size;
>> +	char buf[1];
>>  
>>  	ret = sys_io_setup(raio->nr_req, &ctx);
>>  	if (ret < 0) {
>> @@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
>>  		return -1;
>>  	}
>>  
>> -	if (ctx == raio->addr) /* Lucky bastards we are! */
>> -		return 0;
>> +	if (tail == 0 && head == 0)
>> +		goto populate;
> 
> In this case we go to call memcpy. Do we __really__ need to do it?

I think we need this, because it restores memory 1:1 like it was before.
We can't relay on an assumption a userspace works sane. Some tasks may
think they own a ring buffer, if they do not submit a request and if no
in-flight requests.

There is no a error and this will not be a great speed-up, if we skip this.


More information about the CRIU mailing list