[CRIU] [PATCH v6 4/5] aio: Restore aio ring content

Pavel Emelyanov xemul at virtuozzo.com
Fri Mar 25 08:37:23 PDT 2016


On 03/25/2016 04:32 PM, Kirill Tkhai wrote:
> 
> 
> On 25.03.2016 14:58, Pavel Emelyanov wrote:
>> On 03/23/2016 06:00 PM, Kirill Tkhai wrote:
>>> 1)Dump/restore mmaped aio ring like any other private vma entry,
>>>   with the only exception we do not predump it.
>>> 2)Create io context, set head and tail using write to /dev/null.
>>> 3)Copy aio ring restored in (1) to created in (2).
>>> 4)Remap (2) to address of (1).
>>>
>>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>>> ---
>>>  criu/arch/ppc64/syscalls/syscall-ppc64.tbl |    1 
>>>  criu/arch/x86/syscalls/syscall_32.tbl      |    1 
>>>  criu/arch/x86/syscalls/syscall_64.tbl      |    1 
>>>  criu/cr-restore.c                          |   10 ++-
>>>  criu/include/syscall-types.h               |    1 
>>>  criu/include/vma.h                         |    5 +
>>>  criu/mem.c                                 |    8 ++
>>>  criu/pie/parasite.c                        |    9 --
>>>  criu/pie/restorer.c                        |  112 +++++++++++++++++++++++-----
>>>  9 files changed, 117 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>>> index 3319379..e71a1ad 100644
>>> --- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>>> +++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>>> @@ -102,4 +102,5 @@ __NR_seccomp		358		sys_seccomp		(unsigned int op, unsigned int flags, const char
>>>  __NR_memfd_create	360		sys_memfd_create	(const char *name, unsigned int flags)
>>>  __NR_io_setup		227		sys_io_setup		(unsigned nr_events, aio_context_t *ctx_idp)
>>>  __NR_io_getevents	229		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
>>> +__NR_io_submit		230		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>>>  __NR_ipc		117		sys_ipc			(unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
>>> diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
>>> index c527122..2b61530 100644
>>> --- a/criu/arch/x86/syscalls/syscall_32.tbl
>>> +++ b/criu/arch/x86/syscalls/syscall_32.tbl
>>> @@ -66,6 +66,7 @@ __NR_set_thread_area	243		sys_set_thread_area	(user_desc_t *info)
>>>  __NR_get_thread_area	244		sys_get_thread_area	(user_desc_t *info)
>>>  __NR_io_setup		245		sys_io_setup		(unsigned nr_reqs, aio_context_t *ctx32p)
>>>  __NR_io_getevents	247		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
>>> +__NR_io_submit		248		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>>>  __NR_exit_group		252		sys_exit_group		(int error_code)
>>>  __NR_set_tid_address	258		sys_set_tid_address	(int *tid_addr)
>>>  __NR_timer_create	259		sys_timer_create	(clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
>>> diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
>>> index 5c32d4c..e01ea8f 100644
>>> --- a/criu/arch/x86/syscalls/syscall_64.tbl
>>> +++ b/criu/arch/x86/syscalls/syscall_64.tbl
>>> @@ -74,6 +74,7 @@ __NR_futex			202		sys_futex		(u32 *uaddr, int op, u32 val, struct timespec *utim
>>>  __NR_set_thread_area		205		sys_set_thread_area	(user_desc_t *info)
>>>  __NR_io_setup			206		sys_io_setup		(unsigned nr_events, aio_context_t *ctx)
>>>  __NR_io_getevents		208		sys_io_getevents	(aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
>>> +__NR_io_submit			209		sys_io_submit		(aio_context_t ctx, long nr, struct iocb **iocbpp)
>>>  __NR_get_thread_area		211		sys_get_thread_area	(user_desc_t *info)
>>>  __NR_set_tid_address		218		sys_set_tid_address	(int *tid_addr)
>>>  __NR_restart_syscall		219		sys_restart_syscall	(void)
>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>> index 0c6a828..a40552e 100644
>>> --- a/criu/cr-restore.c
>>> +++ b/criu/cr-restore.c
>>> @@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>>>  
>>>  	size = vma_entry_len(vma->e);
>>>  	if (paddr == NULL) {
>>> +		int flag = 0;
>>>  		/*
>>>  		 * The respective memory area was NOT found in the parent.
>>>  		 * Map a new one.
>>> @@ -323,9 +324,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>>>  		pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
>>>  			vma->e->start, vma->e->end, vma->e->pgoff);
>>>  
>>> +		/*
>>> +		 * Restore AIO ring buffer content to temporary anonymous area.
>>> +		 * This will be placed in io_setup'ed AIO in restore_aio_ring().
>>> +		 */
>>> +		if (vma_entry_is(vma->e, VMA_AREA_AIORING))
>>> +			flag |= MAP_ANONYMOUS;
>>> +
>>>  		addr = mmap(*tgt_addr, size,
>>>  				vma->e->prot | PROT_WRITE,
>>> -				vma->e->flags | MAP_FIXED,
>>> +				vma->e->flags | MAP_FIXED | flag,
>>>  				vma->e->fd, vma->e->pgoff);
>>>  
>>>  		if (addr == MAP_FAILED) {
>>> diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
>>> index fdfed9a..75fb353 100644
>>> --- a/criu/include/syscall-types.h
>>> +++ b/criu/include/syscall-types.h
>>> @@ -31,6 +31,7 @@ struct rusage;
>>>  struct file_handle;
>>>  struct robust_list_head;
>>>  struct io_event;
>>> +struct iocb;
>>>  struct timespec;
>>>  
>>>  typedef unsigned long aio_context_t;
>>> diff --git a/criu/include/vma.h b/criu/include/vma.h
>>> index 247c5a3..ce4d5f7 100644
>>> --- a/criu/include/vma.h
>>> +++ b/criu/include/vma.h
>>> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>>>  static inline bool vma_entry_is_private(VmaEntry *entry,
>>>  					unsigned long task_size)
>>>  {
>>> -	return vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>>> +	return (vma_entry_is(entry, VMA_AREA_REGULAR)	&&
>>>  		(vma_entry_is(entry, VMA_ANON_PRIVATE)	||
>>>  		 vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
>>> -		 (entry->end <= task_size);
>>> +		 (entry->end <= task_size)) ||
>>> +		vma_entry_is(entry, VMA_AREA_AIORING);
>>>  }
>>>  
>>>  static inline bool vma_area_is_private(struct vma_area *vma,
>>> diff --git a/criu/mem.c b/criu/mem.c
>>> index 1379f09..0513823 100644
>>> --- a/criu/mem.c
>>> +++ b/criu/mem.c
>>> @@ -189,6 +189,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>>>  	list_for_each_entry(vma, &vma_area_list->h, list) {
>>>  		if (!vma_area_is_private(vma, kdat.task_size))
>>>  			continue;
>>> +		/*
>>> +		 * Kernel write to aio ring is not soft-dirty tracked,
>>> +		 * so we ignore them at pre-dump.
>>> +		 */
>>> +		if (vma_entry_is(vma->e, VMA_AREA_AIORING) && pp_ret)
>>> +			continue;
>>>  		if (vma->e->prot & PROT_READ)
>>>  			continue;
>>>  
>>> @@ -302,6 +308,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
>>>  
>>>  		if (!vma_area_is_private(vma_area, kdat.task_size))
>>>  			continue;
>>> +		if (vma_entry_is(vma_area->e, VMA_AREA_AIORING) && pp_ret)
>>> +			continue;
>>>  
>>>  		map = pmc_get_map(&pmc, vma_area);
>>>  		if (!map)
>>> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
>>> index 1df3e71..d82518e 100644
>>> --- a/criu/pie/parasite.c
>>> +++ b/criu/pie/parasite.c
>>> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>>>  			return -1;
>>>  		}
>>>  
>>> -		/*
>>> -		 * XXX what else can we do if there are requests
>>> -		 * in the ring?
>>> -		 */
>>> -		if (ring->head != ring->tail) {
>>> -			pr_err("Pending AIO requests in ring #%d\n", i);
>>> -			return -1;
>>> -		}
>>> +		/* XXX: wait aio completion */
>>>  
>>>  		args->ring[i].max_reqs = ring->nr;
>>>  	}
>>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>>> index b28e2f8..c655705 100644
>>> --- a/criu/pie/restorer.c
>>> +++ b/criu/pie/restorer.c
>>> @@ -3,6 +3,7 @@
>>>  
>>>  #include <linux/securebits.h>
>>>  #include <linux/capability.h>
>>> +#include <linux/aio_abi.h>
>>>  #include <sys/types.h>
>>>  #include <sys/mman.h>
>>>  #include <sys/stat.h>
>>> @@ -16,6 +17,7 @@
>>>  #include <signal.h>
>>>  
>>>  #include "compiler.h"
>>> +#include "asm/string.h"
>>>  #include "asm/types.h"
>>>  #include "syscall.h"
>>>  #include "config.h"
>>> @@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>>>  	return addr;
>>>  }
>>>  
>>> +/*
>>> + * This restores aio ring header, content, head and in-kernel position
>>> + * of tail. To set tail, we write to /dev/null and use the fact this
>>> + * operation is synchronious for the device. Also, we unmap temporary
>>> + * anonymous area, used to store content of ring buffer during restore
>>> + * and mapped in map_private_vma().
>>> + */
>>>  static int restore_aio_ring(struct rst_aio_ring *raio)
>>>  {
>>> +	struct aio_ring *ring = (void *)raio->addr;
>>> +	int i, maxr, count, fd, ret;
>>> +	unsigned head = ring->head;
>>> +	unsigned tail = ring->tail;
>>
>> OK, now about some sanity. The data we get here (head, tail and raio->len
>> at the very end) come from raw memory dumps, don't they?
>>
>> What if there's some trash and we go a screw memory up?
> 
> There weren't any checks of raio->len before my patchset, so I followed this way.
> Which suggestion do you have to do here?
> 
> head and tail are taken by kernel of modulo of nr_events. If they are too big,
> then we just submit many extra requests, but this won't corrupt a memory.
> Of course, we may compare them with raio->len. Do we have to?

Yes, but we used the len purely to mmap() the ring. Be it too big
we'd just bail out with ENOMEM. In your case you memcpy memory contents
based on these values. So yes, send me patch 6/5 with sanity checks
of len, head and tail.

>>> +	struct iocb *iocb, **iocbp;
>>>  	unsigned long ctx = 0;
>>> -	int ret;
>>> +	unsigned size;
>>> +	char buf[1];
>>>  
>>>  	ret = sys_io_setup(raio->nr_req, &ctx);
>>>  	if (ret < 0) {
>>> @@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
>>>  		return -1;
>>>  	}
>>>  
>>> -	if (ctx == raio->addr) /* Lucky bastards we are! */
>>> -		return 0;
>>> +	if (tail == 0 && head == 0)
>>> +		goto populate;
>>
>> In this case we go to call memcpy. Do we __really__ need to do it?
> 
> I think we need this, because it restores memory 1:1 like it was before.
> We can't relay on an assumption a userspace works sane. Some tasks may
> think they own a ring buffer, if they do not submit a request and if no
> in-flight requests.
> 
> There is no a error and this will not be a great speed-up, if we skip this.

OK, let's keep it.

-- Pavel



More information about the CRIU mailing list