[CRIU] [PATCH v6 4/5] aio: Restore aio ring content
Pavel Emelyanov
xemul at virtuozzo.com
Fri Mar 25 08:37:23 PDT 2016
On 03/25/2016 04:32 PM, Kirill Tkhai wrote:
>
>
> On 25.03.2016 14:58, Pavel Emelyanov wrote:
>> On 03/23/2016 06:00 PM, Kirill Tkhai wrote:
>>> 1)Dump/restore mmaped aio ring like any other private vma entry,
>>> with the only exception we do not predump it.
>>> 2)Create io context, set head and tail using write to /dev/null.
>>> 3)Copy aio ring restored in (1) to created in (2).
>>> 4)Remap (2) to address of (1).
>>>
>>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>>> ---
>>> criu/arch/ppc64/syscalls/syscall-ppc64.tbl | 1
>>> criu/arch/x86/syscalls/syscall_32.tbl | 1
>>> criu/arch/x86/syscalls/syscall_64.tbl | 1
>>> criu/cr-restore.c | 10 ++-
>>> criu/include/syscall-types.h | 1
>>> criu/include/vma.h | 5 +
>>> criu/mem.c | 8 ++
>>> criu/pie/parasite.c | 9 --
>>> criu/pie/restorer.c | 112 +++++++++++++++++++++++-----
>>> 9 files changed, 117 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>>> index 3319379..e71a1ad 100644
>>> --- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>>> +++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
>>> @@ -102,4 +102,5 @@ __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char
>>> __NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
>>> __NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
>>> __NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
>>> +__NR_io_submit 230 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>>> __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
>>> diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
>>> index c527122..2b61530 100644
>>> --- a/criu/arch/x86/syscalls/syscall_32.tbl
>>> +++ b/criu/arch/x86/syscalls/syscall_32.tbl
>>> @@ -66,6 +66,7 @@ __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
>>> __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
>>> __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
>>> __NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
>>> +__NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
>>> __NR_exit_group 252 sys_exit_group (int error_code)
>>> __NR_set_tid_address 258 sys_set_tid_address (int *tid_addr)
>>> __NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
>>> diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
>>> index 5c32d4c..e01ea8f 100644
>>> --- a/criu/arch/x86/syscalls/syscall_64.tbl
>>> +++ b/criu/arch/x86/syscalls/syscall_64.tbl
>>> @@ -74,6 +74,7 @@ __NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utim
>>> __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
>>> __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
>>> __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
>>> +__NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp)
>>> __NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info)
>>> __NR_set_tid_address 218 sys_set_tid_address (int *tid_addr)
>>> __NR_restart_syscall 219 sys_restart_syscall (void)
>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>> index 0c6a828..a40552e 100644
>>> --- a/criu/cr-restore.c
>>> +++ b/criu/cr-restore.c
>>> @@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>>>
>>> size = vma_entry_len(vma->e);
>>> if (paddr == NULL) {
>>> + int flag = 0;
>>> /*
>>> * The respective memory area was NOT found in the parent.
>>> * Map a new one.
>>> @@ -323,9 +324,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>>> pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
>>> vma->e->start, vma->e->end, vma->e->pgoff);
>>>
>>> + /*
>>> + * Restore AIO ring buffer content to temporary anonymous area.
>>> + * This will be placed in io_setup'ed AIO in restore_aio_ring().
>>> + */
>>> + if (vma_entry_is(vma->e, VMA_AREA_AIORING))
>>> + flag |= MAP_ANONYMOUS;
>>> +
>>> addr = mmap(*tgt_addr, size,
>>> vma->e->prot | PROT_WRITE,
>>> - vma->e->flags | MAP_FIXED,
>>> + vma->e->flags | MAP_FIXED | flag,
>>> vma->e->fd, vma->e->pgoff);
>>>
>>> if (addr == MAP_FAILED) {
>>> diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
>>> index fdfed9a..75fb353 100644
>>> --- a/criu/include/syscall-types.h
>>> +++ b/criu/include/syscall-types.h
>>> @@ -31,6 +31,7 @@ struct rusage;
>>> struct file_handle;
>>> struct robust_list_head;
>>> struct io_event;
>>> +struct iocb;
>>> struct timespec;
>>>
>>> typedef unsigned long aio_context_t;
>>> diff --git a/criu/include/vma.h b/criu/include/vma.h
>>> index 247c5a3..ce4d5f7 100644
>>> --- a/criu/include/vma.h
>>> +++ b/criu/include/vma.h
>>> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>>> static inline bool vma_entry_is_private(VmaEntry *entry,
>>> unsigned long task_size)
>>> {
>>> - return vma_entry_is(entry, VMA_AREA_REGULAR) &&
>>> + return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
>>> (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
>>> vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
>>> - (entry->end <= task_size);
>>> + (entry->end <= task_size)) ||
>>> + vma_entry_is(entry, VMA_AREA_AIORING);
>>> }
>>>
>>> static inline bool vma_area_is_private(struct vma_area *vma,
>>> diff --git a/criu/mem.c b/criu/mem.c
>>> index 1379f09..0513823 100644
>>> --- a/criu/mem.c
>>> +++ b/criu/mem.c
>>> @@ -189,6 +189,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>>> list_for_each_entry(vma, &vma_area_list->h, list) {
>>> if (!vma_area_is_private(vma, kdat.task_size))
>>> continue;
>>> + /*
>>> + * Kernel write to aio ring is not soft-dirty tracked,
>>> + * so we ignore them at pre-dump.
>>> + */
>>> + if (vma_entry_is(vma->e, VMA_AREA_AIORING) && pp_ret)
>>> + continue;
>>> if (vma->e->prot & PROT_READ)
>>> continue;
>>>
>>> @@ -302,6 +308,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
>>>
>>> if (!vma_area_is_private(vma_area, kdat.task_size))
>>> continue;
>>> + if (vma_entry_is(vma_area->e, VMA_AREA_AIORING) && pp_ret)
>>> + continue;
>>>
>>> map = pmc_get_map(&pmc, vma_area);
>>> if (!map)
>>> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
>>> index 1df3e71..d82518e 100644
>>> --- a/criu/pie/parasite.c
>>> +++ b/criu/pie/parasite.c
>>> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>>> return -1;
>>> }
>>>
>>> - /*
>>> - * XXX what else can we do if there are requests
>>> - * in the ring?
>>> - */
>>> - if (ring->head != ring->tail) {
>>> - pr_err("Pending AIO requests in ring #%d\n", i);
>>> - return -1;
>>> - }
>>> + /* XXX: wait aio completion */
>>>
>>> args->ring[i].max_reqs = ring->nr;
>>> }
>>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>>> index b28e2f8..c655705 100644
>>> --- a/criu/pie/restorer.c
>>> +++ b/criu/pie/restorer.c
>>> @@ -3,6 +3,7 @@
>>>
>>> #include <linux/securebits.h>
>>> #include <linux/capability.h>
>>> +#include <linux/aio_abi.h>
>>> #include <sys/types.h>
>>> #include <sys/mman.h>
>>> #include <sys/stat.h>
>>> @@ -16,6 +17,7 @@
>>> #include <signal.h>
>>>
>>> #include "compiler.h"
>>> +#include "asm/string.h"
>>> #include "asm/types.h"
>>> #include "syscall.h"
>>> #include "config.h"
>>> @@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>>> return addr;
>>> }
>>>
>>> +/*
>>> + * This restores aio ring header, content, head and in-kernel position
>>> + * of tail. To set tail, we write to /dev/null and use the fact this
>>> + * operation is synchronious for the device. Also, we unmap temporary
>>> + * anonymous area, used to store content of ring buffer during restore
>>> + * and mapped in map_private_vma().
>>> + */
>>> static int restore_aio_ring(struct rst_aio_ring *raio)
>>> {
>>> + struct aio_ring *ring = (void *)raio->addr;
>>> + int i, maxr, count, fd, ret;
>>> + unsigned head = ring->head;
>>> + unsigned tail = ring->tail;
>>
>> OK, now about some sanity. The data we get here (head, tail and raio->len
>> at the very end) come from raw memory dumps, don't they?
>>
>> What if there's some trash and we go a screw memory up?
>
> There weren't any checks of raio->len before my patchset, so I followed this way.
> Which suggestion do you have to do here?
>
> head and tail are taken by kernel of modulo of nr_events. If they are too big,
> then we just submit many extra requests, but this won't corrupt a memory.
> Of course, we may compare them with raio->len. Do we have to?
Yes, but we used the len purely to mmap() the ring. Be it too big
we'd just bail out with ENOMEM. In your case you memcpy memory contents
based on these values. So yes, send me patch 6/5 with sanity checks
of len, head and tail.
>>> + struct iocb *iocb, **iocbp;
>>> unsigned long ctx = 0;
>>> - int ret;
>>> + unsigned size;
>>> + char buf[1];
>>>
>>> ret = sys_io_setup(raio->nr_req, &ctx);
>>> if (ret < 0) {
>>> @@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
>>> return -1;
>>> }
>>>
>>> - if (ctx == raio->addr) /* Lucky bastards we are! */
>>> - return 0;
>>> + if (tail == 0 && head == 0)
>>> + goto populate;
>>
>> In this case we go to call memcpy. Do we __really__ need to do it?
>
> I think we need this, because it restores memory 1:1 like it was before.
> We can't relay on an assumption a userspace works sane. Some tasks may
> think they own a ring buffer, if they do not submit a request and if no
> in-flight requests.
>
> There is no a error and this will not be a great speed-up, if we skip this.
OK, let's keep it.
-- Pavel
More information about the CRIU
mailing list