[CRIU] [PATCH v6 4/5] aio: Restore aio ring content
Pavel Emelyanov
xemul at virtuozzo.com
Fri Mar 25 04:58:12 PDT 2016
On 03/23/2016 06:00 PM, Kirill Tkhai wrote:
> 1)Dump/restore mmaped aio ring like any other private vma entry,
> with the only exception we do not predump it.
> 2)Create io context, set head and tail using write to /dev/null.
> 3)Copy aio ring restored in (1) to created in (2).
> 4)Remap (2) to address of (1).
>
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> criu/arch/ppc64/syscalls/syscall-ppc64.tbl | 1
> criu/arch/x86/syscalls/syscall_32.tbl | 1
> criu/arch/x86/syscalls/syscall_64.tbl | 1
> criu/cr-restore.c | 10 ++-
> criu/include/syscall-types.h | 1
> criu/include/vma.h | 5 +
> criu/mem.c | 8 ++
> criu/pie/parasite.c | 9 --
> criu/pie/restorer.c | 112 +++++++++++++++++++++++-----
> 9 files changed, 117 insertions(+), 31 deletions(-)
>
> diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
> index 3319379..e71a1ad 100644
> --- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
> +++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
> @@ -102,4 +102,5 @@ __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char
> __NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
> __NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
> __NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
> +__NR_io_submit 230 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
> __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
> diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
> index c527122..2b61530 100644
> --- a/criu/arch/x86/syscalls/syscall_32.tbl
> +++ b/criu/arch/x86/syscalls/syscall_32.tbl
> @@ -66,6 +66,7 @@ __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
> __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
> __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
> __NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
> +__NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
> __NR_exit_group 252 sys_exit_group (int error_code)
> __NR_set_tid_address 258 sys_set_tid_address (int *tid_addr)
> __NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
> diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
> index 5c32d4c..e01ea8f 100644
> --- a/criu/arch/x86/syscalls/syscall_64.tbl
> +++ b/criu/arch/x86/syscalls/syscall_64.tbl
> @@ -74,6 +74,7 @@ __NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utim
> __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
> __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
> __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
> +__NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp)
> __NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info)
> __NR_set_tid_address 218 sys_set_tid_address (int *tid_addr)
> __NR_restart_syscall 219 sys_restart_syscall (void)
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 0c6a828..a40552e 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
>
> size = vma_entry_len(vma->e);
> if (paddr == NULL) {
> + int flag = 0;
> /*
> * The respective memory area was NOT found in the parent.
> * Map a new one.
> @@ -323,9 +324,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
> pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
> vma->e->start, vma->e->end, vma->e->pgoff);
>
> + /*
> + * Restore AIO ring buffer content to temporary anonymous area.
> + * This will be placed in io_setup'ed AIO in restore_aio_ring().
> + */
> + if (vma_entry_is(vma->e, VMA_AREA_AIORING))
> + flag |= MAP_ANONYMOUS;
> +
> addr = mmap(*tgt_addr, size,
> vma->e->prot | PROT_WRITE,
> - vma->e->flags | MAP_FIXED,
> + vma->e->flags | MAP_FIXED | flag,
> vma->e->fd, vma->e->pgoff);
>
> if (addr == MAP_FAILED) {
> diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
> index fdfed9a..75fb353 100644
> --- a/criu/include/syscall-types.h
> +++ b/criu/include/syscall-types.h
> @@ -31,6 +31,7 @@ struct rusage;
> struct file_handle;
> struct robust_list_head;
> struct io_event;
> +struct iocb;
> struct timespec;
>
> typedef unsigned long aio_context_t;
> diff --git a/criu/include/vma.h b/criu/include/vma.h
> index 247c5a3..ce4d5f7 100644
> --- a/criu/include/vma.h
> +++ b/criu/include/vma.h
> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
> static inline bool vma_entry_is_private(VmaEntry *entry,
> unsigned long task_size)
> {
> - return vma_entry_is(entry, VMA_AREA_REGULAR) &&
> + return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
> (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
> vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
> - (entry->end <= task_size);
> + (entry->end <= task_size)) ||
> + vma_entry_is(entry, VMA_AREA_AIORING);
> }
>
> static inline bool vma_area_is_private(struct vma_area *vma,
> diff --git a/criu/mem.c b/criu/mem.c
> index 1379f09..0513823 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -189,6 +189,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
> list_for_each_entry(vma, &vma_area_list->h, list) {
> if (!vma_area_is_private(vma, kdat.task_size))
> continue;
> + /*
> + * Kernel write to aio ring is not soft-dirty tracked,
> + * so we ignore them at pre-dump.
> + */
> + if (vma_entry_is(vma->e, VMA_AREA_AIORING) && pp_ret)
> + continue;
> if (vma->e->prot & PROT_READ)
> continue;
>
> @@ -302,6 +308,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
>
> if (!vma_area_is_private(vma_area, kdat.task_size))
> continue;
> + if (vma_entry_is(vma_area->e, VMA_AREA_AIORING) && pp_ret)
> + continue;
>
> map = pmc_get_map(&pmc, vma_area);
> if (!map)
> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
> index 1df3e71..d82518e 100644
> --- a/criu/pie/parasite.c
> +++ b/criu/pie/parasite.c
> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
> return -1;
> }
>
> - /*
> - * XXX what else can we do if there are requests
> - * in the ring?
> - */
> - if (ring->head != ring->tail) {
> - pr_err("Pending AIO requests in ring #%d\n", i);
> - return -1;
> - }
> + /* XXX: wait aio completion */
>
> args->ring[i].max_reqs = ring->nr;
> }
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index b28e2f8..c655705 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -3,6 +3,7 @@
>
> #include <linux/securebits.h>
> #include <linux/capability.h>
> +#include <linux/aio_abi.h>
> #include <sys/types.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> @@ -16,6 +17,7 @@
> #include <signal.h>
>
> #include "compiler.h"
> +#include "asm/string.h"
> #include "asm/types.h"
> #include "syscall.h"
> #include "config.h"
> @@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
> return addr;
> }
>
> +/*
> + * This restores aio ring header, content, head and in-kernel position
> + * of tail. To set tail, we write to /dev/null and use the fact this
> + * operation is synchronious for the device. Also, we unmap temporary
> + * anonymous area, used to store content of ring buffer during restore
> + * and mapped in map_private_vma().
> + */
> static int restore_aio_ring(struct rst_aio_ring *raio)
> {
> + struct aio_ring *ring = (void *)raio->addr;
> + int i, maxr, count, fd, ret;
> + unsigned head = ring->head;
> + unsigned tail = ring->tail;
OK, now about some sanity. The data we get here (head, tail and raio->len
at the very end) come from raw memory dumps, don't they?
What if there's some trash and we go a screw memory up?
> + struct iocb *iocb, **iocbp;
> unsigned long ctx = 0;
> - int ret;
> + unsigned size;
> + char buf[1];
>
> ret = sys_io_setup(raio->nr_req, &ctx);
> if (ret < 0) {
> @@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
> return -1;
> }
>
> - if (ctx == raio->addr) /* Lucky bastards we are! */
> - return 0;
> + if (tail == 0 && head == 0)
> + goto populate;
In this case we go to call memcpy. Do we __really__ need to do it?
-- Pavel
More information about the CRIU
mailing list