[CRIU] [PATCH v2 3/3] aio: Restore aio ring content
Pavel Emelyanov
xemul at virtuozzo.com
Thu Mar 17 06:19:03 PDT 2016
> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
> static inline bool vma_entry_is_private(VmaEntry *entry,
> unsigned long task_size)
> {
> - return vma_entry_is(entry, VMA_AREA_REGULAR) &&
> + return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
> (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
> vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
> - (entry->end <= task_size);
> + (entry->end <= task_size)) ||
> + vma_entry_is(entry, VMA_AREA_AIORING);
I'm not sure this is safe. How would pre-dumps act on rings?
> }
>
> static inline bool vma_area_is_private(struct vma_area *vma,
> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
> index 1df3e71..d82518e 100644
> --- a/criu/pie/parasite.c
> +++ b/criu/pie/parasite.c
> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
> return -1;
> }
>
> - /*
> - * XXX what else can we do if there are requests
> - * in the ring?
> - */
> - if (ring->head != ring->tail) {
> - pr_err("Pending AIO requests in ring #%d\n", i);
> - return -1;
> - }
> + /* XXX: wait aio completion */
>
> args->ring[i].max_reqs = ring->nr;
> }
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index f7bde75..d19f4dc 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -3,6 +3,7 @@
>
> #include <linux/securebits.h>
> #include <linux/capability.h>
> +#include <linux/aio_abi.h>
> #include <sys/types.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> @@ -546,6 +547,120 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
> return addr;
> }
>
> +/*
> + * This restores aio ring header, content, head and in-kernel position
> + * of tail. To set tail, we write to /dev/null and use the fact this
> + * operation is synchronous for the device. Also, we unmap temporary
> + * anonymous area, used to store content of ring buffer during restore
> + * and mapped in map_private_vma().
> + */
> +static int restore_aio_ring(struct rst_aio_ring *raio)
> +{
> + struct aio_ring *ring = (void *)raio->addr;
> + unsigned head = ring->head;
> + unsigned tail = ring->tail;
> + struct iocb *iocb, **iocbp;
> + unsigned long ctx = 0;
> + int i, count, fd, ret;
> + char buf[1];
> +
> + ret = sys_io_setup(raio->nr_req, &ctx);
> + if (ret < 0) {
> + pr_err("Ring setup failed with %d\n", ret);
> + return -1;
> + }
> +
> + if (tail == 0 && head == 0)
> + goto populate;
if (tail == head) ?
> +
> + fd = sys_open("/dev/null", O_WRONLY, 0);
> + if (fd < 0) {
> + pr_err("Can't open /dev/null for aio\n");
> + return -1;
> + }
> +
> + if (tail >= head)
> + count = tail;
count = tail - head ?
> + else
> + count = ring->nr - 1;
count = ring->nr - (tail - head) ?
> +
> + iocb = (void *)sys_mmap(NULL, count * sizeof(struct iocb), PROT_READ|PROT_WRITE,
> + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> + iocbp = (void *)sys_mmap(NULL, count * sizeof(struct iocb *), PROT_READ|PROT_WRITE,
> + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
Use a single mmap for all the memory you need.
> + if (iocb == MAP_FAILED || iocbp == MAP_FAILED) {
> + pr_err("Can't mmap aio tmp buffer\n");
> + return -1;
> + }
> +
> + for (i = 0; i < count; i++) {
> + iocbp[i] = &iocb[i];
> + iocb[i].aio_fildes = fd;
> + iocb[i].aio_buf = (unsigned long)buf;
> + iocb[i].aio_nbytes = 1;
> + iocb[i].aio_lio_opcode = IOCB_CMD_PWRITE;
Only writes? Are there asynchronous reads?
> + }
> +
> + i = count;
> + do {
> + ret = sys_io_submit(ctx, i, iocbp);
> + if (ret < 0) {
> + pr_err("Can't submit %d aio iocbs: ret=%d\n", i, ret);
> + return -1;
> + }
> + i -= ret;
> +
> + if (count - i > head)
> + /*
> + * Though count is less than maximum available reqs, kernel's
> + * get_reqs_available() takes only a number of reqs, which is
> + * aliquot to kioctx::req_batch. So, set head to free a space
> + * for next io_submit().
> + *
> + * Direct set of head is equal to sys_io_getevents() call. See
> + * kernel for the details.
> + */
> + ((struct aio_ring *)ctx)->head = head;
> + } while (i);
> +
> + if (tail < head) {
> + ret = sys_io_submit(ctx, tail + 1, iocbp);
Why?!
> + if (ret != tail + 1) {
> + pr_err("Can't submit %d aio iocbs more, ret=%d\n", tail + 1, ret);
> + return -1;
> + }
> + }
> +
> + sys_munmap(iocb, count * sizeof(struct iocb));
> + sys_munmap(iocbp, count * sizeof(struct iocb *));
> + sys_close(fd);
> +populate:
> + count = raio->len/sizeof(unsigned long);
> + for (i = 0; i < count; i++)
> + ((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
Don't copy the whole rings, we only need the events.
> +
> + /* Unmap temporary anonymous area */
> + sys_munmap(ring, raio->len);
Presumably this is not required, sys_mremap() unmaps the target.
> +
> + /*
> + * If we failed to get the proper nr_req right and
> + * created smaller or larger ring, then this remap
> + * will (should) fail, since AIO rings have immutable
> + * size.
> + *
> + * This is not great, but anyway better than putting
> + * a ring of wrong size into correct place.
> + */
> + ctx = sys_mremap(ctx, raio->len, raio->len,
> + MREMAP_FIXED | MREMAP_MAYMOVE,
> + raio->addr);
> + if (ctx != raio->addr) {
> + pr_err("Ring remap failed with %ld\n", ctx);
> + return -1;
> + }
> + return 0;
> +}
> +
> static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
> {
> int aux, ret;
> @@ -999,56 +1114,9 @@ long __export_restore_task(struct task_restore_args *args)
> * Now when all VMAs are in their places time to set
> * up AIO rings.
> */
> -
> - for (i = 0; i < args->rings_n; i++) {
> - struct rst_aio_ring *raio = &args->rings[i];
> - unsigned long ctx = 0;
> - int ret;
> -
> - ret = sys_io_setup(raio->nr_req, &ctx);
> - if (ret < 0) {
> - pr_err("Ring setup failed with %d\n", ret);
> + for (i = 0; i < args->rings_n; i++)
> + if (restore_aio_ring(&args->rings[i]) < 0)
> goto core_restore_end;
For such huge patching, introduce an intermediate patch that just moves
the needed code into a helper without any changes.
> - }
> -
> - if (ctx == raio->addr) /* Lucky bastards we are! */
> - continue;
> -
> - /*
> - * If we failed to get the proper nr_req right and
> - * created smaller or larger ring, then this remap
> - * will (should) fail, since AIO rings have immutable
> - * size.
> - *
> - * This is not great, but anyway better than putting
> - * a ring of wrong size into correct place.
> - */
> -
> - ctx = sys_mremap(ctx, raio->len, raio->len,
> - MREMAP_FIXED | MREMAP_MAYMOVE,
> - raio->addr);
> - if (ctx != raio->addr) {
> - pr_err("Ring remap failed with %ld\n", ctx);
> - goto core_restore_end;
> - }
> -
> - /*
> - * Now check that kernel not just remapped the
> - * ring into new place, but updated the internal
> - * context state respectively.
> - */
> -
> - ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
> - if (ret != 0) {
> - if (ret < 0)
> - pr_err("Kernel doesn't remap AIO rings\n");
> - else
> - pr_err("AIO context screwed up\n");
> -
> - goto core_restore_end;
> - }
> - }
> -
> /*
> * Finally restore madivse() bits
> */
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
> .
>
More information about the CRIU
mailing list