[CRIU] [PATCH v2 3/3] aio: Restore aio ring content
Pavel Emelyanov
xemul at virtuozzo.com
Thu Mar 17 07:32:27 PDT 2016
On 03/17/2016 05:26 PM, Kirill Tkhai wrote:
> On 17.03.2016 16:19, Pavel Emelyanov wrote:
>>
>>> @@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
>>> static inline bool vma_entry_is_private(VmaEntry *entry,
>>> unsigned long task_size)
>>> {
>>> - return vma_entry_is(entry, VMA_AREA_REGULAR) &&
>>> + return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
>>> (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
>>> vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
>>> - (entry->end <= task_size);
>>> + (entry->end <= task_size)) ||
>>> + vma_entry_is(entry, VMA_AREA_AIORING);
>>
>> I'm not sure this is safe. How would pre-dumps act on rings?
>
> Could you please explain what kind of problems are possible here?
> I don't see a memory predump.
The vma_entry_is_private() check is too generic. E.g. such vmas are being
soft-dirty-tracked. Do we want the same for AIO rings? I bet we don't :)
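One way out (just a sketch of the idea, the helper name is hypothetical)
would be to leave vma_entry_is_private() as it is and give the rings a
dedicated predicate, so that only the premap path opts in:

	/* Sketch: a dedicated check for AIO rings instead of
	 * piggybacking on vma_entry_is_private(), so that generic
	 * private-vma users (soft-dirty tracking etc.) are unaffected. */
	static inline bool vma_entry_is_aio_ring(VmaEntry *entry)
	{
		return vma_entry_is(entry, VMA_AREA_AIORING);
	}

and then check vma_entry_is_private(entry, task_size) ||
vma_entry_is_aio_ring(entry) only where the ring contents get premapped.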
>>> }
>>>
>>> static inline bool vma_area_is_private(struct vma_area *vma,
>>> diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
>>> index 1df3e71..d82518e 100644
>>> --- a/criu/pie/parasite.c
>>> +++ b/criu/pie/parasite.c
>>> @@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
>>> return -1;
>>> }
>>>
>>> - /*
>>> - * XXX what else can we do if there are requests
>>> - * in the ring?
>>> - */
>>> - if (ring->head != ring->tail) {
>>> - pr_err("Pending AIO requests in ring #%d\n", i);
>>> - return -1;
>>> - }
>>> + /* XXX: wait for aio completions */
>>>
>>> args->ring[i].max_reqs = ring->nr;
>>> }
>>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>>> index f7bde75..d19f4dc 100644
>>> --- a/criu/pie/restorer.c
>>> +++ b/criu/pie/restorer.c
>>> @@ -3,6 +3,7 @@
>>>
>>> #include <linux/securebits.h>
>>> #include <linux/capability.h>
>>> +#include <linux/aio_abi.h>
>>> #include <sys/types.h>
>>> #include <sys/mman.h>
>>> #include <sys/stat.h>
>>> @@ -546,6 +547,120 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
>>> return addr;
>>> }
>>>
>>> +/*
>>> + * This restores the aio ring header, content, head and the
>>> + * in-kernel position of tail. To set tail, we write to /dev/null
>>> + * and use the fact that this operation is synchronous for the
>>> + * device. Also, we unmap the temporary anonymous area that was
>>> + * used to store the ring buffer content during restore and was
>>> + * mapped in map_private_vma().
>>> + */
>>> +static int restore_aio_ring(struct rst_aio_ring *raio)
>>> +{
>>> + struct aio_ring *ring = (void *)raio->addr;
>>> + unsigned head = ring->head;
>>> + unsigned tail = ring->tail;
>>> + struct iocb *iocb, **iocbp;
>>> + unsigned long ctx = 0;
>>> + int i, count, fd, ret;
>>> + char buf[1];
>>> +
>>> + ret = sys_io_setup(raio->nr_req, &ctx);
>>> + if (ret < 0) {
>>> + pr_err("Ring setup failed with %d\n", ret);
>>> + return -1;
>>> + }
>>> +
>>> + if (tail == 0 && head == 0)
>>> + goto populate;
>>
>> if (tail == head) ?
>
> No. If tail is not zero, we have to move it.
> And if tail == 0 while head != 0, the ring has wrapped: it holds
> nr_req-1-head completed events, so tail has to be moved nr_req
> times (the full buffer size). Only then does the kernel's count
> of completed events come out right.
OK, add this as code comment.
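Something along these lines then, based on your explanation (wording is
mine):

	/*
	 * tail == head == 0 requires no moving at all. Note that
	 * tail == 0 with head != 0 is NOT such a case: the ring has
	 * wrapped completely, there are nr_req - 1 - head completed
	 * events in it, and tail has to be advanced nr_req times
	 * (the full buffer size), not tail - head times. Only then
	 * is the kernel's count of completed events set up right.
	 */
	if (tail == 0 && head == 0)
		goto populate;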
>>> +
>>> + fd = sys_open("/dev/null", O_WRONLY, 0);
>>> + if (fd < 0) {
>>> + pr_err("Can't open /dev/null for aio\n");
>>> + return -1;
>>> + }
>>> +
>>> + if (tail >= head)
>>> + count = tail;
>>
>> count = tail - head ?
>
> No. We can't treat tail and head as just a difference, because
> a user task may cache these values. We have to restore them
> exactly as they were before. Moving tail by only tail - head
> would be wrong.
>
>>> + else
>>> + count = ring->nr - 1;
>>
>> count = ring->nr - (tail - head) ?
>
> No. That would be a number greater than ring->nr, and we can't
> submit more reqs than were allocated in io_setup().
OK, so what does the 'count' variable contain then?
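If I read the patch right (my summary, please correct me), 'count' is
the size of the first submit batch rather than the total distance tail
travels:

	/*
	 * tail >= head: tail can go from 0 straight to its saved
	 *               value, so count = tail submits are enough.
	 *
	 * tail <  head: the ring has wrapped, so tail must travel
	 *               nr + tail slots in total, but at most nr - 1
	 *               reqs may be in flight at once. The first
	 *               batch is thus count = nr - 1; after head is
	 *               advanced, a second batch of tail + 1 submits
	 *               finishes the job.
	 */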
>>> +
>>> + iocb = (void *)sys_mmap(NULL, count * sizeof(struct iocb), PROT_READ|PROT_WRITE,
>>> + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>> + iocbp = (void *)sys_mmap(NULL, count * sizeof(struct iocb *), PROT_READ|PROT_WRITE,
>>> + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>
>> Use a single mmap for all the memory you need.
>
> OK
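E.g. something like this (a sketch, with the sizes taken from the patch):

	/* Sketch: one anonymous mapping holds both arrays,
	 * iocbp[] placed right after the iocb[] array. */
	void *mem;

	mem = (void *)sys_mmap(NULL,
			count * (sizeof(struct iocb) + sizeof(struct iocb *)),
			PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		pr_err("Can't mmap aio tmp buffer\n");
		return -1;
	}
	iocb = mem;
	iocbp = mem + count * sizeof(struct iocb);

The two sys_munmap() calls at the end then collapse into a single one of
the combined length.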
>
>>> + if (iocb == MAP_FAILED || iocbp == MAP_FAILED) {
>>> + pr_err("Can't mmap aio tmp buffer\n");
>>> + return -1;
>>> + }
>>> +
>>> + for (i = 0; i < count; i++) {
>>> + iocbp[i] = &iocb[i];
>>> + iocb[i].aio_fildes = fd;
>>> + iocb[i].aio_buf = (unsigned long)buf;
>>> + iocb[i].aio_nbytes = 1;
>>> + iocb[i].aio_lio_opcode = IOCB_CMD_PWRITE;
>>
>> Only writes? Are there asynchronous reads?
>
> No. This is just to move tail and head, so it makes no difference
> which operation is used for that. The statuses are restored below.
I'd use reads then...
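I.e. something like this (a sketch; /dev/zero and O_RDONLY are my
substitution, any always-readable file should do):

	fd = sys_open("/dev/zero", O_RDONLY, 0);
	if (fd < 0) {
		pr_err("Can't open /dev/zero for aio\n");
		return -1;
	}

	for (i = 0; i < count; i++) {
		iocbp[i] = &iocb[i];
		iocb[i].aio_fildes = fd;
		iocb[i].aio_buf = (unsigned long)buf;
		iocb[i].aio_nbytes = 1;
		iocb[i].aio_lio_opcode = IOCB_CMD_PREAD;
	}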
>>> + }
>>> +
>>> + i = count;
>>> + do {
>>> + ret = sys_io_submit(ctx, i, iocbp);
>>> + if (ret < 0) {
>>> + pr_err("Can't submit %d aio iocbs: ret=%d\n", i, ret);
>>> + return -1;
>>> + }
>>> + i -= ret;
>>> +
>>> + if (count - i > head)
>>> + /*
>>> + * Though count is less than the maximum number of available
>>> + * reqs, the kernel's get_reqs_available() hands out reqs only
>>> + * in multiples of kioctx::req_batch. So, set head to free up
>>> + * space for the next io_submit().
>>> + *
>>> + * Setting head directly is equivalent to a sys_io_getevents()
>>> + * call. See the kernel for the details.
>>> + */
>>> + ((struct aio_ring *)ctx)->head = head;
>>> + } while (i);
>>> +
>>> + if (tail < head) {
>>> + ret = sys_io_submit(ctx, tail + 1, iocbp);
>>
>> Why?!
>
> You can't submit more than nr_req at once.
But you do it in a loop above anyway.
> So, we need to advance head first, and this action acts like an
> io_getevents() call. We set head manually because it's faster.
> After head is set, the kernel sees there is available space in
> the buffer, and the second io_submit() works correctly.
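A worked example (numbers are mine): with nr = 8, saved head = 5 and
saved tail = 2, the loop above submits nr - 1 = 7 reqs, moving the
kernel's tail 0 -> 7; setting head = 5 then frees the slots, and the
second io_submit() of tail + 1 = 3 reqs moves tail 7 -> 10 == 2 (mod 8),
i.e. exactly to the saved value.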
>
>>> + if (ret != tail + 1) {
>>> + pr_err("Can't submit %d aio iocbs more, ret=%d\n", tail + 1, ret);
>>> + return -1;
>>> + }
>>> + }
>>> +
>>> + sys_munmap(iocb, count * sizeof(struct iocb));
>>> + sys_munmap(iocbp, count * sizeof(struct iocb *));
>>> + sys_close(fd);
>>> +populate:
>>> + count = raio->len/sizeof(unsigned long);
>>> + for (i = 0; i < count; i++)
>>> + ((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
>>
>> Don't copy the whole rings, we only need the events.
>
> OK. We may skip the aio_ring fields up to tail. But it may still
> be useful to restore them, starting from aio_ring::magic.
I'd say we only need to restore head, tail and events. And do it _explicitly_
rather than with memcpy-s over parts of the structure.
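Roughly like this, then (a sketch; assumes the io_events[] array follows
the ring header, as in CRIU's copy of struct aio_ring):

	/* Sketch: restore only head, tail and the events, explicitly. */
	struct aio_ring *dst = (struct aio_ring *)ctx;

	dst->head = ring->head;
	dst->tail = ring->tail;
	for (i = 0; i < ring->nr; i++)
		dst->io_events[i] = ring->io_events[i];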
>>> +
>>> + /* Unmap temporary anonymous area */
>>> + sys_munmap(ring, raio->len);
>>
>> Presumably this is not required, sys_mremap() unmaps the target.
>
> OK
>
>>> +
>>> + /*
>>> + * If we failed to get the proper nr_req right and
>>> + * created smaller or larger ring, then this remap
>>> + * will (should) fail, since AIO rings has immutable
>>> + * size.
>>> + *
>>> + * This is not great, but anyway better than putting
>>> + * a ring of wrong size into correct place.
>>> + */
>>> + ctx = sys_mremap(ctx, raio->len, raio->len,
>>> + MREMAP_FIXED | MREMAP_MAYMOVE,
>>> + raio->addr);
>>> + if (ctx != raio->addr) {
>>> + pr_err("Ring remap failed with %ld\n", ctx);
>>> + return -1;
>>> + }
>>> + return 0;
>>> +}
>>> +
>>> static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
>>> {
>>> int aux, ret;
>>> @@ -999,56 +1114,9 @@ long __export_restore_task(struct task_restore_args *args)
>>> * Now when all VMAs are in their places time to set
>>> * up AIO rings.
>>> */
>>> -
>>> - for (i = 0; i < args->rings_n; i++) {
>>> - struct rst_aio_ring *raio = &args->rings[i];
>>> - unsigned long ctx = 0;
>>> - int ret;
>>> -
>>> - ret = sys_io_setup(raio->nr_req, &ctx);
>>> - if (ret < 0) {
>>> - pr_err("Ring setup failed with %d\n", ret);
>>> + for (i = 0; i < args->rings_n; i++)
>>> + if (restore_aio_ring(&args->rings[i]) < 0)
>>> goto core_restore_end;
>>
>> For such huge patching introduce the intermediate patch that just moves
>> the needed code into a helper w/o any changes.
>
> OK
>
>>> - }
>>> -
>>> - if (ctx == raio->addr) /* Lucky bastards we are! */
>>> - continue;
>>> -
>>> - /*
>>> - * If we failed to get the proper nr_req right and
>>> - * created smaller or larger ring, then this remap
>>> - * will (should) fail, since AIO rings has immutable
>>> - * size.
>>> - *
>>> - * This is not great, but anyway better than putting
>>> - * a ring of wrong size into correct place.
>>> - */
>>> -
>>> - ctx = sys_mremap(ctx, raio->len, raio->len,
>>> - MREMAP_FIXED | MREMAP_MAYMOVE,
>>> - raio->addr);
>>> - if (ctx != raio->addr) {
>>> - pr_err("Ring remap failed with %ld\n", ctx);
>>> - goto core_restore_end;
>>> - }
>>> -
>>> - /*
>>> - * Now check that kernel not just remapped the
>>> - * ring into new place, but updated the internal
>>> - * context state respectively.
>>> - */
>>> -
>>> - ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
>>> - if (ret != 0) {
>>> - if (ret < 0)
>>> - pr_err("Kernel doesn't remap AIO rings\n");
>>> - else
>>> - pr_err("AIO context screwed up\n");
>>> -
>>> - goto core_restore_end;
>>> - }
>>> - }
>>> -
>>> /*
>>> * Finally restore madivse() bits
>>> */
>>>