[CRIU] [PATCH v2 3/3] aio: Restore aio ring content
Kirill Tkhai
ktkhai at virtuozzo.com
Mon Mar 14 04:49:37 PDT 2016
1) Dump/restore the mmapped aio ring like any other private vma entry.
2) Create an io context, then set head and tail using writes to /dev/null.
3) Copy the aio ring restored in (1) over the one created in (2).
4) Unmap the temporary ring (1).
5) Remap (2) to the address of (1).
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
criu/arch/ppc64/syscalls/syscall-ppc64.tbl | 1
criu/arch/x86/syscalls/syscall_32.tbl | 1
criu/arch/x86/syscalls/syscall_64.tbl | 1
criu/cr-restore.c | 6 +
criu/include/syscall-types.h | 1
criu/include/vma.h | 5 +
criu/pie/parasite.c | 9 --
criu/pie/restorer.c | 166 ++++++++++++++++++++--------
8 files changed, 130 insertions(+), 60 deletions(-)
diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
index 3319379..e71a1ad 100644
--- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
+++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
@@ -102,4 +102,5 @@ __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char
__NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
__NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
__NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
+__NR_io_submit 230 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
index c527122..2b61530 100644
--- a/criu/arch/x86/syscalls/syscall_32.tbl
+++ b/criu/arch/x86/syscalls/syscall_32.tbl
@@ -66,6 +66,7 @@ __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
__NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
__NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
__NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
+__NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
__NR_exit_group 252 sys_exit_group (int error_code)
__NR_set_tid_address 258 sys_set_tid_address (int *tid_addr)
__NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
index 5c32d4c..e01ea8f 100644
--- a/criu/arch/x86/syscalls/syscall_64.tbl
+++ b/criu/arch/x86/syscalls/syscall_64.tbl
@@ -74,6 +74,7 @@ __NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utim
__NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
__NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
__NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
+__NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp)
__NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info)
__NR_set_tid_address 218 sys_set_tid_address (int *tid_addr)
__NR_restart_syscall 219 sys_restart_syscall (void)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 30ddff9..922fa14 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
size = vma_entry_len(vma->e);
if (paddr == NULL) {
+ int flag = 0;
/*
* The respective memory area was NOT found in the parent.
* Map a new one.
@@ -323,9 +324,12 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
vma->e->start, vma->e->end, vma->e->pgoff);
+ if (vma_entry_is(vma->e, VMA_AREA_AIORING))
+ flag |= MAP_ANONYMOUS;
+
addr = mmap(*tgt_addr, size,
vma->e->prot | PROT_WRITE,
- vma->e->flags | MAP_FIXED,
+ vma->e->flags | MAP_FIXED | flag,
vma->e->fd, vma->e->pgoff);
if (addr == MAP_FAILED) {
diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
index b056f6d..5b4e1aa 100644
--- a/criu/include/syscall-types.h
+++ b/criu/include/syscall-types.h
@@ -31,6 +31,7 @@ struct rusage;
struct file_handle;
struct robust_list_head;
struct io_event;
+struct iocb;
struct timespec;
typedef unsigned long aio_context_t;
diff --git a/criu/include/vma.h b/criu/include/vma.h
index 247c5a3..ce4d5f7 100644
--- a/criu/include/vma.h
+++ b/criu/include/vma.h
@@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
static inline bool vma_entry_is_private(VmaEntry *entry,
unsigned long task_size)
{
- return vma_entry_is(entry, VMA_AREA_REGULAR) &&
+ return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
(vma_entry_is(entry, VMA_ANON_PRIVATE) ||
vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
- (entry->end <= task_size);
+ (entry->end <= task_size)) ||
+ vma_entry_is(entry, VMA_AREA_AIORING);
}
static inline bool vma_area_is_private(struct vma_area *vma,
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
index 1df3e71..d82518e 100644
--- a/criu/pie/parasite.c
+++ b/criu/pie/parasite.c
@@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
return -1;
}
- /*
- * XXX what else can we do if there are requests
- * in the ring?
- */
- if (ring->head != ring->tail) {
- pr_err("Pending AIO requests in ring #%d\n", i);
- return -1;
- }
+ /* XXX: wait aio completion */
args->ring[i].max_reqs = ring->nr;
}
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index f7bde75..d19f4dc 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -3,6 +3,7 @@
#include <linux/securebits.h>
#include <linux/capability.h>
+#include <linux/aio_abi.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
@@ -546,6 +547,120 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
return addr;
}
+/*
+ * Recreate one AIO ring at raio->addr. Steps: create a fresh context with
+ * sys_io_setup(); if the dumped head/tail are not both zero, submit dummy
+ * 1-byte writes to /dev/null to move the in-kernel tail (this relies on
+ * the operation being synchronous for that device); copy raio->len bytes
+ * of the dumped ring (header, content, head) over the new ring; unmap the
+ * temporary anonymous area that held the dump (mapped in
+ * map_private_vma()); mremap the new ring to raio->addr.
+ *
+ * Returns 0 on success, -1 on failure. NOTE(review): the failure returns
+ * do not close fd or unmap iocb/iocbp, and leave the new context in place.
+ */
+static int restore_aio_ring(struct rst_aio_ring *raio)
+{
+ struct aio_ring *ring = (void *)raio->addr;
+ unsigned head = ring->head;
+ unsigned tail = ring->tail;
+ struct iocb *iocb, **iocbp;
+ unsigned long ctx = 0;
+ int i, count, fd, ret;
+ char buf[1]; /* source buffer for the dummy writes; never initialized */
+
+ ret = sys_io_setup(raio->nr_req, &ctx);
+ if (ret < 0) {
+ pr_err("Ring setup failed with %d\n", ret);
+ return -1;
+ }
+
+ if (tail == 0 && head == 0)
+ goto populate;
+
+ fd = sys_open("/dev/null", O_WRONLY, 0);
+ if (fd < 0) {
+ pr_err("Can't open /dev/null for aio\n");
+ return -1;
+ }
+
+ if (tail >= head)
+ count = tail;
+ else
+ count = ring->nr - 1;
+
+ iocb = (void *)sys_mmap(NULL, count * sizeof(struct iocb), PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ iocbp = (void *)sys_mmap(NULL, count * sizeof(struct iocb *), PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (iocb == MAP_FAILED || iocbp == MAP_FAILED) {
+ pr_err("Can't mmap aio tmp buffer\n");
+ return -1;
+ }
+
+ for (i = 0; i < count; i++) {
+ iocbp[i] = &iocb[i];
+ iocb[i].aio_fildes = fd;
+ iocb[i].aio_buf = (unsigned long)buf;
+ iocb[i].aio_nbytes = 1;
+ iocb[i].aio_lio_opcode = IOCB_CMD_PWRITE;
+ }
+
+ i = count;
+ do {
+ ret = sys_io_submit(ctx, i, iocbp);
+ if (ret < 0) {
+ pr_err("Can't submit %d aio iocbs: ret=%d\n", i, ret);
+ return -1;
+ }
+ i -= ret;
+
+ if (count - i > head)
+ /*
+ * Though count is less than the maximum available reqs, the
+ * kernel's get_reqs_available() takes reqs only in multiples
+ * of kioctx::req_batch. So set head to free space for the
+ * next io_submit(). Setting head directly is equivalent to a
+ * sys_io_getevents() call; see the kernel for details.
+ */
+ ((struct aio_ring *)ctx)->head = head;
+ } while (i);
+
+ if (tail < head) {
+ ret = sys_io_submit(ctx, tail + 1, iocbp);
+ if (ret != tail + 1) {
+ pr_err("Can't submit %d aio iocbs more, ret=%d\n", tail + 1, ret);
+ return -1;
+ }
+ }
+
+ sys_munmap(iocb, count * sizeof(struct iocb));
+ sys_munmap(iocbp, count * sizeof(struct iocb *));
+ sys_close(fd);
+populate:
+ count = raio->len/sizeof(unsigned long);
+ for (i = 0; i < count; i++)
+ ((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
+
+ /* Unmap the temporary anonymous area that held the dumped ring */
+ sys_munmap(ring, raio->len);
+
+ /*
+ * If nr_req was wrong and the created ring is smaller or
+ * larger, this remap will (should) fail, since AIO rings
+ * have immutable size. This is not great, but still better
+ * than putting a ring of the wrong size into the right place.
+ */
+ ctx = sys_mremap(ctx, raio->len, raio->len,
+ MREMAP_FIXED | MREMAP_MAYMOVE,
+ raio->addr);
+ if (ctx != raio->addr) {
+ pr_err("Ring remap failed with %ld\n", ctx);
+ return -1;
+ }
+ return 0;
+}
+
static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
{
int aux, ret;
@@ -999,56 +1114,9 @@ long __export_restore_task(struct task_restore_args *args)
* Now when all VMAs are in their places time to set
* up AIO rings.
*/
-
- for (i = 0; i < args->rings_n; i++) {
- struct rst_aio_ring *raio = &args->rings[i];
- unsigned long ctx = 0;
- int ret;
-
- ret = sys_io_setup(raio->nr_req, &ctx);
- if (ret < 0) {
- pr_err("Ring setup failed with %d\n", ret);
+ for (i = 0; i < args->rings_n; i++)
+ if (restore_aio_ring(&args->rings[i]) < 0)
goto core_restore_end;
- }
-
- if (ctx == raio->addr) /* Lucky bastards we are! */
- continue;
-
- /*
- * If we failed to get the proper nr_req right and
- * created smaller or larger ring, then this remap
- * will (should) fail, since AIO rings has immutable
- * size.
- *
- * This is not great, but anyway better than putting
- * a ring of wrong size into correct place.
- */
-
- ctx = sys_mremap(ctx, raio->len, raio->len,
- MREMAP_FIXED | MREMAP_MAYMOVE,
- raio->addr);
- if (ctx != raio->addr) {
- pr_err("Ring remap failed with %ld\n", ctx);
- goto core_restore_end;
- }
-
- /*
- * Now check that kernel not just remapped the
- * ring into new place, but updated the internal
- * context state respectively.
- */
-
- ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
- if (ret != 0) {
- if (ret < 0)
- pr_err("Kernel doesn't remap AIO rings\n");
- else
- pr_err("AIO context screwed up\n");
-
- goto core_restore_end;
- }
- }
-
/*
* Finally restore madivse() bits
*/
More information about the CRIU
mailing list