[CRIU] [PATCH v5 4/5] aio: Restore aio ring content

Kirill Tkhai ktkhai at virtuozzo.com
Wed Mar 23 06:01:42 PDT 2016


1) Dump/restore the mmapped aio ring like any other private vma entry,
   with the only exception that we do not pre-dump it.
2) Create an io context, then set head and tail by submitting writes to /dev/null.
3) Copy the aio ring restored in (1) to the one created in (2).
4) Remap (2) to the address of (1).

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 criu/arch/ppc64/syscalls/syscall-ppc64.tbl |    1 
 criu/arch/x86/syscalls/syscall_32.tbl      |    1 
 criu/arch/x86/syscalls/syscall_64.tbl      |    1 
 criu/cr-restore.c                          |   10 ++-
 criu/include/syscall-types.h               |    1 
 criu/include/vma.h                         |    5 +
 criu/mem.c                                 |    8 ++
 criu/pie/parasite.c                        |    9 --
 criu/pie/restorer.c                        |  112 +++++++++++++++++++++++-----
 9 files changed, 117 insertions(+), 31 deletions(-)

diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
index 3319379..e71a1ad 100644
--- a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
+++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
@@ -102,4 +102,5 @@ __NR_seccomp		358		sys_seccomp		(unsigned int op, unsigned int flags, const char
 __NR_memfd_create	360		sys_memfd_create	(const char *name, unsigned int flags)
 __NR_io_setup		227		sys_io_setup		(unsigned nr_events, aio_context_t *ctx_idp)
 __NR_io_getevents	229		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
+__NR_io_submit		230		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
 __NR_ipc		117		sys_ipc			(unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
index c527122..2b61530 100644
--- a/criu/arch/x86/syscalls/syscall_32.tbl
+++ b/criu/arch/x86/syscalls/syscall_32.tbl
@@ -66,6 +66,7 @@ __NR_set_thread_area	243		sys_set_thread_area	(user_desc_t *info)
 __NR_get_thread_area	244		sys_get_thread_area	(user_desc_t *info)
 __NR_io_setup		245		sys_io_setup		(unsigned nr_reqs, aio_context_t *ctx32p)
 __NR_io_getevents	247		sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
+__NR_io_submit		248		sys_io_submit		(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
 __NR_exit_group		252		sys_exit_group		(int error_code)
 __NR_set_tid_address	258		sys_set_tid_address	(int *tid_addr)
 __NR_timer_create	259		sys_timer_create	(clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
index 5c32d4c..e01ea8f 100644
--- a/criu/arch/x86/syscalls/syscall_64.tbl
+++ b/criu/arch/x86/syscalls/syscall_64.tbl
@@ -74,6 +74,7 @@ __NR_futex			202		sys_futex		(u32 *uaddr, int op, u32 val, struct timespec *utim
 __NR_set_thread_area		205		sys_set_thread_area	(user_desc_t *info)
 __NR_io_setup			206		sys_io_setup		(unsigned nr_events, aio_context_t *ctx)
 __NR_io_getevents		208		sys_io_getevents	(aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
+__NR_io_submit			209		sys_io_submit		(aio_context_t ctx, long nr, struct iocb **iocbpp)
 __NR_get_thread_area		211		sys_get_thread_area	(user_desc_t *info)
 __NR_set_tid_address		218		sys_set_tid_address	(int *tid_addr)
 __NR_restart_syscall		219		sys_restart_syscall	(void)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 0c6a828..a40552e 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
 
 	size = vma_entry_len(vma->e);
 	if (paddr == NULL) {
+		int flag = 0;
 		/*
 		 * The respective memory area was NOT found in the parent.
 		 * Map a new one.
@@ -323,9 +324,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
 		pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
 			vma->e->start, vma->e->end, vma->e->pgoff);
 
+		/*
+		 * Restore AIO ring buffer content to temporary anonymous area.
+		 * This will be placed in io_setup'ed AIO in restore_aio_ring().
+		 */
+		if (vma_entry_is(vma->e, VMA_AREA_AIORING))
+			flag |= MAP_ANONYMOUS;
+
 		addr = mmap(*tgt_addr, size,
 				vma->e->prot | PROT_WRITE,
-				vma->e->flags | MAP_FIXED,
+				vma->e->flags | MAP_FIXED | flag,
 				vma->e->fd, vma->e->pgoff);
 
 		if (addr == MAP_FAILED) {
diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
index fdfed9a..75fb353 100644
--- a/criu/include/syscall-types.h
+++ b/criu/include/syscall-types.h
@@ -31,6 +31,7 @@ struct rusage;
 struct file_handle;
 struct robust_list_head;
 struct io_event;
+struct iocb;
 struct timespec;
 
 typedef unsigned long aio_context_t;
diff --git a/criu/include/vma.h b/criu/include/vma.h
index 247c5a3..ce4d5f7 100644
--- a/criu/include/vma.h
+++ b/criu/include/vma.h
@@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
 static inline bool vma_entry_is_private(VmaEntry *entry,
 					unsigned long task_size)
 {
-	return vma_entry_is(entry, VMA_AREA_REGULAR)	&&
+	return (vma_entry_is(entry, VMA_AREA_REGULAR)	&&
 		(vma_entry_is(entry, VMA_ANON_PRIVATE)	||
 		 vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
-		 (entry->end <= task_size);
+		 (entry->end <= task_size)) ||
+		vma_entry_is(entry, VMA_AREA_AIORING);
 }
 
 static inline bool vma_area_is_private(struct vma_area *vma,
diff --git a/criu/mem.c b/criu/mem.c
index 1379f09..0513823 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -189,6 +189,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
 	list_for_each_entry(vma, &vma_area_list->h, list) {
 		if (!vma_area_is_private(vma, kdat.task_size))
 			continue;
+		/*
+		 * Kernel writes to the aio ring are not soft-dirty tracked,
+		 * so we skip the ring at pre-dump.
+		 */
+		if (vma_entry_is(vma->e, VMA_AREA_AIORING) && pp_ret)
+			continue;
 		if (vma->e->prot & PROT_READ)
 			continue;
 
@@ -302,6 +308,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
 
 		if (!vma_area_is_private(vma_area, kdat.task_size))
 			continue;
+		if (vma_entry_is(vma_area->e, VMA_AREA_AIORING) && pp_ret)
+			continue;
 
 		map = pmc_get_map(&pmc, vma_area);
 		if (!map)
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
index 1df3e71..d82518e 100644
--- a/criu/pie/parasite.c
+++ b/criu/pie/parasite.c
@@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
 			return -1;
 		}
 
-		/*
-		 * XXX what else can we do if there are requests
-		 * in the ring?
-		 */
-		if (ring->head != ring->tail) {
-			pr_err("Pending AIO requests in ring #%d\n", i);
-			return -1;
-		}
+		/* XXX: wait aio completion */
 
 		args->ring[i].max_reqs = ring->nr;
 	}
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index b28e2f8..6e8f15d 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -3,6 +3,7 @@
 
 #include <linux/securebits.h>
 #include <linux/capability.h>
+#include <linux/aio_abi.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
@@ -16,6 +17,7 @@
 #include <signal.h>
 
 #include "compiler.h"
+#include "asm/string.h"
 #include "asm/types.h"
 #include "syscall.h"
 #include "config.h"
@@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
 	return addr;
 }
 
+/*
+ * This restores aio ring header, content, head and in-kernel position
+ * of tail. To set tail, we write to /dev/null and use the fact this
+ * operation is synchronous for the device. Also, we unmap temporary
+ * anonymous area, used to store content of ring buffer during restore
+ * and mapped in map_private_vma().
+ */
 static int restore_aio_ring(struct rst_aio_ring *raio)
 {
+	struct aio_ring *ring = (void *)raio->addr;
+	int i, maxr, count, fd, ret;
+	unsigned head = ring->head;
+	unsigned tail = ring->tail;
+	struct iocb *iocb, **iocbp;
 	unsigned long ctx = 0;
-	int ret;
+	unsigned size;
+	char buf[1];
 
 	ret = sys_io_setup(raio->nr_req, &ctx);
 	if (ret < 0) {
@@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
 		return -1;
 	}
 
-	if (ctx == raio->addr) /* Lucky bastards we are! */
-		return 0;
+	if (tail == 0 && head == 0)
+		goto populate;
+
+	fd = sys_open("/dev/null", O_WRONLY, 0);
+	if (fd < 0) {
+		pr_err("Can't open /dev/null for aio\n");
+		return -1;
+	}
+
+	/*
+	 * If tail < head, we have to do a full turn and then submit
+	 * tail more requests, i.e. ring->nr + tail in total.
+	 * If we do not do the full turn, the in-kernel completed_events
+	 * will be initialized incorrectly.
+	 *
+	 * The maximum number of requests to submit at once is ring->nr-1,
+	 * so we do not allocate more.
+	 */
+	if (tail < head)
+		count = ring->nr + tail;
+	else
+		count = tail;
+	maxr = min_t(unsigned, count, ring->nr-1);
+
+	/*
+	 * Since we are only interested in moving the tail, the requests
+	 * may be arbitrary. We submit count identical requests.
+	 */
+	size = sizeof(struct iocb) + maxr * sizeof(struct iocb *);
+	iocb = (void *)sys_mmap(NULL, size, PROT_READ|PROT_WRITE,
+				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	iocbp = (void *)iocb + sizeof(struct iocb);
+
+	if (iocb == MAP_FAILED) {
+		pr_err("Can't mmap aio tmp buffer\n");
+		return -1;
+	}
+
+	iocb->aio_fildes = fd;
+	iocb->aio_buf = (unsigned long)buf;
+	iocb->aio_nbytes = 1;
+	iocb->aio_lio_opcode = IOCB_CMD_PWRITE; /* Write is nop, read populates buf */
+
+	for (i = 0; i < maxr; i++)
+		iocbp[i] = iocb;
+
+	i = 0;
+	do {
+		ret = sys_io_submit(ctx, count - i, iocbp);
+		if (ret < 0) {
+			pr_err("Can't submit aio iocbs: ret=%d\n", ret);
+			return -1;
+		}
+		i += ret;
+
+		 /*
+		  * We may submit less than requested, because of too big
+		  * count OR behaviour of get_reqs_available(), which
+		  * takes available requests only if their number is
+		  * aliquot to kioctx::req_batch. Free part of buffer
+		  * for next iteration.
+		  *
+		  * Direct set of head is equal to sys_io_getevents() call,
+		  * and faster. See kernel for the details.
+		  */
+		((struct aio_ring *)ctx)->head = i < head ? i : head;
+	} while (i < count);
+
+	sys_munmap(iocb, size);
+	sys_close(fd);
+
+populate:
+	count = raio->len - offsetof(struct aio_ring, io_events);
+	builtin_memcpy((void *)ctx, ring, count);
 
 	/*
 	 * If we failed to get the proper nr_req right and
@@ -567,6 +654,8 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
 	 *
 	 * This is not great, but anyway better than putting
 	 * a ring of wrong size into correct place.
+	 *
+	 * Also, this unmaps temporary anonymous area on raio->addr.
 	 */
 
 	ctx = sys_mremap(ctx, raio->len, raio->len,
@@ -576,23 +665,6 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
 		pr_err("Ring remap failed with %ld\n", ctx);
 		return -1;
 	}
-
-	/*
-	 * Now check that kernel not just remapped the
-	 * ring into new place, but updated the internal
-	 * context state respectively.
-	 */
-
-	ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
-	if (ret != 0) {
-		if (ret < 0)
-			pr_err("Kernel doesn't remap AIO rings\n");
-		else
-			pr_err("AIO context screwed up\n");
-
-		return -1;
-	}
-
 	return 0;
 }
 



More information about the CRIU mailing list