[Devel] [PATCH criu-stable v2] aio: Use VE_AIO_IOC_SET_TAIL and VE_AIO_IOC_WAIT_ACTIVE ioctls

Kirill Tkhai ktkhai at virtuozzo.com
Sat Feb 20 02:24:47 PST 2016


This makes dump wait for in-flight aio requests to complete
and allows the ring buffer tail to be set from userspace.

The latter is needed because the tail is the only AIO ring
parameter we can't otherwise set from userspace. With it, the
ring state can be restored fully.

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 criu/cr-restore.c   |    6 +++++-
 criu/include/aio.h  |   27 ++++++++++++++++++++++++++
 criu/include/vma.h  |    5 +++--
 criu/pie/parasite.c |   52 ++++++++++++++++++++++++---------------------------
 criu/pie/restorer.c |   44 +++++++++++++++++++++++++------------------
 5 files changed, 84 insertions(+), 50 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 67032a3..871f49a 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -316,6 +316,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
 
 	size = vma_entry_len(vma->e);
 	if (paddr == NULL) {
+		int f = 0;
 		/*
 		 * The respective memory area was NOT found in the parent.
 		 * Map a new one.
@@ -323,9 +324,12 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
 		pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
 			vma->e->start, vma->e->end, vma->e->pgoff);
 
+		if (vma_entry_is(vma->e, VMA_AREA_AIORING))
+			f |= MAP_ANONYMOUS;
+
 		addr = mmap(*tgt_addr, size,
 				vma->e->prot | PROT_WRITE,
-				vma->e->flags | MAP_FIXED,
+				vma->e->flags | MAP_FIXED | f,
 				vma->e->fd, vma->e->pgoff);
 
 		if (addr == MAP_FAILED) {
diff --git a/criu/include/aio.h b/criu/include/aio.h
index af6d687..3fead1b 100644
--- a/criu/include/aio.h
+++ b/criu/include/aio.h
@@ -1,5 +1,6 @@
 #ifndef __CR_AIO_H__
 #define __CR_AIO_H__
+#include <linux/ioctl.h>
 #include "images/mm.pb-c.h"
 int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
 void free_aios(MmEntry *mme);
@@ -7,9 +8,35 @@ struct parasite_ctl;
 int parasite_check_aios(struct parasite_ctl *, struct vm_area_list *);
 unsigned long aio_rings_args_size(struct vm_area_list *);
 
+struct aio_ring {
+	unsigned        id;     /* kernel internal index number */
+	unsigned        nr;     /* number of io_events */
+	unsigned        head;   /* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
+	unsigned        tail;
+
+	unsigned        magic;
+	unsigned        compat_features;
+	unsigned        incompat_features;
+	unsigned        header_length;  /* size of aio_ring */
+
+
+	/* struct io_event         io_events[0]; */
+};
+
 struct rst_aio_ring {
 	unsigned long addr;
 	unsigned long len;
 	unsigned int nr_req;
 };
+
+struct ve_ioc_arg
+{
+	unsigned long	ctx_id;
+	unsigned	val;
+};
+
+#define VE_AIO_IOC_SET_TAIL	_IOW('a',  0, struct ve_ioc_arg)
+#define VE_AIO_IOC_WAIT_ACTIVE	_IOW('a',  1, struct ve_ioc_arg)
+
 #endif /* __CR_AIO_H__ */
diff --git a/criu/include/vma.h b/criu/include/vma.h
index 247c5a3..ce4d5f7 100644
--- a/criu/include/vma.h
+++ b/criu/include/vma.h
@@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
 static inline bool vma_entry_is_private(VmaEntry *entry,
 					unsigned long task_size)
 {
-	return vma_entry_is(entry, VMA_AREA_REGULAR)	&&
+	return (vma_entry_is(entry, VMA_AREA_REGULAR)	&&
 		(vma_entry_is(entry, VMA_ANON_PRIVATE)	||
 		 vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
-		 (entry->end <= task_size);
+		 (entry->end <= task_size)) ||
+		vma_entry_is(entry, VMA_AREA_AIORING);
 }
 
 static inline bool vma_area_is_private(struct vma_area *vma,
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
index f946a2b..47b0a26 100644
--- a/criu/pie/parasite.c
+++ b/criu/pie/parasite.c
@@ -16,6 +16,7 @@
 #include "parasite-vdso.h"
 #include "log.h"
 #include "tty.h"
+#include "aio.h"
 
 #include <string.h>
 
@@ -347,22 +348,6 @@ static inline int tty_ioctl(int fd, int cmd, int *arg)
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
 
-struct aio_ring {
-	unsigned        id;     /* kernel internal index number */
-	unsigned        nr;     /* number of io_events */
-	unsigned        head;   /* Written to by userland or under ring_lock
-				 * mutex by aio_read_events_ring(). */
-	unsigned        tail;
-
-	unsigned        magic;
-	unsigned        compat_features;
-	unsigned        incompat_features;
-	unsigned        header_length;  /* size of aio_ring */
-
-
-	/* struct io_event         io_events[0]; */
-};
-
 static int sane_ring(struct aio_ring *ring)
 {
 	return ring->magic == AIO_RING_MAGIC &&
@@ -371,12 +356,19 @@ static int sane_ring(struct aio_ring *ring)
 		ring->header_length == sizeof(struct aio_ring);
 }
 
-static int parasite_check_aios(struct parasite_check_aios_args *args)
+static int parasite_wait_aios(struct parasite_check_aios_args *args)
 {
-	int i;
+	int i, fd, ret;
+
+	fd = sys_open("/proc/self/aio", O_RDONLY, 0);
+	if (fd < 0) {
+		pr_err("Can't open /proc/self/aio file\n");
+		return -1;
+	}
 
 	for (i = 0; i < args->nr_rings; i++) {
 		struct aio_ring *ring;
+		struct ve_ioc_arg ioc;
 
 		ring = (struct aio_ring *)args->ring[i].ctx;
 		if (!sane_ring(ring)) {
@@ -385,22 +377,26 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
 			pr_info(" `- cf    %d\n", ring->compat_features);
 			pr_info(" `- if    %d\n", ring->incompat_features);
 			pr_info(" `- size  %d (%zd)\n", ring->header_length, sizeof(struct aio_ring));
-			return -1;
+			ret = -1;
+			goto out;
 		}
 
-		/*
-		 * XXX what else can we do if there are requests
-		 * in the ring?
-		 */
-		if (ring->head != ring->tail) {
-			pr_err("Pending AIO requests in ring #%d\n", i);
-			return -1;
+		ioc.ctx_id = args->ring[i].ctx;
+		ioc.val = 0;
+
+		ret = sys_ioctl(fd, VE_AIO_IOC_WAIT_ACTIVE, (unsigned long)&ioc);
+		if (ret < 0) {
+			pr_err("Can't wait for active aio reqs: ring #%d\n", i);
+			goto out;
 		}
 
 		args->ring[i].max_reqs = ring->nr;
 	}
 
-	return 0;
+	ret = 0;
+out:
+	sys_close(fd);
+	return ret;
 }
 
 static int parasite_dump_tty(struct parasite_tty_args *args)
@@ -628,7 +624,7 @@ static noinline __used int noinline parasite_daemon(void *args)
 			ret = parasite_dump_tty(args);
 			break;
 		case PARASITE_CMD_CHECK_AIOS:
-			ret = parasite_check_aios(args);
+			ret = parasite_wait_aios(args);
 			break;
 		case PARASITE_CMD_CHECK_VDSO_MARK:
 			ret = parasite_check_vdso_mark(args);
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 5bcae4f..78ca467 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -1029,8 +1029,12 @@ long __export_restore_task(struct task_restore_args *args)
 
 	for (i = 0; i < args->rings_n; i++) {
 		struct rst_aio_ring *raio = &args->rings[i];
+		struct aio_ring *ring = (void *)raio->addr;
+		unsigned tail = ring->tail;
 		unsigned long ctx = 0;
-		int ret;
+		struct ve_ioc_arg arg;
+		int fd, i, size, ret;
+
 
 		ret = sys_io_setup(raio->nr_req, &ctx);
 		if (ret < 0) {
@@ -1038,8 +1042,26 @@ long __export_restore_task(struct task_restore_args *args)
 			goto core_restore_end;
 		}
 
-		if (ctx == raio->addr) /* Lucky bastards we are! */
-			continue;
+		fd = sys_open("/proc/self/aio", O_RDONLY, 0);
+		if (fd < 0) {
+			pr_err("Can't open /proc/self/aio\n");
+			goto core_restore_end;
+		}
+
+		arg.ctx_id = ctx;
+		arg.val = tail;
+
+		ret = sys_ioctl(fd, VE_AIO_IOC_SET_TAIL, (unsigned long)&arg);
+		if (ret != 0) {
+			pr_err("Can't set tail of aio ring %lx, ret=%d\n", raio->addr, ret);
+			goto core_restore_end;
+		}
+		sys_close(fd);
+
+		size = raio->len/sizeof(unsigned long);
+		for (i = 0; i < size; i++)
+			((unsigned long *)ctx)[i] = ((unsigned long *)ring)[i];
+		sys_munmap(ring, raio->len);
 
 		/*
 		 * If we failed to get the proper nr_req right and
@@ -1058,22 +1080,6 @@ long __export_restore_task(struct task_restore_args *args)
 			pr_err("Ring remap failed with %ld\n", ctx);
 			goto core_restore_end;
 		}
-
-		/*
-		 * Now check that kernel not just remapped the
-		 * ring into new place, but updated the internal
-		 * context state respectively.
-		 */
-
-		ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
-		if (ret != 0) {
-			if (ret < 0)
-				pr_err("Kernel doesn't remap AIO rings\n");
-			else
-				pr_err("AIO context screwed up\n");
-
-			goto core_restore_end;
-		}
 	}
 
 	ret = 0;



More information about the Devel mailing list