[CRIU] [PATCH v2 4/4] aio: Support two versions of io_setup()

Kirill Tkhai ktkhai at virtuozzo.com
Thu May 19 06:13:02 PDT 2016


Since kernel commit e1bdd5f27a5b ("aio: percpu reqs_available"),
the logic of io_setup() has changed: it now doubles the passed
nr_events. The difference can be clearly seen by comparing the
aio_estimate_nr_reqs_v{1,2} functions introduced by this patch.

The patch aims to support restore on kernels of both versions.
To do this, aio_init() determines which version of the logic the
running kernel uses and selects the matching converter function,
v1 or v2. The converter computes the right nr_events value that
the restorer should pass to io_setup().

v2: Take into account the number of possible CPUs

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 criu/aio.c         |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 criu/cr-restore.c  |    9 +++++
 criu/include/aio.h |    3 +-
 3 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/criu/aio.c b/criu/aio.c
index 4d2fbeb..20b8a5d 100644
--- a/criu/aio.c
+++ b/criu/aio.c
@@ -1,15 +1,21 @@
+#include <sys/syscall.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include "vma.h"
 #include "xmalloc.h"
 #include "aio.h"
+#include "kerndat.h"
 #include "parasite.h"
 #include "parasite-syscall.h"
 #include "images/mm.pb-c.h"
 
 #define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * npages - sizeof(struct aio_ring)) / sizeof(struct io_event))
 
+unsigned int (*aio_estimate_nr_reqs)(unsigned int size);
+
+static unsigned int aio_estimate_nr_reqs_v2(unsigned int size);
+
 int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
 {
 	int nr = mme->n_aios;
@@ -26,7 +32,13 @@ int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
 	aio_ring_entry__init(re);
 	re->id = vma->e->start;
 	re->ring_len = vma->e->end - vma->e->start;
-	re->nr_req = aio_estimate_nr_reqs(re->ring_len);
+	/*
+	 * Every AIO is interpreted as v2 from the very beginning.
+	 * Currently, there are two AIO types, and AioRingEntry::nr_req
+	 * is not used for restore. But it's still saved for backward
+	 * compatibility.
+	 */
+	re->nr_req = aio_estimate_nr_reqs_v2(re->ring_len);
 	if (!re->nr_req)
 		return -1;
 	mme->aios[nr] = re;
@@ -46,8 +58,32 @@ void free_aios(MmEntry *mme)
 		xfree(mme->aios);
 	}
 }
+static unsigned int aio_estimate_nr_reqs_v1(unsigned int size)
+{
+	unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE);
+
+	if (size & ~PAGE_MASK) {
+		pr_err("Ring size is not aligned\n");
+		return 0;
+	}
+	/*
+	 * Kernel did (before e1bdd5f27a5b "aio: percpu reqs_available")
+	 *
+	 * nr_reqs = max(nr_reqs, nr_cpus * 4)
+	 * nr_reqs += 2
+	 * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
+	 * nr_reqs = (ring - sizeof(head)) / sizeof(req)
+	 *
+	 * And the k_max_reqs here is the resulting value.
+	 *
+	 * We need to get the initial nr_reqs that would grow
+	 * up back to the k_max_reqs.
+	 */
+
+	return (k_max_reqs - 2);
+}
 
-unsigned int aio_estimate_nr_reqs(unsigned int size)
+static unsigned int aio_estimate_nr_reqs_v2(unsigned int size)
 {
 	unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE);
 
@@ -56,7 +92,7 @@ unsigned int aio_estimate_nr_reqs(unsigned int size)
 		return 0;
 	}
 	/*
-	 * Kernel does
+	 * Kernel does (since e1bdd5f27a5b "aio: percpu reqs_available")
 	 *
 	 * nr_reqs = max(nr_reqs, nr_cpus * 4)
 	 * nr_reqs *= 2
@@ -117,3 +153,49 @@ int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
 
 	return 0;
 }
+
+static unsigned nr_events_page_up(unsigned nr)
+{
+	unsigned long size;
+	size = sizeof(struct aio_ring) + sizeof(struct io_event) * nr;
+	size = ALIGN(size, PAGE_SIZE);
+
+	return (unsigned)((size - sizeof(struct aio_ring)) / sizeof(struct io_event));
+}
+
+int aio_init(void)
+{
+	unsigned nr_events, nr_v1, nr_v2;
+	aio_context_t ctx = 0;
+	struct aio_ring *ring;
+	long ret;
+
+	/* Choose nr_events in the way, v1 is near the page border, so v2 > v1 */
+	nr_events = nr_events_page_up(4 * kdat.nr_cpus_possible);
+	nr_events -= 2;
+
+	nr_v1 = nr_events_page_up(nr_events + 2);
+	nr_v2 = nr_events_page_up(nr_events * 2 + 2);
+
+	ret = syscall(__NR_io_setup, nr_events, &ctx);
+	if (ret < 0) {
+		pr_err("Ring setup failed with %ld\n", ret);
+		return -1;
+	}
+	ring = (void *)ctx;
+	if (ring->nr == nr_v1) {
+		aio_estimate_nr_reqs = aio_estimate_nr_reqs_v1;
+		pr_info("io_setup() version#1\n");
+	} else if (ring->nr == nr_v2) {
+		aio_estimate_nr_reqs = aio_estimate_nr_reqs_v2;
+		pr_info("io_setup() version#2\n");
+	} else {
+		pr_err("Can't determine io_setup() version: nr=%u, cpus=%u\n",
+			ring->nr, kdat.nr_cpus_possible);
+		ret = -1;
+	}
+
+	syscall(__NR_io_destroy, ctx);
+
+	return ret;
+}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 0f6cdde..829c113 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2315,6 +2315,9 @@ int cr_restore_tasks(void)
 	if (vdso_init())
 		goto err;
 
+	if (aio_init())
+		goto err;
+
 	if (opts.cpu_cap & (CPU_CAP_INS | CPU_CAP_CPU)) {
 		if (cpu_validate_cpuinfo())
 			goto err;
@@ -3191,8 +3194,12 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 			goto err_nv;
 
 		raio->addr = mm->aios[i]->id;
-		raio->nr_req = mm->aios[i]->nr_req;
 		raio->len = mm->aios[i]->ring_len;
+		raio->nr_req = aio_estimate_nr_reqs(raio->len);
+		if (!raio->nr_req) {
+			pr_err("Empty aio ring\n");
+			goto err_nv;
+		}
 	}
 
 	/*
diff --git a/criu/include/aio.h b/criu/include/aio.h
index c72122c..f445b0e 100644
--- a/criu/include/aio.h
+++ b/criu/include/aio.h
@@ -3,7 +3,8 @@
 
 #include <linux/aio_abi.h>
 #include "images/mm.pb-c.h"
-unsigned int aio_estimate_nr_reqs(unsigned int size);
+int aio_init(void);
+unsigned int (*aio_estimate_nr_reqs)(unsigned int size);
 int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
 void free_aios(MmEntry *mme);
 struct parasite_ctl;



More information about the CRIU mailing list