[CRIU] [PATCH 3/3] aio: Support two versions of io_setup()
Kirill Tkhai
ktkhai at virtuozzo.com
Tue May 17 11:57:31 PDT 2016
Since kernel commit e1bdd5f27a5b "aio: percpu reqs_available", the
logic of io_setup() has changed: it now multiplies the passed
nr_events by two (previously it added 2). The difference can be
seen clearly by comparing aio_estimate_nr_reqs_v{1,2} introduced
by this patch.
The patch aims to support restore on kernels of both versions.
To do this, we determine the version of the logic in aio_init()
and choose the matching converter function, v1 or v2.
The converter determines the right nr_events that the restorer
should pass to io_setup().
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
criu/aio.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++--
criu/cr-restore.c | 9 ++++++
criu/include/aio.h | 3 +-
3 files changed, 80 insertions(+), 5 deletions(-)
diff --git a/criu/aio.c b/criu/aio.c
index 9bc1115..a0a63d9 100644
--- a/criu/aio.c
+++ b/criu/aio.c
@@ -1,3 +1,4 @@
+#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <stdbool.h>
@@ -10,6 +11,10 @@
#define NR_IOEVENTS_IN_NPAGES(npages) (PAGE_SIZE * npages - sizeof(struct aio_ring)) / sizeof(struct io_event)
+unsigned int (*aio_estimate_nr_reqs)(unsigned int size);
+
+static unsigned int aio_estimate_nr_reqs_v2(unsigned int size);
+
int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
{
int nr = mme->n_aios;
@@ -26,7 +31,13 @@ int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
aio_ring_entry__init(re);
re->id = vma->e->start;
re->ring_len = vma->e->end - vma->e->start;
- re->nr_req = aio_estimate_nr_reqs(re->ring_len);
+ /*
+ * Every AIO is interpreted as v2 from the very beginning.
+ * Currently, there are two AIO types, and AioRingEntry::nr_req
+ * is not used for restore. But it's still saved for backward
+ * compatibility.
+ */
+ re->nr_req = aio_estimate_nr_reqs_v2(re->ring_len);
if (!re->nr_req)
return -1;
mme->aios[nr] = re;
@@ -46,8 +57,32 @@ void free_aios(MmEntry *mme)
xfree(mme->aios);
}
}
+static unsigned int aio_estimate_nr_reqs_v1(unsigned int size)
+{
+ unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE);
-unsigned int aio_estimate_nr_reqs(unsigned int size)
+ if (size & ~PAGE_MASK) {
+ pr_err("Ring size is not aligned\n");
+ return 0;
+ }
+ /*
+ * Kernel did (before e1bdd5f27a5b "aio: percpu reqs_available")
+ *
+ * nr_reqs = max(nr_reqs, nr_cpus * 4)
+ * nr_reqs += 2
+ * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
+ * nr_reqs = (ring - sizeof(head)) / sizeof(req)
+ *
+ * And the k_max_reqs here is the resulting value.
+ *
+ * We need to get the initial nr_reqs that would grow
+ * up back to the k_max_reqs.
+ */
+
+ return (k_max_reqs - 2);
+}
+
+static unsigned int aio_estimate_nr_reqs_v2(unsigned int size)
{
unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE);
@@ -56,7 +91,7 @@ unsigned int aio_estimate_nr_reqs(unsigned int size)
return 0;
}
/*
- * Kernel does
+ * Kernel does (since e1bdd5f27a5b "aio: percpu reqs_available")
*
* nr_reqs = max(nr_reqs, nr_cpus * 4)
* nr_reqs *= 2
@@ -117,3 +152,35 @@ int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
return 0;
}
+
+int aio_init(void)
+{
+ aio_context_t ctx = 0;
+ struct aio_ring *ring;
+ unsigned nr_events;
+ long ret;
+
+ nr_events = NR_IOEVENTS_IN_NPAGES(1);
+ nr_events -= 2;
+
+ ret = syscall(__NR_io_setup, nr_events, &ctx);
+ if (ret < 0) {
+ pr_err("Ring setup failed with %ld\n", ret);
+ return -1;
+ }
+ ring = (void *)ctx;
+ if (ring->nr == NR_IOEVENTS_IN_NPAGES(1)) {
+ aio_estimate_nr_reqs = aio_estimate_nr_reqs_v1;
+ pr_info("io_setup() version#1\n");
+ } else if (ring->nr == NR_IOEVENTS_IN_NPAGES(2)) {
+ aio_estimate_nr_reqs = aio_estimate_nr_reqs_v2;
+ pr_info("io_setup() version#2\n");
+ } else {
+ pr_err("Can't determine io_setup() version\n");
+ ret = -1;
+ }
+
+ syscall(__NR_io_destroy, ctx);
+
+ return ret;
+}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 0f6cdde..829c113 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2315,6 +2315,9 @@ int cr_restore_tasks(void)
if (vdso_init())
goto err;
+ if (aio_init())
+ goto err;
+
if (opts.cpu_cap & (CPU_CAP_INS | CPU_CAP_CPU)) {
if (cpu_validate_cpuinfo())
goto err;
@@ -3191,8 +3194,12 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
goto err_nv;
raio->addr = mm->aios[i]->id;
- raio->nr_req = mm->aios[i]->nr_req;
raio->len = mm->aios[i]->ring_len;
+ raio->nr_req = aio_estimate_nr_reqs(raio->len);
+ if (!raio->nr_req) {
+ pr_err("Empty aio ring\n");
+ goto err_nv;
+ }
}
/*
diff --git a/criu/include/aio.h b/criu/include/aio.h
index c72122c..f445b0e 100644
--- a/criu/include/aio.h
+++ b/criu/include/aio.h
@@ -3,7 +3,8 @@
#include <linux/aio_abi.h>
#include "images/mm.pb-c.h"
-unsigned int aio_estimate_nr_reqs(unsigned int size);
+int aio_init(void);
+unsigned int (*aio_estimate_nr_reqs)(unsigned int size);
int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
void free_aios(MmEntry *mme);
struct parasite_ctl;
More information about the CRIU
mailing list