[CRIU] [PATCH v2 4/4] aio: Support two versions of io_setup()
Kirill Tkhai
ktkhai at virtuozzo.com
Thu May 19 06:13:02 PDT 2016
Since kernel commit e1bdd5f27a5b ("aio: percpu reqs_available")
the logic of io_setup() has changed: it now doubles the passed
nr_events. The difference is clearly visible if you compare the
aio_estimate_nr_reqs_v{1,2} functions introduced by this
patch.
The patch aims to support restore on kernels of both versions.
To do this, aio_init() determines which version of the logic the
running kernel uses and chooses the matching converter function,
v1 or v2. The converter determines the right nr_events that the
restorer should pass to io_setup().
v2: Take into account the number of possible CPUs
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
criu/aio.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--
criu/cr-restore.c | 9 +++++
criu/include/aio.h | 3 +-
3 files changed, 95 insertions(+), 5 deletions(-)
diff --git a/criu/aio.c b/criu/aio.c
index 4d2fbeb..20b8a5d 100644
--- a/criu/aio.c
+++ b/criu/aio.c
@@ -1,15 +1,21 @@
+#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <stdbool.h>
#include "vma.h"
#include "xmalloc.h"
#include "aio.h"
+#include "kerndat.h"
#include "parasite.h"
#include "parasite-syscall.h"
#include "images/mm.pb-c.h"
#define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * npages - sizeof(struct aio_ring)) / sizeof(struct io_event))
+unsigned int (*aio_estimate_nr_reqs)(unsigned int size);
+
+static unsigned int aio_estimate_nr_reqs_v2(unsigned int size);
+
int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
{
int nr = mme->n_aios;
@@ -26,7 +32,13 @@ int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
aio_ring_entry__init(re);
re->id = vma->e->start;
re->ring_len = vma->e->end - vma->e->start;
- re->nr_req = aio_estimate_nr_reqs(re->ring_len);
+ /*
+ * Every AIO is interpreted as v2 from the very beginning.
+ * Currently, there are two AIO types, and AioRingEntry::nr_req
+ * is not used for restore. But it's still saved for backward
+ * compatibility.
+ */
+ re->nr_req = aio_estimate_nr_reqs_v2(re->ring_len);
if (!re->nr_req)
return -1;
mme->aios[nr] = re;
@@ -46,8 +58,32 @@ void free_aios(MmEntry *mme)
xfree(mme->aios);
}
}
+static unsigned int aio_estimate_nr_reqs_v1(unsigned int size)
+{
+ unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE);
+
+ if (size & ~PAGE_MASK) {
+ pr_err("Ring size is not aligned\n");
+ return 0;
+ }
+ /*
+ * Kernel did (before e1bdd5f27a5b "aio: percpu reqs_available")
+ *
+ * nr_reqs = max(nr_reqs, nr_cpus * 4)
+ * nr_reqs += 2
+ * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
+ * nr_reqs = (ring - sizeof(head)) / sizeof(req)
+ *
+ * And the k_max_reqs here is the resulting value.
+ *
+ * We need to get the initial nr_reqs that would grow
+ * up back to the k_max_reqs.
+ */
+
+ return (k_max_reqs - 2);
+}
-unsigned int aio_estimate_nr_reqs(unsigned int size)
+static unsigned int aio_estimate_nr_reqs_v2(unsigned int size)
{
unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE);
@@ -56,7 +92,7 @@ unsigned int aio_estimate_nr_reqs(unsigned int size)
return 0;
}
/*
- * Kernel does
+ * Kernel does (since e1bdd5f27a5b "aio: percpu reqs_available")
*
* nr_reqs = max(nr_reqs, nr_cpus * 4)
* nr_reqs *= 2
@@ -117,3 +153,49 @@ int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
return 0;
}
+
+static unsigned nr_events_page_up(unsigned nr)
+{
+ unsigned long size;
+ size = sizeof(struct aio_ring) + sizeof(struct io_event) * nr;
+ size = ALIGN(size, PAGE_SIZE);
+
+ return (unsigned)((size - sizeof(struct aio_ring)) / sizeof(struct io_event));
+}
+
+int aio_init(void)
+{
+ unsigned nr_events, nr_v1, nr_v2;
+ aio_context_t ctx = 0;
+ struct aio_ring *ring;
+ long ret;
+
+ /* Choose nr_events in the way, v1 is near the page border, so v2 > v1 */
+ nr_events = nr_events_page_up(4 * kdat.nr_cpus_possible);
+ nr_events -= 2;
+
+ nr_v1 = nr_events_page_up(nr_events + 2);
+ nr_v2 = nr_events_page_up(nr_events * 2 + 2);
+
+ ret = syscall(__NR_io_setup, nr_events, &ctx);
+ if (ret < 0) {
+ pr_err("Ring setup failed with %ld\n", ret);
+ return -1;
+ }
+ ring = (void *)ctx;
+ if (ring->nr == nr_v1) {
+ aio_estimate_nr_reqs = aio_estimate_nr_reqs_v1;
+ pr_info("io_setup() version#1\n");
+ } else if (ring->nr == nr_v2) {
+ aio_estimate_nr_reqs = aio_estimate_nr_reqs_v2;
+ pr_info("io_setup() version#2\n");
+ } else {
+ pr_err("Can't determine io_setup() version: nr=%u, cpus=%u\n",
+ ring->nr, kdat.nr_cpus_possible);
+ ret = -1;
+ }
+
+ syscall(__NR_io_destroy, ctx);
+
+ return ret;
+}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 0f6cdde..829c113 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2315,6 +2315,9 @@ int cr_restore_tasks(void)
if (vdso_init())
goto err;
+ if (aio_init())
+ goto err;
+
if (opts.cpu_cap & (CPU_CAP_INS | CPU_CAP_CPU)) {
if (cpu_validate_cpuinfo())
goto err;
@@ -3191,8 +3194,12 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
goto err_nv;
raio->addr = mm->aios[i]->id;
- raio->nr_req = mm->aios[i]->nr_req;
raio->len = mm->aios[i]->ring_len;
+ raio->nr_req = aio_estimate_nr_reqs(raio->len);
+ if (!raio->nr_req) {
+ pr_err("Empty aio ring\n");
+ goto err_nv;
+ }
}
/*
diff --git a/criu/include/aio.h b/criu/include/aio.h
index c72122c..f445b0e 100644
--- a/criu/include/aio.h
+++ b/criu/include/aio.h
@@ -3,7 +3,8 @@
#include <linux/aio_abi.h>
#include "images/mm.pb-c.h"
-unsigned int aio_estimate_nr_reqs(unsigned int size);
+int aio_init(void);
+unsigned int (*aio_estimate_nr_reqs)(unsigned int size);
int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
void free_aios(MmEntry *mme);
struct parasite_ctl;
More information about the CRIU
mailing list