[CRIU] [PATCH 6/8] aio: Dump AIO rings

Pavel Emelyanov xemul at parallels.com
Fri Oct 10 12:02:38 PDT 2014


When an AIO context is set up, the kernel does two things:

1. creates an in-kernel aioctx object
2. maps a ring into process memory

The second of these gives us all the information we need
about how the AIO context was set up. So, in order to dump
one, we need to find the ring in memory and read all the
information we need from it.

One thing to note -- we cannot dump tasks that have
any AIO requests pending. So we also need to go into
the parasite and check that each ring is empty.

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
---
 Makefile.crtools   |   1 +
 aio.c              | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 cr-dump.c          |  14 +++++++
 include/aio.h      |   9 ++++
 include/image.h    |   1 +
 include/parasite.h |  12 ++++++
 include/vma.h      |   4 ++
 parasite-syscall.c |   2 +
 pie/parasite.c     |  70 +++++++++++++++++++++++++++++++
 proc_parse.c       |  10 ++++-
 protobuf/mm.proto  |   7 ++++
 11 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 aio.c
 create mode 100644 include/aio.h

diff --git a/Makefile.crtools b/Makefile.crtools
index 7f21d25..73ffe0d 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -61,6 +61,7 @@ obj-y	+= kerndat.o
 obj-y	+= stats.o
 obj-y	+= cgroup.o
 obj-y	+= timerfd.o
+obj-y	+= aio.o
 obj-y	+= string.o
 obj-y	+= sigframe.o
 ifeq ($(VDSO),y)
diff --git a/aio.c b/aio.c
new file mode 100644
index 0000000..86cb0c6
--- /dev/null
+++ b/aio.c
@@ -0,0 +1,120 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "vma.h"
+#include "xmalloc.h"
+#include "aio.h"
+#include "parasite.h"
+#include "parasite-syscall.h"
+#include "protobuf/mm.pb-c.h"
+
+int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
+{
+	int nr = mme->n_aios;
+	AioRingEntry *re;
+
+	pr_info("Dumping AIO ring @%#lx, %u reqs\n",
+			vma->e->start, vma->aio_nr_req);
+
+	mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re));
+	if (!mme->aios)
+		return -1;
+
+	re = xmalloc(sizeof(*re));
+	if (!re)
+		return -1;
+
+	aio_ring_entry__init(re);
+	re->id = vma->e->start;
+	re->nr_req = vma->aio_nr_req;
+	re->ring_len = vma->e->end - vma->e->start;
+	mme->aios[nr] = re;
+	mme->n_aios = nr + 1;
+	return 0;
+}
+
+void free_aios(MmEntry *mme)
+{
+	int i;
+
+	if (mme->aios) {
+		for (i = 0; i < mme->n_aios; i++)
+			xfree(mme->aios[i]);
+		xfree(mme->aios);
+	}
+}
+
+static unsigned int aio_estimate_nr_reqs(unsigned int k_max_reqs)
+{
+	/*
+	 * On ring setup the kernel does
+	 *
+	 * nr_reqs = max(nr_reqs, nr_cpus * 4)
+	 * nr_reqs *= 2
+	 * nr_reqs += 2
+	 * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
+	 * nr_reqs = (ring - sizeof(head)) / sizeof(req)
+	 *
+	 * and k_max_reqs here is the resulting value. Only the "*= 2"
+	 * and "+= 2" steps are undone below; max() and roundup() are
+	 * not, so this is an estimate of the initial nr_reqs.
+	 * NOTE(review): wraps around if k_max_reqs < 2 -- confirm unreachable.
+	 */
+
+	return (k_max_reqs - 2) / 2;
+}
+
+unsigned long aio_rings_args_size(struct vm_area_list *vmas)
+{
+	return sizeof(struct parasite_check_aios_args) +
+		vmas->nr_aios * sizeof(struct parasite_aio);
+}
+
+int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
+{
+	struct vma_area *vma;
+	struct parasite_check_aios_args *aa;
+	struct parasite_aio *pa;
+	int i;
+
+	if (!vmas->nr_aios)
+		return 0;
+
+	pr_info("Checking AIO rings\n");
+
+	/*
+	 * Go to the parasite and
+	 * a) check that no requests are currently pending
+	 * b) get the maximum number of requests the kernel handles
+	 *    (max_reqs), from which the value the user asked for
+	 *    on ring creation is estimated below.
+	 */
+
+	aa = parasite_args_s(ctl, aio_rings_args_size(vmas));
+	pa = &aa->ring[0];
+	list_for_each_entry(vma, &vmas->h, list) {
+		if (!vma_area_is(vma, VMA_AREA_AIORING))
+			continue;
+
+		pr_debug(" `- Ring #%ld @%#lx\n",
+				pa - &aa->ring[0], vma->e->start);
+		pa->ctx = vma->e->start;
+		pa->max_reqs = 0;
+		pa->vma_nr_reqs = &vma->aio_nr_req;
+		pa++;
+	}
+	aa->nr_rings = vmas->nr_aios;
+
+	if (parasite_execute_daemon(PARASITE_CMD_CHECK_AIOS, ctl))
+		return -1;
+
+	pa = &aa->ring[0];
+	for (i = 0; i < vmas->nr_aios; i++) {
+		pa = &aa->ring[i];
+		*pa->vma_nr_reqs = aio_estimate_nr_reqs(pa->max_reqs);
+		pr_debug(" `- Ring #%d has %u reqs, estimated to %u\n", i,
+				pa->max_reqs, *pa->vma_nr_reqs);
+	}
+
+	return 0;
+}
diff --git a/cr-dump.c b/cr-dump.c
index adf8eff..f7991bc 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -72,6 +72,7 @@
 #include "irmap.h"
 #include "sysfs_parse.h"
 #include "action-scripts.h"
+#include "aio.h"
 
 #include "asm/dump.h"
 
@@ -464,6 +465,12 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
 			goto err;
 
 		mme.vmas[i++] = vma;
+
+		if (vma_entry_is(vma, VMA_AREA_AIORING)) {
+			ret = dump_aio_ring(&mme, vma_area);
+			if (ret)
+				goto err;
+		}
 	}
 
 	mme.mm_start_code = stat->start_code;
@@ -496,6 +503,7 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
 
 	ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);
 	xfree(mme.mm_saved_auxv);
+	free_aios(&mme);
 err:
 	return ret;
 }
@@ -1567,6 +1575,12 @@ static int dump_one_task(struct pstree_item *item)
 		goto err_cure_imgset;
 	}
 
+	ret = parasite_check_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
+	if (ret) {
+		pr_err("Failed to check aio rings (pid: %d)\n", pid);
+		goto err_cure_imgset;
+	}
+
 	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
 	if (ret) {
 		pr_err("Can't dump misc (pid: %d)\n", pid);
diff --git a/include/aio.h b/include/aio.h
new file mode 100644
index 0000000..af7a046
--- /dev/null
+++ b/include/aio.h
@@ -0,0 +1,9 @@
+#ifndef __CR_AIO_H__
+#define __CR_AIO_H__
+#include "protobuf/mm.pb-c.h"
+int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
+void free_aios(MmEntry *mme);
+struct parasite_ctl;
+int parasite_check_aios(struct parasite_ctl *, struct vm_area_list *);
+unsigned long aio_rings_args_size(struct vm_area_list *);
+#endif /* __CR_AIO_H__ */
diff --git a/include/image.h b/include/image.h
index e02fa0e..9c711c0 100644
--- a/include/image.h
+++ b/include/image.h
@@ -54,6 +54,7 @@
 #define VMA_AREA_SYSVIPC	(1 <<  10)
 #define VMA_AREA_SOCKET		(1 <<  11)
 #define VMA_AREA_VVAR		(1 <<  12)
+#define VMA_AREA_AIORING	(1 <<  13)
 
 #define VMA_UNSUPP		(1 <<  31)	/* Unsupported VMA */
 
diff --git a/include/parasite.h b/include/parasite.h
index 774eba0..2357818 100644
--- a/include/parasite.h
+++ b/include/parasite.h
@@ -47,6 +47,7 @@ enum {
 	PARASITE_CMD_GET_PROC_FD,
 	PARASITE_CMD_DUMP_TTY,
 	PARASITE_CMD_CHECK_VDSO_MARK,
+	PARASITE_CMD_CHECK_AIOS,
 
 	PARASITE_CMD_MAX,
 };
@@ -133,6 +134,17 @@ struct parasite_dump_posix_timers_args {
 	struct posix_timer timer[0];
 };
 
+struct parasite_aio {
+	unsigned long ctx;
+	unsigned int max_reqs;
+	unsigned int *vma_nr_reqs;
+};
+
+struct parasite_check_aios_args {
+	unsigned nr_rings;
+	struct parasite_aio ring[0];
+};
+
 static inline int posix_timers_dump_size(int timer_n)
 {
 	return sizeof(int) + sizeof(struct posix_timer) * timer_n;
diff --git a/include/vma.h b/include/vma.h
index d2ce80c..878658c 100644
--- a/include/vma.h
+++ b/include/vma.h
@@ -7,6 +7,7 @@
 struct vm_area_list {
 	struct list_head	h;
 	unsigned		nr;
+	unsigned int		nr_aios;
 	unsigned long		priv_size; /* nr of pages in private VMAs */
 	unsigned long		longest; /* nr of pages in longest VMA */
 };
@@ -35,9 +36,12 @@ struct vma_area {
 				 * The file_fd is an fd for a regular file and
 				 * the socket_id is the inode number of the
 				 * mapped (PF_PACKET) socket.
+				 *
+				 * The aio_nr_req is only for aio rings.
 				 */
 				int	vm_file_fd;
 				int	vm_socket_id;
+				unsigned int aio_nr_req;
 			};
 
 			char		*aufs_rpath;	/* path from aufs root */
diff --git a/parasite-syscall.c b/parasite-syscall.c
index 9d4d5ea..b231418 100644
--- a/parasite-syscall.c
+++ b/parasite-syscall.c
@@ -28,6 +28,7 @@
 #include "mem.h"
 #include "vma.h"
 #include "proc_parse.h"
+#include "aio.h"
 
 #include <string.h>
 #include <stdlib.h>
@@ -1161,6 +1162,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
 
 	BUG_ON(item->threads[0].real != pid);
 	parasite_ensure_args_size(item, dump_pages_args_size(vma_area_list));
+	parasite_ensure_args_size(item, aio_rings_args_size(vma_area_list));
 
 	ctl = parasite_prep_ctl(pid, vma_area_list);
 	if (!ctl)
diff --git a/pie/parasite.c b/pie/parasite.c
index bab67f8..ea98eed 100644
--- a/pie/parasite.c
+++ b/pie/parasite.c
@@ -313,6 +313,73 @@ static inline int tty_ioctl(int fd, int cmd, int *arg)
 	return 0;
 }
 
+/*
+ * Stolen from kernel/fs/aio.c
+ *
+ * Is it valid to go to memory and check it? Should be,
+ * as libaio does the same.
+ */
+
+#define AIO_RING_MAGIC			0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES	1
+#define AIO_RING_INCOMPAT_FEATURES	0
+
+struct aio_ring {
+	unsigned        id;     /* kernel internal index number */
+	unsigned        nr;     /* number of io_events */
+	unsigned        head;   /* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
+	unsigned        tail;
+
+	unsigned        magic;
+	unsigned        compat_features;
+	unsigned        incompat_features;
+	unsigned        header_length;  /* size of aio_ring */
+
+
+	/* struct io_event         io_events[0]; */
+};
+
+static int sane_ring(struct aio_ring *ring)
+{
+	return ring->magic == AIO_RING_MAGIC &&
+		ring->compat_features == AIO_RING_COMPAT_FEATURES &&
+		ring->incompat_features == AIO_RING_INCOMPAT_FEATURES &&
+		ring->header_length == sizeof(struct aio_ring);
+}
+
+static int parasite_check_aios(struct parasite_check_aios_args *args)
+{
+	int i;
+
+	for (i = 0; i < args->nr_rings; i++) {
+		struct aio_ring *ring;
+
+		ring = (struct aio_ring *)args->ring[i].ctx;
+		if (!sane_ring(ring)) {
+			pr_err("Not valid ring #%d\n", i);
+			pr_info(" `- magic %x\n", ring->magic);
+			pr_info(" `- cf    %d\n", ring->compat_features);
+			pr_info(" `- if    %d\n", ring->incompat_features);
+			pr_info(" `- size  %d (%ld)\n", ring->header_length, sizeof(struct aio_ring));
+			return -1;
+		}
+
+		/*
+		 * head != tail means the ring is not empty, so refuse.
+		 * XXX what else can we do if there are requests in it?
+		 */
+		if (ring->head != ring->tail) {
+			pr_err("Pending AIO requests in ring #%d\n", i);
+			return -1;
+		}
+
+		args->ring[i].max_reqs = ring->nr;
+	}
+
+	return 0;
+}
+
 static int parasite_dump_tty(struct parasite_tty_args *args)
 {
 	int ret;
@@ -521,6 +588,9 @@ static noinline __used int noinline parasite_daemon(void *args)
 		case PARASITE_CMD_DUMP_TTY:
 			ret = parasite_dump_tty(args);
 			break;
+		case PARASITE_CMD_CHECK_AIOS:
+			ret = parasite_check_aios(args);
+			break;
 #ifdef CONFIG_VDSO
 		case PARASITE_CMD_CHECK_VDSO_MARK:
 			ret = parasite_check_vdso_mark(args);
diff --git a/proc_parse.c b/proc_parse.c
index 38f933a..3161f8e 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -305,6 +305,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, bool use_map_file
 	struct bfd f;
 
 	vma_area_list->nr = 0;
+	vma_area_list->nr_aios = 0;
 	vma_area_list->longest = 0;
 	vma_area_list->priv_size = 0;
 	INIT_LIST_HEAD(&vma_area_list->h);
@@ -491,7 +492,14 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, bool use_map_file
 				/* regular file mapping -- supported */;
 			else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO))
 				/* devzero mapping -- also makes sense */;
-			else {
+			else if ((st_buf->st_mode & S_IFMT) == 0 && !strcmp(file_path, "/[aio")) {
+				/* AIO ring, let's try */
+				close(vma_area->vm_file_fd);
+				vma_area->e->status = VMA_AREA_AIORING; /* drops REGULAR */
+				vma_area->aio_nr_req = -1;
+				vma_area_list->nr_aios++;
+				continue;
+			} else {
 				pr_err("Can't handle non-regular mapping on %d's map %#lx\n", pid, start);
 				goto err;
 			}
diff --git a/protobuf/mm.proto b/protobuf/mm.proto
index 1556b60..de2ff74 100644
--- a/protobuf/mm.proto
+++ b/protobuf/mm.proto
@@ -1,5 +1,11 @@
 import "vma.proto";
 
+message aio_ring_entry {
+	required uint64	id		= 1;
+	required uint32	nr_req		= 2;
+	required uint32	ring_len	= 3;
+}
+
 message mm_entry {
 	required uint64	mm_start_code	=  1;
 	required uint64	mm_end_code	=  2;
@@ -19,4 +25,5 @@ message mm_entry {
 	repeated vma_entry vmas		= 14;
 
 	optional int32	dumpable	= 15;
+	repeated aio_ring_entry	aios	= 16;
 }
-- 
1.8.4.2




More information about the CRIU mailing list