[CRIU] [RFC PATCH 12/20] criu/plugin: Dump and restore events

Felix Kuehling Felix.Kuehling at amd.com
Sat May 1 04:58:37 MSK 2021


From: David Yat Sin <david.yatsin at amd.com>

Add support for dumping and restoring events during CRIU checkpoint and
restore. Events are used to send notifications to applications when
certain tasks are completed or exceptions have occurred.

Signed-off-by: David Yat Sin <david.yatsin at amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
---
 test/others/ext-kfd/criu-kfd.proto |  20 +++++
 test/others/ext-kfd/kfd_plugin.c   | 125 ++++++++++++++++++++++++++++-
 2 files changed, 144 insertions(+), 1 deletion(-)

diff --git a/test/others/ext-kfd/criu-kfd.proto b/test/others/ext-kfd/criu-kfd.proto
index e1a93f503..85b676846 100644
--- a/test/others/ext-kfd/criu-kfd.proto
+++ b/test/others/ext-kfd/criu-kfd.proto
@@ -42,6 +42,23 @@ message q_entry {
 	required bytes ctl_stack = 23;
 }
 
+message ev_entry {
+	required uint32 event_id = 1;
+	required uint32 auto_reset = 2;
+	required uint32 type = 3;
+	required uint32 signaled = 4;
+	required uint64 user_signal_address = 5;
+	required uint32 mem_exc_fail_not_present = 6;
+	required uint32 mem_exc_fail_read_only = 7;
+	required uint32 mem_exc_fail_no_execute = 8;
+	required uint64 mem_exc_va = 9;
+	required uint32 mem_exc_gpu_id = 10;
+	required uint32 hw_exc_reset_type = 11;
+	required uint32 hw_exc_reset_cause = 12;
+	required uint32 hw_exc_memory_lost = 13;
+	required uint32 hw_exc_gpu_id = 14;
+}
+
 message criu_kfd {
 	required uint32 pid = 1;
 	required uint32 num_of_devices = 2;
@@ -50,6 +67,9 @@ message criu_kfd {
 	repeated bo_entries_test bo_info_test = 5;
 	required uint32	num_of_queues = 6;
 	repeated q_entry q_entries = 7;
+	required uint64 event_page_offset = 8;
+	required uint32 num_of_events = 9;
+	repeated ev_entry ev_entries = 10;
 }
 
 message criu_render_node {
diff --git a/test/others/ext-kfd/kfd_plugin.c b/test/others/ext-kfd/kfd_plugin.c
index 5c4809649..78c8c4f3b 100644
--- a/test/others/ext-kfd/kfd_plugin.c
+++ b/test/others/ext-kfd/kfd_plugin.c
@@ -173,6 +173,10 @@ static void free_e(CriuKfd *e)
 		if (e->q_entries[i])
 			xfree(e->q_entries[i]);
 	}
+	for (int i = 0; i < e->n_ev_entries; i++) {
+		if (e->ev_entries[i])
+			xfree(e->ev_entries[i]);
+	}
 	xfree(e);
 }
 
@@ -260,6 +264,28 @@ static int allocate_q_entries(CriuKfd *e, int num_queues)
 	return 0;
 }
 
+static int allocate_ev_entries(CriuKfd *e, int num_events)
+{
+	e->ev_entries = xmalloc(sizeof(EvEntry*) * num_events);
+	if (!e->ev_entries) {
+		pr_err("Failed to allocate ev_entries\n");
+		return -1;
+	}
+
+	for (int i = 0; i < num_events; i++) {
+		EvEntry *ev_entry = xmalloc(sizeof(EvEntry));
+		if (!ev_entry) {
+			pr_err("Failed to allocate ev_entry\n");
+			return -1;
+		}
+		ev_entry__init(ev_entry);
+		e->ev_entries[i] = ev_entry;
+		e->n_ev_entries++;
+	}
+	e->num_of_events = num_events;
+	return 0;
+}
+
 int kfd_plugin_init(int stage)
 {
 	pr_info("kfd_plugin: initialized:  %s (AMDGPU/KFD)\n",
@@ -281,6 +307,7 @@ int kfd_plugin_dump_file(int fd, int id)
 	struct kfd_ioctl_criu_dumper_args args = {0};
 	struct kfd_criu_bo_buckets *bo_bucket_ptr;
 	struct kfd_criu_q_bucket *q_bucket_ptr;
+	struct kfd_criu_ev_bucket *ev_buckets_ptr = NULL;
 	int ret, drm_fd;
 	char img_path[PATH_MAX];
 	struct stat st, st_kfd;
@@ -389,6 +416,19 @@ int kfd_plugin_dump_file(int fd, int id)
 		pr_info("kfd_plugin: queues data size:%llu\n", args.queues_data_size);
 	}
 
+	if (helper_args.num_of_events) {
+		ev_buckets_ptr = xmalloc(helper_args.num_of_events *
+					sizeof(struct kfd_criu_ev_bucket));
+		if (!ev_buckets_ptr) {
+			pr_err("kfd_plugin: failed to allocate events for dump ioctl\n");
+			ret = -1;
+			goto failed;
+		}
+		args.num_of_events = helper_args.num_of_events;
+	}
+
+	args.kfd_criu_ev_buckets_ptr = (uintptr_t)ev_buckets_ptr;
+
 	/* call dumper ioctl, pass num of BOs to dump */
         if (kmtIoctl(fd, AMDKFD_IOC_CRIU_DUMPER, &args) == -1) {
 		pr_perror("kfd_plugin: failed to call kfd ioctl from plugin dumper for fd = %d\n", major(st.st_rdev));
@@ -602,6 +637,44 @@ int kfd_plugin_dump_file(int fd, int id)
 		e->q_entries[i]->ctl_stack.data = queue_data_ptr + q_bucket_ptr[i].cu_mask_size + q_bucket_ptr[i].mqd_size;
 	}
 
+	e->event_page_offset = args.event_page_offset;
+	pr_info("kfd_plugin: number of events:%d\n", args.num_of_events);
+
+	if (args.num_of_events) {
+		ret = allocate_ev_entries(e, args.num_of_events);
+		if (ret)
+			goto failed;
+
+		for (int i = 0; i < args.num_of_events; i++) {
+			e->ev_entries[i]->event_id = ev_buckets_ptr[i].event_id;
+			e->ev_entries[i]->auto_reset = ev_buckets_ptr[i].auto_reset;
+			e->ev_entries[i]->type = ev_buckets_ptr[i].type;
+			e->ev_entries[i]->signaled = ev_buckets_ptr[i].signaled;
+
+			if (e->ev_entries[i]->type == KFD_IOC_EVENT_MEMORY) {
+				e->ev_entries[i]->mem_exc_fail_not_present =
+					ev_buckets_ptr[i].memory_exception_data.failure.NotPresent;
+				e->ev_entries[i]->mem_exc_fail_read_only =
+					ev_buckets_ptr[i].memory_exception_data.failure.ReadOnly;
+				e->ev_entries[i]->mem_exc_fail_no_execute =
+					ev_buckets_ptr[i].memory_exception_data.failure.NoExecute;
+				e->ev_entries[i]->mem_exc_va =
+					ev_buckets_ptr[i].memory_exception_data.va;
+				e->ev_entries[i]->mem_exc_gpu_id =
+					ev_buckets_ptr[i].memory_exception_data.gpu_id;
+			} else if (e->ev_entries[i]->type == KFD_IOC_EVENT_HW_EXCEPTION) {
+				e->ev_entries[i]->hw_exc_reset_type =
+					ev_buckets_ptr[i].hw_exception_data.reset_type;
+				e->ev_entries[i]->hw_exc_reset_cause =
+					ev_buckets_ptr[i].hw_exception_data.reset_cause;
+				e->ev_entries[i]->hw_exc_memory_lost =
+					ev_buckets_ptr[i].hw_exception_data.memory_lost;
+				e->ev_entries[i]->hw_exc_gpu_id =
+					ev_buckets_ptr[i].hw_exception_data.gpu_id;
+			}
+		}
+	}
+
 	snprintf(img_path, sizeof(img_path), "kfd.%d.img", id);
 	pr_info("kfd_plugin: img_path = %s", img_path);
 
@@ -619,7 +692,7 @@ int kfd_plugin_dump_file(int fd, int id)
 	criu_kfd__pack(e, buf);
 
 	ret = write_file(img_path,  buf, len);
-	if (ret != len)
+	if (ret)
 		ret = -1;
 
 	xfree(buf);
@@ -627,6 +700,8 @@ failed:
 	xfree(devinfo_bucket_ptr);
 	xfree(bo_bucket_ptr);
 	xfree(q_bucket_ptr);
+	if (ev_buckets_ptr)
+		xfree(ev_buckets_ptr);
 	free_e(e);
 	pr_info("kfd_plugin: Exiting from dumper for fd = %d\n", major(st.st_rdev));
         return ret;
@@ -641,6 +716,7 @@ int kfd_plugin_restore_file(int id)
 	struct kfd_ioctl_criu_restorer_args args = {0};
 	struct kfd_criu_bo_buckets *bo_bucket_ptr;
 	struct kfd_criu_q_bucket *q_bucket_ptr;
+	struct kfd_criu_ev_bucket *ev_bucket_ptr = NULL;
 	__u64 *restored_bo_offsets_array;
 	char img_path[PATH_MAX];
 	struct stat filestat;
@@ -881,6 +957,51 @@ int kfd_plugin_restore_file(int id)
 	args.num_of_queues = e->num_of_queues;
 	args.kfd_criu_q_buckets_ptr = (uintptr_t)q_bucket_ptr;
 
+	args.event_page_offset = e->event_page_offset;
+
+	pr_info("Number of events:%u\n", e->num_of_events);
+	if (e->num_of_events) {
+		ev_bucket_ptr = xmalloc(e->num_of_events * sizeof(struct kfd_criu_ev_bucket));
+		if (!ev_bucket_ptr) {
+			pr_perror("kfd_plugin: failed to allocate events for restore ioctl\n");
+			return -1;
+		}
+
+		for (int i = 0; i < e->num_of_events; i++ )
+		{
+			ev_bucket_ptr[i].event_id = e->ev_entries[i]->event_id;
+			ev_bucket_ptr[i].auto_reset = e->ev_entries[i]->auto_reset;
+			ev_bucket_ptr[i].type = e->ev_entries[i]->type;
+			ev_bucket_ptr[i].signaled = e->ev_entries[i]->signaled;
+
+			if (e->ev_entries[i]->type == KFD_IOC_EVENT_MEMORY) {
+				ev_bucket_ptr[i].memory_exception_data.failure.NotPresent =
+						e->ev_entries[i]->mem_exc_fail_not_present;
+				ev_bucket_ptr[i].memory_exception_data.failure.ReadOnly =
+						e->ev_entries[i]->mem_exc_fail_read_only;
+				ev_bucket_ptr[i].memory_exception_data.failure.NoExecute =
+						e->ev_entries[i]->mem_exc_fail_no_execute;
+				ev_bucket_ptr[i].memory_exception_data.va =
+						e->ev_entries[i]->mem_exc_va;
+				ev_bucket_ptr[i].memory_exception_data.gpu_id =
+						e->ev_entries[i]->mem_exc_gpu_id;
+
+			} else if (e->ev_entries[i]->type == KFD_IOC_EVENT_HW_EXCEPTION) {
+				ev_bucket_ptr[i].hw_exception_data.reset_type =
+					e->ev_entries[i]->hw_exc_reset_type;
+				ev_bucket_ptr[i].hw_exception_data.reset_cause =
+					e->ev_entries[i]->hw_exc_reset_cause;
+				ev_bucket_ptr[i].hw_exception_data.memory_lost =
+					e->ev_entries[i]->hw_exc_memory_lost;
+				ev_bucket_ptr[i].hw_exception_data.gpu_id =
+					e->ev_entries[i]->hw_exc_gpu_id;
+			}
+		}
+
+		args.num_of_events = e->num_of_events;
+		args.kfd_criu_ev_buckets_ptr = (uintptr_t)ev_bucket_ptr;
+	}
+
 	if (kmtIoctl(fd, AMDKFD_IOC_CRIU_RESTORER, &args) == -1) {
 		pr_perror("kfd_plugin: failed to call kfd ioctl from plugin restorer for id = %d\n", id);
 		fd = -EBADFD;
@@ -999,6 +1120,8 @@ int kfd_plugin_restore_file(int id)
 	}
 clean:
 	xfree(devinfo_bucket_ptr);
+	if (ev_bucket_ptr)
+		xfree(ev_bucket_ptr);
 	if (q_bucket_ptr)
 		xfree(q_bucket_ptr);
 	xfree(restored_bo_offsets_array);
-- 
2.17.1



More information about the CRIU mailing list