[CRIU] [RFC PATCH 12/20] criu/plugin: Dump and restore events
Felix Kuehling
Felix.Kuehling at amd.com
Sat May 1 04:58:37 MSK 2021
From: David Yat Sin <david.yatsin at amd.com>
Add support for dumping and restoring events during CRIU checkpoint and
restore. Events are used to send notifications to applications when
certain tasks are completed or exceptions have occurred.
Signed-off-by: David Yat Sin <david.yatsin at amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
---
test/others/ext-kfd/criu-kfd.proto | 20 +++++
test/others/ext-kfd/kfd_plugin.c | 125 ++++++++++++++++++++++++++++-
2 files changed, 144 insertions(+), 1 deletion(-)
diff --git a/test/others/ext-kfd/criu-kfd.proto b/test/others/ext-kfd/criu-kfd.proto
index e1a93f503..85b676846 100644
--- a/test/others/ext-kfd/criu-kfd.proto
+++ b/test/others/ext-kfd/criu-kfd.proto
@@ -42,6 +42,23 @@ message q_entry {
required bytes ctl_stack = 23;
}
+message ev_entry {
+ required uint32 event_id = 1;
+ required uint32 auto_reset = 2;
+ required uint32 type = 3;
+ required uint32 signaled = 4;
+ required uint64 user_signal_address = 5;
+ required uint32 mem_exc_fail_not_present = 6;
+ required uint32 mem_exc_fail_read_only = 7;
+ required uint32 mem_exc_fail_no_execute = 8;
+ required uint64 mem_exc_va = 9;
+ required uint32 mem_exc_gpu_id = 10;
+ required uint32 hw_exc_reset_type = 11;
+ required uint32 hw_exc_reset_cause = 12;
+ required uint32 hw_exc_memory_lost = 13;
+ required uint32 hw_exc_gpu_id = 14;
+}
+
message criu_kfd {
required uint32 pid = 1;
required uint32 num_of_devices = 2;
@@ -50,6 +67,9 @@ message criu_kfd {
repeated bo_entries_test bo_info_test = 5;
required uint32 num_of_queues = 6;
repeated q_entry q_entries = 7;
+ required uint64 event_page_offset = 8;
+ required uint32 num_of_events = 9;
+ repeated ev_entry ev_entries = 10;
}
message criu_render_node {
diff --git a/test/others/ext-kfd/kfd_plugin.c b/test/others/ext-kfd/kfd_plugin.c
index 5c4809649..78c8c4f3b 100644
--- a/test/others/ext-kfd/kfd_plugin.c
+++ b/test/others/ext-kfd/kfd_plugin.c
@@ -173,6 +173,10 @@ static void free_e(CriuKfd *e)
if (e->q_entries[i])
xfree(e->q_entries[i]);
}
+ for (int i = 0; i < e->n_ev_entries; i++) {
+ if (e->ev_entries[i])
+ xfree(e->ev_entries[i]);
+ }
xfree(e);
}
@@ -260,6 +264,28 @@ static int allocate_q_entries(CriuKfd *e, int num_queues)
return 0;
}
+static int allocate_ev_entries(CriuKfd *e, int num_events)
+{
+ e->ev_entries = xmalloc(sizeof(EvEntry*) * num_events);
+ if (!e->ev_entries) {
+ pr_err("Failed to allocate ev_entries\n");
+ return -1;
+ }
+
+ for (int i = 0; i < num_events; i++) {
+ EvEntry *ev_entry = xmalloc(sizeof(EvEntry));
+ if (!ev_entry) {
+ pr_err("Failed to allocate ev_entry\n");
+ return -ENOMEM;
+ }
+ ev_entry__init(ev_entry);
+ e->ev_entries[i] = ev_entry;
+ e->n_ev_entries++;
+
+ }
+ e->num_of_events = num_events;
+ return 0;
+}
int kfd_plugin_init(int stage)
{
pr_info("kfd_plugin: initialized: %s (AMDGPU/KFD)\n",
@@ -281,6 +307,7 @@ int kfd_plugin_dump_file(int fd, int id)
struct kfd_ioctl_criu_dumper_args args = {0};
struct kfd_criu_bo_buckets *bo_bucket_ptr;
struct kfd_criu_q_bucket *q_bucket_ptr;
+ struct kfd_criu_ev_bucket *ev_buckets_ptr = NULL;
int ret, drm_fd;
char img_path[PATH_MAX];
struct stat st, st_kfd;
@@ -389,6 +416,14 @@ int kfd_plugin_dump_file(int fd, int id)
pr_info("kfd_plugin: queues data size:%llu\n", args.queues_data_size);
}
+ if (helper_args.num_of_events) {
+ ev_buckets_ptr = xmalloc(helper_args.num_of_events *
+ sizeof(struct kfd_criu_ev_bucket));
+ args.num_of_events = helper_args.num_of_events;
+ }
+
+ args.kfd_criu_ev_buckets_ptr = (uintptr_t)ev_buckets_ptr;
+
/* call dumper ioctl, pass num of BOs to dump */
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_DUMPER, &args) == -1) {
pr_perror("kfd_plugin: failed to call kfd ioctl from plugin dumper for fd = %d\n", major(st.st_rdev));
@@ -602,6 +637,44 @@ int kfd_plugin_dump_file(int fd, int id)
e->q_entries[i]->ctl_stack.data = queue_data_ptr + q_bucket_ptr[i].cu_mask_size + q_bucket_ptr[i].mqd_size;
}
+ e->event_page_offset = args.event_page_offset;
+ pr_info("kfd_plugin: number of events:%d\n", args.num_of_events);
+
+ if (args.num_of_events) {
+ ret = allocate_ev_entries(e, args.num_of_events);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < args.num_of_events; i++) {
+ e->ev_entries[i]->event_id = ev_buckets_ptr[i].event_id;
+ e->ev_entries[i]->auto_reset = ev_buckets_ptr[i].auto_reset;
+ e->ev_entries[i]->type = ev_buckets_ptr[i].type;
+ e->ev_entries[i]->signaled = ev_buckets_ptr[i].signaled;
+
+ if (e->ev_entries[i]->type == KFD_IOC_EVENT_MEMORY) {
+ e->ev_entries[i]->mem_exc_fail_not_present =
+ ev_buckets_ptr[i].memory_exception_data.failure.NotPresent;
+ e->ev_entries[i]->mem_exc_fail_read_only =
+ ev_buckets_ptr[i].memory_exception_data.failure.ReadOnly;
+ e->ev_entries[i]->mem_exc_fail_no_execute =
+ ev_buckets_ptr[i].memory_exception_data.failure.NoExecute;
+ e->ev_entries[i]->mem_exc_va =
+ ev_buckets_ptr[i].memory_exception_data.va;
+ e->ev_entries[i]->mem_exc_gpu_id =
+ ev_buckets_ptr[i].memory_exception_data.gpu_id;
+ } else if (e->ev_entries[i]->type == KFD_IOC_EVENT_HW_EXCEPTION) {
+ e->ev_entries[i]->hw_exc_reset_type =
+ ev_buckets_ptr[i].hw_exception_data.reset_type;
+ e->ev_entries[i]->hw_exc_reset_cause =
+ ev_buckets_ptr[i].hw_exception_data.reset_cause;
+ e->ev_entries[i]->hw_exc_memory_lost =
+ ev_buckets_ptr[i].hw_exception_data.memory_lost;
+ e->ev_entries[i]->hw_exc_gpu_id =
+ ev_buckets_ptr[i].hw_exception_data.gpu_id;
+ }
+ }
+ }
+
snprintf(img_path, sizeof(img_path), "kfd.%d.img", id);
pr_info("kfd_plugin: img_path = %s", img_path);
@@ -619,7 +692,7 @@ int kfd_plugin_dump_file(int fd, int id)
criu_kfd__pack(e, buf);
ret = write_file(img_path, buf, len);
- if (ret != len)
+ if (ret)
ret = -1;
xfree(buf);
@@ -627,6 +700,8 @@ failed:
xfree(devinfo_bucket_ptr);
xfree(bo_bucket_ptr);
xfree(q_bucket_ptr);
+ if (ev_buckets_ptr)
+ xfree(ev_buckets_ptr);
free_e(e);
pr_info("kfd_plugin: Exiting from dumper for fd = %d\n", major(st.st_rdev));
return ret;
@@ -641,6 +716,7 @@ int kfd_plugin_restore_file(int id)
struct kfd_ioctl_criu_restorer_args args = {0};
struct kfd_criu_bo_buckets *bo_bucket_ptr;
struct kfd_criu_q_bucket *q_bucket_ptr;
+ struct kfd_criu_ev_bucket *ev_bucket_ptr = NULL;
__u64 *restored_bo_offsets_array;
char img_path[PATH_MAX];
struct stat filestat;
@@ -881,6 +957,51 @@ int kfd_plugin_restore_file(int id)
args.num_of_queues = e->num_of_queues;
args.kfd_criu_q_buckets_ptr = (uintptr_t)q_bucket_ptr;
+ args.event_page_offset = e->event_page_offset;
+
+ pr_info("Number of events:%u\n", e->num_of_events);
+ if (e->num_of_events) {
+ ev_bucket_ptr = xmalloc(e->num_of_events * sizeof(struct kfd_criu_ev_bucket));
+ if (!ev_bucket_ptr) {
+ pr_perror("kfd_plugin: failed to allocate events for restore ioctl\n");
+ return -1;
+ }
+
+ for (int i = 0; i < e->num_of_events; i++ )
+ {
+ ev_bucket_ptr[i].event_id = e->ev_entries[i]->event_id;
+ ev_bucket_ptr[i].auto_reset = e->ev_entries[i]->auto_reset;
+ ev_bucket_ptr[i].type = e->ev_entries[i]->type;
+ ev_bucket_ptr[i].signaled = e->ev_entries[i]->signaled;
+
+ if (e->ev_entries[i]->type == KFD_IOC_EVENT_MEMORY) {
+ ev_bucket_ptr[i].memory_exception_data.failure.NotPresent =
+ e->ev_entries[i]->mem_exc_fail_not_present;
+ ev_bucket_ptr[i].memory_exception_data.failure.ReadOnly =
+ e->ev_entries[i]->mem_exc_fail_read_only;
+ ev_bucket_ptr[i].memory_exception_data.failure.NoExecute =
+ e->ev_entries[i]->mem_exc_fail_no_execute;
+ ev_bucket_ptr[i].memory_exception_data.va =
+ e->ev_entries[i]->mem_exc_va;
+ ev_bucket_ptr[i].memory_exception_data.gpu_id =
+ e->ev_entries[i]->mem_exc_gpu_id;
+
+ } else if (e->ev_entries[i]->type == KFD_IOC_EVENT_HW_EXCEPTION) {
+ ev_bucket_ptr[i].hw_exception_data.reset_type =
+ e->ev_entries[i]->hw_exc_reset_type;
+ ev_bucket_ptr[i].hw_exception_data.reset_cause =
+ e->ev_entries[i]->hw_exc_reset_cause;
+ ev_bucket_ptr[i].hw_exception_data.memory_lost =
+ e->ev_entries[i]->hw_exc_memory_lost;
+ ev_bucket_ptr[i].hw_exception_data.gpu_id =
+ e->ev_entries[i]->hw_exc_gpu_id;
+ }
+ }
+
+ args.num_of_events = e->num_of_events;
+ args.kfd_criu_ev_buckets_ptr = (uintptr_t)ev_bucket_ptr;
+ }
+
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_RESTORER, &args) == -1) {
pr_perror("kfd_plugin: failed to call kfd ioctl from plugin restorer for id = %d\n", id);
fd = -EBADFD;
@@ -999,6 +1120,8 @@ int kfd_plugin_restore_file(int id)
}
clean:
xfree(devinfo_bucket_ptr);
+ if (ev_bucket_ptr)
+ xfree(ev_bucket_ptr);
if (q_bucket_ptr)
xfree(q_bucket_ptr);
xfree(restored_bo_offsets_array);
--
2.17.1
More information about the CRIU
mailing list