[CRIU] [PATCH 7/7] plugin: Add DUMP_DEVICE_LATE callback
David Francis
David.Francis at amd.com
Sat May 17 00:05:39 MSK 2025
The amdgpu plugin was counting how many files were checkpointed
to determine when it should close the device files.
The number of device files is not fixed; a process may
have multiple copies of the DRM device files open.
Instead of counting, add a new DUMP_DEVICE_LATE callback that
runs after all files are checkpointed, so plugins can clean up
their resources at an appropriate time.
Signed-off-by: David Francis <David.Francis at amd.com>
---
criu/cr-dump.c | 3 ++
criu/include/criu-plugin.h | 4 ++-
criu/plugin.c | 1 +
plugins/amdgpu/amdgpu_plugin.c | 55 ++++++++++++-----------------
plugins/amdgpu/amdgpu_plugin_util.c | 41 ++++++++++++---------
plugins/amdgpu/amdgpu_plugin_util.h | 6 ++--
6 files changed, 57 insertions(+), 53 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 1bc5d934f..34e756c7f 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -2225,6 +2225,9 @@ int cr_dump_tasks(pid_t pid)
goto err;
}
+ if(run_plugins(DUMP_DEVICE_LATE, pid))
+ goto err;
+
if (parent_ie) {
inventory_entry__free_unpacked(parent_ie, NULL);
parent_ie = NULL;
diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h
index aaf4b0b94..b2a3ffce9 100644
--- a/criu/include/criu-plugin.h
+++ b/criu/include/criu-plugin.h
@@ -64,6 +64,8 @@ enum {
CR_PLUGIN_HOOK__COLLECT_FILE = 13,
+ CR_PLUGIN_HOOK__DUMP_DEVICE_LATE = 14,
+
CR_PLUGIN_HOOK__MAX
};
@@ -84,7 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_EARLY, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__COLLECT_FILE, int pid, int fd);
-
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICE_LATE, int id);
enum {
CR_PLUGIN_STAGE__DUMP,
diff --git a/criu/plugin.c b/criu/plugin.c
index cfb19e9f0..a0f27616c 100644
--- a/criu/plugin.c
+++ b/criu/plugin.c
@@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
__assign_hook(RESUME_DEVICES_EARLY, "cr_plugin_resume_devices_early");
__assign_hook(COLLECT_FILE, "cr_plugin_collect_file");
+ __assign_hook(DUMP_DEVICE_LATE, "cr_plugin_dump_device_late");
#undef __assign_hook
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index ad66e4659..b39c78175 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -54,13 +54,6 @@ struct vma_metadata {
/************************************ Global Variables ********************************************/
-/**
- * FD of KFD device used to checkpoint. On a multi-process
- * tree the order of checkpointing goes from parent to child
- * and so on - so saving the FD will not be overwritten
- */
-static int kfd_checkpoint_fd;
-
static LIST_HEAD(update_vma_info_list);
static LIST_HEAD(amdgpu_processes);
@@ -1018,28 +1011,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
return 0;
}
-static int unpause_process(int fd)
+int amdgpu_unpause_processes(int pid)
{
int ret = 0;
struct kfd_ioctl_criu_args args = { 0 };
+ struct list_head *l = get_dumped_fds();
+ struct dumped_fd *st;
- args.op = KFD_CRIU_OP_UNPAUSE;
+ list_for_each_entry(st, l, l) {
+ if (st->is_drm) {
+ close(st->fd);
+ } else {
+ args.op = KFD_CRIU_OP_UNPAUSE;
- ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
- if (ret) {
- pr_perror("Failed to unpause process");
- goto exit;
+ ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
+ if (ret) {
+ pr_perror("Failed to unpause process");
+ goto exit;
+ }
+ }
}
- // Reset the KFD FD
- kfd_checkpoint_fd = -1;
- sys_close_drm_render_devices(&src_topology);
-
exit:
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
+ clear_dumped_fds();
return ret;
}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICE_LATE, amdgpu_unpause_processes)
static void dmabuf_socket_name_gen(struct sockaddr_un *addr, int *len, int pid)
{
@@ -1359,9 +1358,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
return -1;
}
- /* Initialize number of device files that will be checkpointed */
- init_gpu_count(&src_topology);
-
/* Check whether this plugin was called for kfd or render nodes */
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
@@ -1373,11 +1369,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
if (ret)
return ret;
- /* Invoke unpause process if needed */
- decrement_checkpoint_count();
- if (checkpoint_is_complete()) {
- ret = unpause_process(kfd_checkpoint_fd);
- }
+ ret = record_dumped_fd(fd, true);
+ if (ret)
+ return ret;
/* Need to return success here so that criu can call plugins for renderD nodes */
return ret;
@@ -1475,14 +1469,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
xfree(buf);
-exit:
- /* Restore all queues if conditions permit */
- kfd_checkpoint_fd = fd;
- decrement_checkpoint_count();
- if (checkpoint_is_complete()) {
- ret = unpause_process(fd);
- }
+ ret = record_dumped_fd(fd, false);
+ if (ret)
+ goto exit;
+exit:
xfree((void *)args.devices);
xfree((void *)args.bos);
xfree((void *)args.priv_data);
diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c
index 4b3ae0cdd..b7d6fe2f3 100644
--- a/plugins/amdgpu/amdgpu_plugin_util.c
+++ b/plugins/amdgpu/amdgpu_plugin_util.c
@@ -38,9 +38,7 @@
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
-/* Tracks number of device files that need to be checkpointed */
-static int dev_file_cnt = 0;
-
+static LIST_HEAD(dumped_fds);
static LIST_HEAD(shared_bos);
static LIST_HEAD(shared_dmabuf_fds);
static LIST_HEAD(completed_work);
@@ -53,23 +51,23 @@ struct tp_system dest_topology;
struct device_maps checkpoint_maps;
struct device_maps restore_maps;
-bool checkpoint_is_complete()
-{
- return (dev_file_cnt == 0);
-}
+int record_dumped_fd(int fd, bool is_drm) {
+ int newfd = dup(fd);
-void decrement_checkpoint_count()
-{
- dev_file_cnt--;
-}
+ if (newfd < 0)
+ return newfd;
+ struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
+ if (!st)
+ return -1;
+ st->fd = newfd;
+ st->is_drm = is_drm;
+ list_add(&st->l, &dumped_fds);
-void init_gpu_count(struct tp_system *topo)
-{
- if (dev_file_cnt != 0)
- return;
+ return 0;
+}
- /* We add ONE to include checkpointing of KFD device */
- dev_file_cnt = 1 + topology_gpu_count(topo);
+struct list_head *get_dumped_fds() {
+ return &dumped_fds;
}
bool shared_bo_has_exporter(int handle) {
@@ -174,6 +172,15 @@ void clear_restore_state() {
}
}
+void clear_dumped_fds() {
+ while (!list_empty(&dumped_fds)) {
+ struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
+ list_del(&st->l);
+ close(st->fd);
+ free(st);
+ }
+}
+
int read_fp(FILE *fp, void *buf, const size_t buf_len)
{
size_t len_read;
diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h
index bd23fc6d4..edf0d05f4 100644
--- a/plugins/amdgpu/amdgpu_plugin_util.h
+++ b/plugins/amdgpu/amdgpu_plugin_util.h
@@ -128,9 +128,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
int write_img_file(char *path, const void *buf, const size_t buf_len);
FILE *open_img_file(char *path, bool write, size_t *size);
-bool checkpoint_is_complete();
-void decrement_checkpoint_count();
-void init_gpu_count(struct tp_system *topology);
+int record_dumped_fd(int fd, bool is_drm);
+struct list_head *get_dumped_fds();
+void clear_dumped_fds();
bool shared_bo_has_exporter(int handle);
int record_shared_bo(int handle, bool is_imported);
--
2.34.1
More information about the CRIU
mailing list