[CRIU] [PATCH 6/7] plugin/amdgpu: Add handling for amdgpu drm ioctl
David Francis
David.Francis at amd.com
Sat May 17 00:05:38 MSK 2025
Buffer objects held by the amdgpu drm driver are checkpointed with
the new DRM_IOCTL_AMDGPU_CRIU_OP ioctl. The handling for this
ioctl lives in amdgpu_plugin_drm.c.
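For reference, the checkpoint side first queries the process and
then dumps it, roughly as follows (condensed from
amdgpu_plugin_drm_dump_file() below; error handling trimmed):

    struct drm_amdgpu_criu_args args = {0};

    /* Ask the driver how many BOs and VM mappings the process has. */
    args.op = AMDGPU_CRIU_OP_PROCESS_INFO;
    if (drmIoctl(fd, DRM_IOCTL_AMDGPU_CRIU_OP, &args) == -1)
        return -1;

    /* Allocate the buckets, then collect them in one shot. */
    args.bos = (uintptr_t)xzalloc(args.num_bos * sizeof(struct drm_amdgpu_criu_bo_bucket));
    args.vms = (uintptr_t)xzalloc(args.num_vms * sizeof(struct drm_amdgpu_criu_vm_bucket));
    args.op = AMDGPU_CRIU_OP_CHECKPOINT;
    if (drmIoctl(fd, DRM_IOCTL_AMDGPU_CRIU_OP, &args))
        return -1;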
Handling of imported buffer objects may require dmabuf fds to be
transferred between processes. These transfers occur over UNIX
sockets created by the amdgpu plugin. There are two new plugin
callbacks: COLLECT_FILE, to identify the processes that have amdgpu
files and therefore need a socket, and RESUME_DEVICES_EARLY, to
create the sockets before any files are restored.
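A plugin opts into the new hooks with the usual registration macro;
the signatures match the declarations added to criu-plugin.h (sketch
only, bodies elided):

    /* Restore: called per process, before any files are restored. */
    int amdgpu_make_socket(int pid);
    CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_EARLY, amdgpu_make_socket)

    /* Restore: called for each external file collected for a pid. */
    int amdgpu_collect_file(int pid, int fd);
    CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__COLLECT_FILE, amdgpu_collect_file)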
Before each amdgpu file restore, check the socket and record any
received dmabuf fds.
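The check is a non-blocking drain of the datagram socket; each
message carries one fd plus the GEM handle it belongs to (condensed
from recv_dmabuf_fds() below):

    int fd, handle, ret;

    while (true) {
        ret = __recv_fds(dmabuf_socket_fd, &fd, 1, (void *)&handle,
                         sizeof(handle), MSG_DONTWAIT);
        if (ret == -EAGAIN || ret == -EWOULDBLOCK)
            break;          /* socket drained, go ahead with the restore */
        if (ret)
            return -1;      /* real receive error */
        record_shared_dmabuf_fd(handle, fd);
    }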
During checkpoint, track shared buffer objects so that buffer
objects shared across processes can be identified.
During restore, track which buffer objects have been restored.
Retry the restore of a drm file if it imports a buffer object whose
original has not been exported yet. Skip buffer objects that have
already been restored or that cannot be restored in the current
pass.
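The per-BO decision on restore is roughly (simplified from
amdgpu_plugin_drm_restore_file() below):

    if (work_already_completed(handle, drm_render_minor)) {
        /* restored on an earlier pass of this file: skip it */
    } else if (is_import && dmabuf_fd_for_handle(handle) == -1) {
        /* exporter has not sent the fd yet: skip, retry the file later */
    } else {
        /* create or import the BO, serve out its dmabuf fd if it is
         * an exporter, then record_completed_work(handle, minor) */
    }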
So that drm code can reuse sdma_copy_bo(), that function no longer
takes kfd bo bucket structs; it receives the dmabuf fd and BO size
directly.
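The resulting prototype takes the dmabuf fd and size directly (as
declared in amdgpu_plugin_util.h below):

    int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
                     void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
                     uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free);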
Update the protobuf messages with new amdgpu drm information.
Signed-off-by: David Francis <David.Francis at amd.com>
---
criu/cr-restore.c | 3 +
criu/files.c | 3 +
criu/include/criu-plugin.h | 7 +
criu/plugin.c | 2 +
plugins/amdgpu/amdgpu_plugin.c | 259 ++++++++++++++--
plugins/amdgpu/amdgpu_plugin_drm.c | 452 +++++++++++++++++++++++++++-
plugins/amdgpu/amdgpu_plugin_drm.h | 8 +
plugins/amdgpu/amdgpu_plugin_util.c | 121 +++++++-
plugins/amdgpu/amdgpu_plugin_util.h | 50 ++-
plugins/amdgpu/criu-amdgpu.proto | 25 ++
10 files changed, 890 insertions(+), 40 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index ddca6b8ec..0b4acb99b 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1651,6 +1651,9 @@ static int __restore_task_with_children(void *_arg)
if (open_transport_socket())
goto err;
+ if (run_plugins(RESUME_DEVICES_EARLY, current->pid->real))
+ goto err;
+
timing_start(TIME_FORK);
if (create_children_and_session())
diff --git a/criu/files.c b/criu/files.c
index 31e705bcc..326f23cf7 100644
--- a/criu/files.c
+++ b/criu/files.c
@@ -836,6 +836,9 @@ struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info
{
struct fdinfo_list_entry *new_le;
+ if (fdesc->ops->type == FD_TYPES__EXT)
+ run_plugins(COLLECT_FILE, pid, fdesc->id);
+
new_le = alloc_fle(pid, e);
if (new_le) {
new_le->fake = (!!fake);
diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h
index b844dca4d..aaf4b0b94 100644
--- a/criu/include/criu-plugin.h
+++ b/criu/include/criu-plugin.h
@@ -60,6 +60,10 @@ enum {
CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,
+ CR_PLUGIN_HOOK__RESUME_DEVICES_EARLY = 12,
+
+ CR_PLUGIN_HOOK__COLLECT_FILE = 13,
+
CR_PLUGIN_HOOK__MAX
};
@@ -78,6 +82,9 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_EARLY, int pid);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__COLLECT_FILE, int pid, int fd);
+
enum {
CR_PLUGIN_STAGE__DUMP,
diff --git a/criu/plugin.c b/criu/plugin.c
index 65e79a069..cfb19e9f0 100644
--- a/criu/plugin.c
+++ b/criu/plugin.c
@@ -59,6 +59,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
+ __assign_hook(RESUME_DEVICES_EARLY, "cr_plugin_resume_devices_early");
+ __assign_hook(COLLECT_FILE, "cr_plugin_collect_file");
#undef __assign_hook
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index 4e2b0a789..ad66e4659 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -12,6 +12,8 @@
#include <sys/sysmacros.h>
#include <sys/mman.h>
#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
#include <stdint.h>
#include <pthread.h>
#include <semaphore.h>
@@ -23,11 +25,14 @@
#include "criu-plugin.h"
#include "plugin.h"
#include "criu-amdgpu.pb-c.h"
+#include "util.h"
+#include "util-pie.h"
#include "kfd_ioctl.h"
#include "xmalloc.h"
#include "criu-log.h"
#include "files.h"
+#include "sockets.h"
#include "common/list.h"
#include "amdgpu_plugin_drm.h"
@@ -58,12 +63,18 @@ static int kfd_checkpoint_fd;
static LIST_HEAD(update_vma_info_list);
+static LIST_HEAD(amdgpu_processes);
+
size_t kfd_max_buffer_size;
bool plugin_added_to_inventory = false;
bool plugin_disabled = false;
+int dmabuf_socket_fd = -1;
+
+int current_pid;
+
/**************************************************************************************************/
/* Call ioctl, restarting if it is interrupted */
@@ -503,11 +514,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va,
amdgpu_bo_free(h_bo);
}
-static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
+int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
- uint64_t max_copy_size, enum sdma_op_type type)
+ uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free)
{
- uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
+ uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size;
amdgpu_va_handle h_va_src, h_va_dst, h_va_ib;
amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib;
@@ -520,10 +531,8 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
uint32_t expired;
amdgpu_context_handle h_ctx;
uint32_t *ib = NULL;
- int j, err, shared_fd, packets_per_buffer;
+ int j, err, packets_per_buffer;
- shared_fd = bo_bucket.dmabuf_fd;
- size = bo_bucket.size;
buffer_bo_size = min(size, buffer_size);
packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1;
src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size;
@@ -734,7 +743,8 @@ err_dst_bo_map:
if (err)
pr_perror("dest range free failed");
err_dst_va:
- err = amdgpu_bo_free(h_bo_dst);
+ if (!do_not_free)
+ err = amdgpu_bo_free(h_bo_dst);
if (err)
pr_perror("dest bo free failed");
err_dst_bo_prep:
@@ -822,8 +832,9 @@ void *dump_bo_contents(void *_thread_data)
num_bos++;
/* perform sDMA based vram copy */
- ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
- SDMA_OP_VRAM_READ);
+ ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
+ SDMA_OP_VRAM_READ, false);
+
if (ret) {
pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i);
break;
@@ -920,8 +931,8 @@ void *restore_bo_contents(void *_thread_data)
num_bos++;
- ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
- SDMA_OP_VRAM_WRITE);
+ ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
+ SDMA_OP_VRAM_WRITE, false);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
break;
@@ -1030,6 +1041,124 @@ exit:
return ret;
}
+static void dmabuf_socket_name_gen(struct sockaddr_un *addr, int *len, int pid)
+{
+ addr->sun_family = AF_UNIX;
+ snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-amdgpu-dmabuf-%d-%" PRIx64, pid, criu_run_id);
+ *len = SUN_LEN(addr);
+ *addr->sun_path = '\0';
+}
+
+int amdgpu_make_socket(int pid)
+{
+ int ret = 0;
+ struct amdgpu_process *p;
+ struct sockaddr_un saddr;
+ int sock, slen;
+
+ list_for_each_entry(p, &amdgpu_processes, l) {
+ if (p->pid == pid) {
+ dmabuf_socket_fd = get_unused_high_fd();
+ current_pid = pid;
+
+ sock = socket(PF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
+ if (sock < 0) {
+ pr_perror("Can't create socket");
+ ret = -1;
+ goto out;
+ }
+
+ dmabuf_socket_name_gen(&saddr, &slen, pid);
+ if (bind(sock, (struct sockaddr *)&saddr, slen) < 0) {
+ pr_perror("Can't bind dmabuf socket %s", saddr.sun_path + 1);
+ close(sock);
+ ret = -1;
+ goto out;
+ }
+
+ ret = fcntl(sock, F_DUPFD, dmabuf_socket_fd);
+ if (ret < 0) {
+ close(sock);
+ goto out;
+ } else if (ret != dmabuf_socket_fd) {
+ close(dmabuf_socket_fd);
+ close(sock);
+ ret = -1;
+ goto out;
+ }
+ close(sock);
+ ret = 0;
+ }
+ }
+
+ out:
+
+ return ret;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_EARLY, amdgpu_make_socket)
+
+int serve_out_dmabuf_fd(int handle, int fd)
+{
+ int ret = 0;
+ struct amdgpu_process *p;
+ struct sockaddr_un saddr;
+ int len;
+
+ list_for_each_entry(p, &amdgpu_processes, l) {
+ dmabuf_socket_name_gen(&saddr, &len, p->pid);
+
+ ret = send_fds(dmabuf_socket_fd, &saddr, len, &fd, 1, (void *)&handle, sizeof(handle));
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int amdgpu_collect_file(int pid, int fd)
+{
+ struct amdgpu_process *p;
+
+ list_for_each_entry(p, &amdgpu_processes, l)
+ if (p->pid == pid)
+ return 0;
+
+ p = malloc(sizeof(struct amdgpu_process));
+
+ if (!p)
+ return -ENOMEM;
+
+ p->pid = pid;
+
+ list_add(&p->l, &amdgpu_processes);
+
+ return 0;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__COLLECT_FILE, amdgpu_collect_file)
+
+static int recv_dmabuf_fds(void)
+{
+ int fd, newfd, ret, handle;
+
+ while (true) {
+ ret = __recv_fds(dmabuf_socket_fd, &fd, 1, (void *)&handle, sizeof(handle), MSG_DONTWAIT);
+
+ if (ret == -EAGAIN || ret == -EWOULDBLOCK)
+ return 0;
+ else if (ret)
+ return -1;
+
+ newfd = get_unused_high_fd();
+
+ reopen_fd_as(newfd, fd);
+
+ record_shared_dmabuf_fd(handle, newfd);
+ }
+
+ return 0;
+}
+
static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
CriuKfd *e)
{
@@ -1072,6 +1201,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
{
struct thread_data *thread_datas;
int ret = 0, i;
+ amdgpu_device_handle h_dev;
+ uint32_t major, minor;
pr_debug("Dumping %d BOs\n", args->num_bos);
@@ -1095,6 +1226,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
boinfo->size = bo_bucket->size;
boinfo->offset = bo_bucket->offset;
boinfo->alloc_flags = bo_bucket->alloc_flags;
+
+ ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev);
+
+ boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd);
+
+ amdgpu_device_deinitialize(h_dev);
+ }
+ for (i = 0; i < e->num_of_bos; i++) {
+ KfdBoEntry *boinfo = e->bo_entries[i];
+
+ ret = record_shared_bo(boinfo->handle, false);
+ if (ret)
+ goto exit;
}
for (int i = 0; i < e->num_of_gpus; i++) {
@@ -1431,9 +1575,33 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
plugin_log_msg("BO [%d] gpu_id:%x addr:%llx size:%llx offset:%llx\n", i, bo_bucket->gpu_id,
bo_bucket->addr, bo_bucket->size, bo_bucket->offset);
+
}
pr_info("Restore BOs Ok\n");
+
+ return 0;
+}
+
+int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd)
+{
+ struct vma_metadata *vma_md;
+
+ vma_md = xmalloc(sizeof(*vma_md));
+ if (!vma_md) {
+ return -ENOMEM;
+ }
+
+ memset(vma_md, 0, sizeof(*vma_md));
+
+ vma_md->old_pgoff = offset;
+ vma_md->vma_entry = addr;
+
+ vma_md->new_pgoff = restored_offset;
+ vma_md->fd = fd;
+
+ list_add_tail(&vma_md->list, &update_vma_info_list);
+
return 0;
}
@@ -1567,6 +1735,10 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
if (plugin_disabled)
return -ENOTSUP;
+ ret = recv_dmabuf_fds();
+ if (ret)
+ return ret;
+
pr_info("Initialized kfd plugin restorer with ID = %d\n", id);
snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
@@ -1628,8 +1800,18 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);
fd = node_get_drm_render_device(tp_node);
- if (fd < 0)
+ if (fd < 0) {
pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor);
+ return -1;
+ }
+
+ ret = amdgpu_plugin_drm_restore_file(fd, rd);
+ if (ret == 1)
+ *retry_needed = true;
+ if (ret < 0) {
+ fd = ret;
+ goto fail;
+ }
fail:
criu_render_node__free_unpacked(rd, NULL);
xfree(buf);
@@ -1641,12 +1823,20 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
* copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in
* tp_node.
*/
- fd = dup(fd);
- if (fd == -1) {
- pr_perror("unable to duplicate the render fd");
- return -1;
+
+ if (fd < 0)
+ return fd;
+
+ if (!(*retry_needed)) {
+ fd = dup(fd);
+ if (fd == -1) {
+ pr_perror("unable to duplicate the render fd");
+ return -1;
+ }
+ return fd;
}
- return fd;
+
+ return 0;
}
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
@@ -1690,13 +1880,16 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
* This way, we know that the file descriptors we store will not conflict with file descriptors inside core
* CRIU.
*/
- fd_next = find_unused_fd_pid(e->pid);
- if (fd_next <= 0) {
- pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
- ret = -EINVAL;
- goto exit;
+ if (fd_next == -1) {
+ fd_next = find_unused_fd_pid(e->pid);
+ if (fd_next <= 0) {
+ pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
+ ret = -EINVAL;
+ goto exit;
+ }
}
+
ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology);
if (ret) {
pr_err("Failed to convert stored device information to topology\n");
@@ -1727,14 +1920,26 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
args.num_objects = e->num_of_objects;
args.priv_data_size = e->priv_data.len;
args.priv_data = (uintptr_t)e->priv_data.data;
-
args.op = KFD_CRIU_OP_RESTORE;
+
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
pr_perror("Restore ioctl failed");
ret = -1;
goto exit;
}
+ if (ret < 0)
+ goto exit;
+
+ for (int i = 0; i < args.num_bos; i++) {
+ struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i];
+ KfdBoEntry *bo_entry = e->bo_entries[i];
+
+ if (bo_entry->handle != -1) {
+ serve_out_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd);
+ }
+ }
+
ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e);
if (ret)
goto exit;
@@ -1859,6 +2064,14 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
}
}
+ clear_restore_state();
+ close(dmabuf_socket_fd);
+ while (!list_empty(&amdgpu_processes)) {
+ struct amdgpu_process *st = list_first_entry(&amdgpu_processes, struct amdgpu_process, l);
+ list_del(&st->l);
+ free(st);
+ }
+
close(fd);
return exit_code;
}
diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c
index d54cd937d..43e95d1db 100644
--- a/plugins/amdgpu/amdgpu_plugin_drm.c
+++ b/plugins/amdgpu/amdgpu_plugin_drm.c
@@ -19,20 +19,113 @@
#include <dirent.h>
#include "common/list.h"
+#include "files.h"
#include "criu-amdgpu.pb-c.h"
+#define __user
+#include "drm.h"
#include <xf86drm.h>
#include <libdrm/amdgpu.h>
#include "xmalloc.h"
-#include "criu-log.h"
-#include "kfd_ioctl.h"
+#include "amdgpu_drm.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
+#include "util.h"
+#include "common/scm.h"
+
+int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd)
+{
+ uint32_t handle;
+ int fd = amdgpu_device_get_fd(h_dev);
+
+ if (dmabuf_fd == -1) {
+ return -1;
+ }
+
+ drmPrimeFDToHandle(fd, dmabuf_fd, &handle);
+
+ return handle;
+}
+
+int drmIoctl(int fd, unsigned long request, void *arg)
+{
+ int ret, max_retries = 200;
+
+ do {
+ ret = ioctl(fd, request, arg);
+ } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN));
+
+ if (ret == -1 && errno == EBADF)
+ /* In case pthread_atfork didn't catch it, this will
+ * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
+ */
+ pr_perror("KFD file descriptor not valid in this process");
+ return ret;
+}
+
+static int allocate_bo_entries(CriuRenderNode *e, int num_bos)
+{
+ e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos);
+ if (!e->bo_entries) {
+ pr_err("Failed to allocate bo_info\n");
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i < num_bos; i++) {
+ DrmBoEntry *entry = xzalloc(sizeof(*entry));
+
+ if (!entry) {
+ pr_err("Failed to allocate botest\n");
+ return -ENOMEM;
+ }
+
+ drm_bo_entry__init(entry);
+
+ e->bo_entries[i] = entry;
+ e->n_bo_entries++;
+ }
+ return 0;
+}
+
+static int allocate_vm_entries(CriuRenderNode *e, int num_vms)
+{
+ e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms);
+ if (!e->vm_entries) {
+ pr_err("Failed to allocate bo_info\n");
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i < num_vms; i++) {
+ DrmVmEntry *entry = xzalloc(sizeof(*entry));
+
+ if (!entry) {
+ pr_err("Failed to allocate botest\n");
+ return -ENOMEM;
+ }
+
+ drm_vm_entry__init(entry);
+
+ e->vm_entries[i] = entry;
+ e->n_vm_entries++;
+ }
+ return 0;
+}
+
+static void free_e(CriuRenderNode *e)
+{
+ for (int i = 0; i < e->n_bo_entries; i++) {
+ if (e->bo_entries[i])
+ xfree(e->bo_entries[i]);
+ }
+
+ xfree(e);
+}
+
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
{
char path[PATH_MAX];
@@ -60,19 +153,209 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
return 0;
}
+static int restore_bo_contents_drm(int drm_render_minor, pid_t pid, int drm_fd, uint64_t num_of_bos, struct drm_amdgpu_criu_bo_bucket *bo_buckets)
+{
+ size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size;
+ struct amdgpu_gpu_info gpu_info = { 0 };
+ amdgpu_device_handle h_dev;
+ uint64_t max_copy_size;
+ uint32_t major, minor;
+ FILE *bo_contents_fp = NULL;
+ void *buffer = NULL;
+ char img_path[40];
+ int num_bos = 0;
+ int i, ret = 0;
+
+ ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev);
+ if (ret) {
+ pr_perror("failed to initialize device");
+ goto exit;
+ }
+ plugin_log_msg("libdrm initialized successfully\n");
+
+ ret = amdgpu_query_gpu_info(h_dev, &gpu_info);
+ if (ret) {
+ pr_perror("failed to query gpuinfo via libdrm");
+ goto exit;
+ }
+
+ max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
+ SDMA_LINEAR_COPY_MAX_SIZE - 1;
+
+ for (i = 0; i < num_of_bos; i++) {
+ if (bo_buckets[i].preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) {
+ total_bo_size += bo_buckets[i].size;
+
+ if (bo_buckets[i].size > max_bo_size)
+ max_bo_size = bo_buckets[i].size;
+ }
+ }
+
+ buffer_size = max_bo_size;
+
+ posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
+ if (!buffer) {
+ pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.");
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ for (i = 0; i < num_of_bos; i++) {
+
+ if (!(bo_buckets[i].preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)))
+ continue;
+
+ if (bo_buckets[i].addr == -1)
+ continue;
+
+ num_bos++;
+
+ snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, pid, drm_render_minor, i);
+
+ bo_contents_fp = open_img_file(img_path, false, &image_size);
+
+ ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
+ SDMA_OP_VRAM_WRITE, true);
+ if (ret) {
+ pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
+ break;
+ }
+ plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i);
+
+
+ if (bo_contents_fp)
+ fclose(bo_contents_fp);
+
+ }
+
+exit:
+ for (int i = 0; i < num_of_bos; i++) {
+ if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD)
+ close(bo_buckets[i].dmabuf_fd);
+ }
+
+ xfree(buffer);
+
+ amdgpu_device_deinitialize(h_dev);
+ return ret;
+}
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
{
- CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
- struct tp_node *tp_node;
+ CriuRenderNode *rd = NULL;
char path[PATH_MAX];
unsigned char *buf;
int minor;
int len;
int ret;
+ struct drm_amdgpu_criu_args args = {0};
+ size_t image_size;
+ struct tp_node *tp_node;
+
+ rd = xmalloc(sizeof(*rd));
+ if (!rd) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+ criu_render_node__init(rd);
/* Get the topology node of the DRM device */
minor = minor(drm->st_rdev);
+ rd->drm_render_minor = minor;
+
+ args.op = AMDGPU_CRIU_OP_PROCESS_INFO;
+ if (drmIoctl(fd, DRM_IOCTL_AMDGPU_CRIU_OP, &args) == -1) {
+ pr_perror("Failed to call process info ioctl");
+ ret = -1;
+ goto exit;
+ }
+
+ rd->pid = args.pid;
+ rd->num_of_bos = args.num_bos;
+ rd->num_of_vms = args.num_vms;
+ ret = allocate_bo_entries(rd, args.num_bos);
+ if (ret)
+ goto exit;
+ ret = allocate_vm_entries(rd, args.num_vms);
+ if (ret)
+ goto exit;
+
+ args.bos = (uintptr_t)xzalloc((args.num_bos * sizeof(struct drm_amdgpu_criu_bo_bucket)));
+ if (!args.bos) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ args.vms = (uintptr_t)xzalloc((args.num_vms * sizeof(struct drm_amdgpu_criu_vm_bucket)));
+ if (!args.vms) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ args.op = AMDGPU_CRIU_OP_CHECKPOINT;
+ ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_CRIU_OP, &args);
+ if (ret) {
+ pr_perror("Failed to call dumper (process) ioctl");
+ goto exit;
+ }
+
+ for (int i = 0; i < args.num_bos; i++) {
+ struct drm_amdgpu_criu_bo_bucket bo_bucket = ((struct drm_amdgpu_criu_bo_bucket *)args.bos)[i];
+ uint32_t major, minor;
+ amdgpu_device_handle h_dev;
+ void *buffer = NULL;
+ char img_path[40];
+ FILE *bo_contents_fp = NULL;
+ DrmBoEntry *boinfo = rd->bo_entries[i];
+
+ boinfo->addr = bo_bucket.addr;
+ boinfo->size = bo_bucket.size;
+ boinfo->offset = bo_bucket.offset;
+ boinfo->alloc_flags = bo_bucket.alloc_flags;
+ boinfo->preferred_domains = bo_bucket.preferred_domains;
+
+ ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
+
+ snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->pid, rd->drm_render_minor, i);
+ bo_contents_fp = open_img_file(img_path, true, &image_size);
+
+ posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), bo_bucket.size);
+
+ ret = sdma_copy_bo(bo_bucket.dmabuf_fd, bo_bucket.size, bo_contents_fp, buffer, bo_bucket.size, h_dev, 0x1000,
+ SDMA_OP_VRAM_READ, false);
+
+ boinfo->handle = get_gem_handle(h_dev, bo_bucket.dmabuf_fd);
+ boinfo->is_import = (bo_bucket.flags & AMDGPU_CRIU_BO_FLAG_IS_IMPORT)
+ || shared_bo_has_exporter(boinfo->handle);
+
+ if (bo_bucket.dmabuf_fd != KFD_INVALID_FD)
+ close(bo_bucket.dmabuf_fd);
+
+ if (bo_contents_fp)
+ fclose(bo_contents_fp);
+
+ ret = amdgpu_device_deinitialize(h_dev);
+ if (ret)
+ goto exit;
+ }
+ for (int i = 0; i < args.num_bos; i++) {
+ DrmBoEntry *boinfo = rd->bo_entries[i];
+
+ ret = record_shared_bo(boinfo->handle, boinfo->is_import);
+ if (ret)
+ goto exit;
+ }
+ for (int i = 0; i < args.num_vms; i++) {
+ DrmVmEntry *vminfo = rd->vm_entries[i];
+ struct drm_amdgpu_criu_vm_bucket vm_bucket = ((struct drm_amdgpu_criu_vm_bucket *)args.vms)[i];
+
+ vminfo->start = vm_bucket.start;
+ vminfo->last = vm_bucket.last;
+ vminfo->offset = vm_bucket.offset;
+ vminfo->flags = vm_bucket.flags;
+ vminfo->gem_handle = vm_bucket.gem_handle;
+ }
+
tp_node = sys_get_node_by_render_minor(&src_topology, minor);
if (!tp_node) {
pr_err("Failed to find a device with minor number = %d\n", minor);
@@ -80,21 +363,172 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
}
/* Get the GPU_ID of the DRM device */
- rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
- if (!rd.gpu_id) {
- pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id);
+ rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
+ if (!rd->gpu_id) {
+ pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id);
return -ENODEV;
}
- len = criu_render_node__get_packed_size(&rd);
+ len = criu_render_node__get_packed_size(rd);
buf = xmalloc(len);
if (!buf)
return -ENOMEM;
- criu_render_node__pack(&rd, buf);
+ criu_render_node__pack(rd, buf);
snprintf(path, sizeof(path), IMG_DRM_FILE, id);
ret = write_img_file(path, buf, len);
+
+ exit:
+ xfree((void *)args.bos);
+ xfree((void *)args.vms);
xfree(buf);
+ free_e(rd);
return ret;
}
+
+int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
+{
+ struct drm_amdgpu_criu_args args = {0};
+ int ret = 0;
+ bool retry_needed = false;
+ uint32_t major, minor;
+ amdgpu_device_handle h_dev;
+ int device_fd;
+
+ args.num_bos = rd->num_of_bos;
+ args.num_vms = rd->num_of_vms;
+ args.bos = (uint64_t)xzalloc(sizeof(struct drm_amdgpu_criu_bo_bucket) * rd->num_of_bos);
+
+ ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
+ if (ret) {
+ pr_info("Error in init amdgpu device\n");
+ goto exit;
+ }
+
+ device_fd = amdgpu_device_get_fd(h_dev);
+
+ for (int i = 0; i < args.num_bos; i++) {
+ struct drm_amdgpu_criu_bo_bucket *bo_bucket = &((struct drm_amdgpu_criu_bo_bucket *)args.bos)[i];
+ DrmBoEntry *boinfo = rd->bo_entries[i];
+ int dmabuf_fd = -1;
+ uint32_t handle;
+ struct drm_prime_change_gem_handle change_args = {0};
+ union drm_amdgpu_gem_mmap mmap_args = {0};
+ struct drm_amdgpu_gem_va va_args = {0};
+
+ bo_bucket->addr = boinfo->addr;
+
+ if (work_already_completed(boinfo->handle, rd->drm_render_minor)) {
+ bo_bucket->addr = -1;
+ continue;
+ } else if (boinfo->handle != -1) {
+ if (boinfo->is_import) {
+ dmabuf_fd = dmabuf_fd_for_handle(boinfo->handle);
+ if (dmabuf_fd == -1) {
+ bo_bucket->addr = -1;
+ continue;
+ }
+ }
+ }
+
+ bo_bucket->dmabuf_fd = dmabuf_fd;
+ bo_bucket->size = boinfo->size;
+ bo_bucket->preferred_domains = boinfo->preferred_domains;
+
+ if (boinfo->is_import) {
+ drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle);
+ } else {
+ union drm_amdgpu_gem_create create_args = {0};
+
+ create_args.in.bo_size = boinfo->size;
+ create_args.in.alignment = 0x1000;
+ create_args.in.domains = boinfo->preferred_domains;
+ create_args.in.domain_flags = boinfo->alloc_flags;
+
+ if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) {
+ pr_perror("Error Failed to call create ioctl");
+ ret = -1;
+ goto exit;
+ }
+ handle = create_args.out.handle;
+
+ drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd);
+ }
+
+ change_args.handle = handle;
+ change_args.new_handle = boinfo->handle;
+
+ if (drmIoctl(fd, DRM_IOCTL_PRIME_CHANGE_GEM_HANDLE, &change_args) == -1) {
+ pr_perror("Error Failed to call change ioctl");
+ ret = -1;
+ goto exit;
+ }
+
+ if (!boinfo->is_import)
+ serve_out_dmabuf_fd(boinfo->handle, dmabuf_fd);
+
+ bo_bucket->dmabuf_fd = dmabuf_fd;
+
+ ret = record_completed_work(boinfo->handle, rd->drm_render_minor);
+ if (ret)
+ goto exit;
+
+ mmap_args.in.handle = boinfo->handle;
+ if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
+ pr_perror("Error Failed to call mmap ioctl");
+ ret = -1;
+ goto exit;
+ }
+
+ for (int j = 0; j < args.num_vms; j++) {
+ DrmVmEntry *vminfo = rd->vm_entries[j];
+
+ if (vminfo->gem_handle != boinfo->handle)
+ continue;
+
+ va_args.handle = boinfo->handle;
+ va_args.operation = AMDGPU_VA_OP_MAP;
+ va_args.flags = vminfo->flags;
+ va_args.va_address = vminfo->start * 0x1000;
+ va_args.offset_in_bo = vminfo->offset;
+ va_args.map_size = (vminfo->last - vminfo->start + 1) * 0x1000;
+
+
+ if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) {
+ pr_perror("Error Failed to call mmap ioctl");
+ ret = -1;
+ goto exit;
+ }
+
+ }
+
+ ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd);
+ if (ret < 0)
+ goto exit;
+
+ }
+
+ if (ret) {
+ pr_info("Error in deinit amdgpu device\n");
+ goto exit;
+ }
+
+ ret = record_completed_work(-1, rd->drm_render_minor);
+ if (ret)
+ goto exit;
+
+ ret = amdgpu_device_deinitialize(h_dev);
+
+ if (args.num_bos > 0) {
+ ret = restore_bo_contents_drm(rd->drm_render_minor, rd->pid, fd, args.num_bos, (struct drm_amdgpu_criu_bo_bucket *)args.bos);
+ if (ret)
+ goto exit;
+ }
+
+ exit:
+ if (ret < 0)
+ return ret;
+
+ return retry_needed;
+}
diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h
index 6f0c1a9a6..3dd4499a6 100644
--- a/plugins/amdgpu/amdgpu_plugin_drm.h
+++ b/plugins/amdgpu/amdgpu_plugin_drm.h
@@ -24,5 +24,13 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
*/
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);
+int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd);
+
+int amdgpu_plugin_drm_unpause_file(int fd);
+
+int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd);
+
+int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id);
+
#endif /* __AMDGPU_PLUGIN_DRM_H__ */
diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c
index a165fc9cd..4b3ae0cdd 100644
--- a/plugins/amdgpu/amdgpu_plugin_util.c
+++ b/plugins/amdgpu/amdgpu_plugin_util.c
@@ -41,6 +41,10 @@
/* Tracks number of device files that need to be checkpointed */
static int dev_file_cnt = 0;
+static LIST_HEAD(shared_bos);
+static LIST_HEAD(shared_dmabuf_fds);
+static LIST_HEAD(completed_work);
+
/* Helper structures to encode device topology of SRC and DEST platforms */
struct tp_system src_topology;
struct tp_system dest_topology;
@@ -68,18 +72,121 @@ void init_gpu_count(struct tp_system *topo)
dev_file_cnt = 1 + topology_gpu_count(topo);
}
-int read_fp(FILE *fp, void *buf, const size_t buf_len)
-{
- size_t len_read;
+bool shared_bo_has_exporter(int handle) {
+ struct shared_bo *bo;
- len_read = fread(buf, 1, buf_len, fp);
- if (len_read != buf_len) {
- pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len);
- return -EIO;
+ if (handle == -1) {
+ return false;
+ }
+
+ list_for_each_entry(bo, &shared_bos, l) {
+ if (bo->handle == handle) {
+ return bo->has_exporter;
+ }
+ }
+
+ return false;
+}
+
+int record_shared_bo(int handle, bool is_imported) {
+ struct shared_bo *bo;
+
+ if (handle == -1)
+ return 0;
+
+ list_for_each_entry(bo, &shared_bos, l) {
+ if (bo->handle == handle) {
+ return 0;
+ }
+ }
+ bo = malloc(sizeof(struct shared_bo));
+ if (!bo)
+ return -1;
+ bo->handle = handle;
+ bo->has_exporter = !is_imported;
+ list_add(&bo->l, &shared_bos);
+
+ return 0;
+}
+
+int record_shared_dmabuf_fd(int handle, int dmabuf_fd) {
+ struct shared_dmabuf *bo;
+
+ bo = malloc(sizeof(struct shared_dmabuf));
+ if(!bo)
+ return -1;
+ bo->handle = handle;
+ bo->dmabuf_fd = dmabuf_fd;
+ list_add(&bo->l, &shared_dmabuf_fds);
+
+ return 0;
+}
+
+int dmabuf_fd_for_handle(int handle) {
+ struct shared_dmabuf *bo;
+
+ list_for_each_entry(bo, &shared_dmabuf_fds, l) {
+ if (bo->handle == handle) {
+ return bo->dmabuf_fd;
+ }
}
+
+ return -1;
+}
+
+int record_completed_work(int handle, int id) {
+ struct restore_completed_work *work;
+
+ work = malloc(sizeof(struct restore_completed_work));
+ if (!work)
+ return -1;
+ work->handle = handle;
+ work->id = id;
+ list_add(&work->l, &completed_work);
+
return 0;
}
+bool work_already_completed(int handle, int id) {
+ struct restore_completed_work *work;
+
+ list_for_each_entry(work, &completed_work, l) {
+ if (work->handle == handle && work->id == id) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void clear_restore_state() {
+ while (!list_empty(&shared_dmabuf_fds)) {
+ struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l);
+ list_del(&st->l);
+ close(st->dmabuf_fd);
+ free(st);
+ }
+
+ while (!list_empty(&completed_work)) {
+ struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l);
+ list_del(&st->l);
+ free(st);
+ }
+}
+
+int read_fp(FILE *fp, void *buf, const size_t buf_len)
+{
+ size_t len_read;
+
+ len_read = fread(buf, 1, buf_len, fp);
+ if (len_read != buf_len) {
+ pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len);
+ return -EIO;
+
+ }
+ return 0;
+}
+
int write_fp(FILE *fp, const void *buf, const size_t buf_len)
{
size_t len_write;
diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h
index aacca3a28..bd23fc6d4 100644
--- a/plugins/amdgpu/amdgpu_plugin_util.h
+++ b/plugins/amdgpu/amdgpu_plugin_util.h
@@ -1,6 +1,8 @@
#ifndef __AMDGPU_PLUGIN_UTIL_H__
#define __AMDGPU_PLUGIN_UTIL_H__
+#include <libdrm/amdgpu.h>
+
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
@@ -52,7 +54,7 @@
#define IMG_DRM_FILE "amdgpu-renderD-%d.img"
/* Name of file having serialized data of DRM device buffer objects (BOs) */
-#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img"
+#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"
/* Helper macros to Checkpoint and Restore a ROCm file */
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
@@ -73,6 +75,35 @@ enum sdma_op_type {
SDMA_OP_VRAM_WRITE,
};
+struct dumped_fd {
+ struct list_head l;
+ int fd;
+ bool is_drm;
+};
+
+struct shared_bo {
+ struct list_head l;
+ int handle;
+ bool has_exporter;
+};
+
+struct shared_dmabuf {
+ struct list_head l;
+ int handle;
+ int dmabuf_fd;
+};
+
+struct restore_completed_work {
+ struct list_head l;
+ int handle;
+ int id;
+};
+
+struct amdgpu_process {
+ struct list_head l;
+ int pid;
+};
+
/* Helper structures to encode device topology of SRC and DEST platforms */
extern struct tp_system src_topology;
extern struct tp_system dest_topology;
@@ -101,6 +132,23 @@ bool checkpoint_is_complete();
void decrement_checkpoint_count();
void init_gpu_count(struct tp_system *topology);
+bool shared_bo_has_exporter(int handle);
+int record_shared_bo(int handle, bool is_imported);
+
+int record_shared_dmabuf_fd(int handle, int dmabuf_fd);
+int dmabuf_fd_for_handle(int handle);
+
+int record_completed_work(int handle, int id);
+bool work_already_completed(int handle, int id);
+
+void clear_restore_state();
+
void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list);
+int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
+ void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
+ uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free);
+
+int serve_out_dmabuf_fd(int handle, int fd);
+
#endif /* __AMDGPU_PLUGIN_UTIL_H__ */
diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto
index 078b67650..8f198410b 100644
--- a/plugins/amdgpu/criu-amdgpu.proto
+++ b/plugins/amdgpu/criu-amdgpu.proto
@@ -46,6 +46,7 @@ message kfd_bo_entry {
required uint64 offset = 3;
required uint32 alloc_flags = 4;
required uint32 gpu_id = 5;
+ required uint32 handle = 6;
}
message criu_kfd {
@@ -61,6 +62,30 @@ message criu_kfd {
required bytes priv_data = 10;
}
+message drm_bo_entry {
+ required uint64 addr = 1;
+ required uint64 size = 2;
+ required uint64 offset = 3;
+ required uint64 alloc_flags = 4;
+ required uint32 preferred_domains = 5;
+ required uint32 handle = 6;
+ required uint32 is_import = 7;
+}
+
+message drm_vm_entry {
+ required uint64 start = 1;
+ required uint64 last = 2;
+ required uint64 offset = 3;
+ required uint64 flags = 4;
+ required uint32 gem_handle = 5;
+}
+
message criu_render_node {
required uint32 gpu_id = 1;
+ required uint32 pid = 2;
+ required uint32 drm_render_minor = 3;
+ required uint64 num_of_bos = 4;
+ repeated drm_bo_entry bo_entries = 5;
+ required uint32 num_of_vms = 6;
+ repeated drm_vm_entry vm_entries = 7;
}
--
2.34.1