[CRIU] [RFC PATCH 17/20] criu/plugin: Remap GPUs on checkpoint restore

Felix Kuehling Felix.Kuehling at amd.com
Sat May 1 04:58:42 MSK 2021


From: David Yat Sin <david.yatsin at amd.com>

The device topology on the restore node can be different from the
topology on the checkpointed node. The GPUs on the restore node may
have different gpu_ids or minor numbers, and some GPUs may have
different properties than on the checkpointed node. During restore,
the CRIU plugin maps each checkpointed GPU to a matching target GPU
to avoid restore failures caused by trying to restore a process on a
GPU with different properties.
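
For reference, here is a condensed, standalone sketch of the mapping approach
introduced below: every gpu_id found in the checkpointed image is translated
through a small src->dest table (struct gpu_id_maps / get_gpu_map() in the
plugin). This is a simplified illustration, not the plugin code itself; the
NUM_OF_SUPPORTED_GPUS value, the main() driver and the example gpu_ids are
placeholders.

#include <stdint.h>
#include <stdio.h>

#define NUM_OF_SUPPORTED_GPUS 8	/* placeholder value for this sketch */

struct gpu_id_maps {
	uint32_t num_devices;
	struct gpu_id_map {
		uint32_t src;	/* gpu_id recorded in the checkpoint image */
		uint32_t dest;	/* gpu_id of the matching device on this node */
	} maps[NUM_OF_SUPPORTED_GPUS];
};

/* Return 0 and fill *dest if a mapping for src exists, -1 otherwise. */
static int get_gpu_map(struct gpu_id_maps *gpu_maps, uint32_t src, uint32_t *dest)
{
	for (uint32_t i = 0; i < gpu_maps->num_devices; i++) {
		if (gpu_maps->maps[i].src == src) {
			*dest = gpu_maps->maps[i].dest;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	/* Example: GPU 0x6f0c in the image maps to GPU 0x8d2b on this node. */
	struct gpu_id_maps restore_maps = {
		.num_devices = 1,
		.maps = { { .src = 0x6f0c, .dest = 0x8d2b } },
	};
	uint32_t target;

	if (!get_gpu_map(&restore_maps, 0x6f0c, &target))
		printf("restore on gpu_id 0x%04x\n", (unsigned)target);
	return 0;
}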

Signed-off-by: David Yat Sin <david.yatsin at amd.com>
---
 Documentation/kfd_plugin.txt       |   2 +-
 test/others/ext-kfd/criu-kfd.proto |   2 +-
 test/others/ext-kfd/kfd_plugin.c   | 439 ++++++++++++++++++++++++-----
 3 files changed, 364 insertions(+), 79 deletions(-)

diff --git a/Documentation/kfd_plugin.txt b/Documentation/kfd_plugin.txt
index 4caf489c1..9dce736fc 100644
--- a/Documentation/kfd_plugin.txt
+++ b/Documentation/kfd_plugin.txt
@@ -9,7 +9,7 @@ userspace for AMD GPUs.
 
 CURRENT SUPPORT
 ---------------
-Single GPU systems (Gfx9)
+Single and Multi GPU systems (Gfx9)
 Checkpoint / Restore on same system
 Checkpoint / Restore inside a docker container
 Pytorch
diff --git a/test/others/ext-kfd/criu-kfd.proto b/test/others/ext-kfd/criu-kfd.proto
index 995163c41..6eed456c4 100644
--- a/test/others/ext-kfd/criu-kfd.proto
+++ b/test/others/ext-kfd/criu-kfd.proto
@@ -103,5 +103,5 @@ message criu_kfd {
 }
 
 message criu_render_node {
-        required uint32 minor_number = 1;
+	required uint32 gpu_id = 1;
 }
diff --git a/test/others/ext-kfd/kfd_plugin.c b/test/others/ext-kfd/kfd_plugin.c
index 1d7e5745c..5e6247492 100644
--- a/test/others/ext-kfd/kfd_plugin.c
+++ b/test/others/ext-kfd/kfd_plugin.c
@@ -50,6 +50,7 @@ struct vma_metadata {
 	uint64_t old_pgoff;
 	uint64_t new_pgoff;
 	uint64_t vma_entry;
+	uint32_t new_minor;
 };
 
 static LIST_HEAD(update_vma_info_list);
@@ -93,6 +94,21 @@ struct tp_system {
 };
 
 struct tp_system src_topology;  /* Valid during dump */
+struct tp_system dest_topology; /* Valid during restore */
+
+struct gpu_id_maps {
+	uint32_t num_devices;
+	struct gpu_id_map {
+		uint32_t src;
+		uint32_t dest;
+	} maps[NUM_OF_SUPPORTED_GPUS];
+};
+
+/* Valid during dump, map of actual gpu_id to user gpu_id */
+struct gpu_id_maps checkpoint_maps;
+
+/* Valid during restore, map of gpu_id on checkpointed node to gpu_id on current node */
+struct gpu_id_maps restore_maps;
 
 struct tp_device *get_tp_device_by_render_minor(struct tp_system *sys, int drm_render_minor)
 {
@@ -341,6 +357,121 @@ int parse_topology(struct tp_system *topology)
 	return 0;
 }
 
+int get_gpu_map(struct gpu_id_maps *gpu_maps, uint32_t src, uint32_t *dest)
+{
+	/* If we have an existing mapping for this gpu_id, return it */
+	for (int i = 0; i < gpu_maps->num_devices; i++) {
+		if (gpu_maps->maps[i].src == src) {
+			*dest = gpu_maps->maps[i].dest;
+			return 0;
+		}
+	}
+	pr_err("Failed to find destination GPU ID for 0x%04x (num_devices:%d)\n", src, gpu_maps->num_devices);
+	return -1;
+}
+
+bool device_match(DevinfoEntry *src_dev, struct tp_device *tp_dev)
+{
+	if (src_dev->cpu_cores_count == tp_dev->cpu_cores_count &&
+		src_dev->simd_count == tp_dev->simd_count &&
+		src_dev->mem_banks_count == tp_dev->mem_banks_count &&
+		src_dev->io_links_count == tp_dev->io_links_count &&
+		src_dev->max_waves_per_simd == tp_dev->max_waves_per_simd &&
+		src_dev->lds_size_in_kb == tp_dev->lds_size_in_kb &&
+		src_dev->num_gws == tp_dev->num_gws &&
+		src_dev->wave_front_size == tp_dev->wave_front_size &&
+		src_dev->array_count == tp_dev->array_count &&
+		src_dev->simd_arrays_per_engine == tp_dev->simd_arrays_per_engine &&
+		src_dev->cu_per_simd_array == tp_dev->cu_per_simd_array &&
+		src_dev->simd_per_cu == tp_dev->simd_per_cu &&
+		src_dev->max_slots_scratch_cu == tp_dev->max_slots_scratch_cu &&
+		src_dev->vendor_id == tp_dev->vendor_id &&
+		src_dev->device_id == tp_dev->device_id &&
+		src_dev->num_sdma_engines == tp_dev->num_sdma_engines &&
+		src_dev->num_sdma_xgmi_engines == tp_dev->num_sdma_xgmi_engines &&
+		src_dev->num_sdma_queues_per_engine == tp_dev->num_sdma_queues_per_engine &&
+		src_dev->num_cp_queues == tp_dev->num_cp_queues &&
+		src_dev->capability == tp_dev->capability &&
+		src_dev->sdma_fw_version == tp_dev->sdma_fw_version &&
+		src_dev->caches_count <= tp_dev->caches_count &&
+		src_dev->fw_version <= tp_dev->fw_version) {
+
+		return true;
+	}
+	return false;
+}
+
+void print_required_properties(DevinfoEntry *src_dev)
+{
+	pr_err("===Required properties==================================================\n");
+	pr_err("      cpu_cores_count:%u simd_count:%u mem_banks_count:%u caches_count:%u\n",
+			src_dev->cpu_cores_count, src_dev->simd_count,
+			src_dev->mem_banks_count, src_dev->caches_count);
+	pr_err("      io_links_count:%u max_waves_per_simd:%u lds_size_in_kb:%u\n",
+			src_dev->io_links_count, src_dev->max_waves_per_simd,
+			src_dev->lds_size_in_kb);
+	pr_err("      num_gws:%u wave_front_size:%u array_count:%u\n",
+			src_dev->num_gws, src_dev->wave_front_size, src_dev->array_count);
+	pr_err("      simd_arrays_per_engine:%u cu_per_simd_array:%u simd_per_cu:%u\n",
+			src_dev->simd_arrays_per_engine, src_dev->cu_per_simd_array,
+			src_dev->simd_per_cu);
+	pr_err("      max_slots_scratch_cu:%u vendor_id:%u device_id:%u\n",
+			src_dev->max_slots_scratch_cu, src_dev->vendor_id, src_dev->device_id);
+	pr_err("      num_sdma_engines:%u num_sdma_xgmi_engines:%u num_sdma_queues_per_engine:%u\n",
+			src_dev->num_sdma_engines, src_dev->num_sdma_xgmi_engines,
+			src_dev->num_sdma_queues_per_engine);
+	pr_err("      num_cp_queues:%u fw_version:%u capability:%u sdma_fw_version:%u\n",
+			src_dev->num_cp_queues, src_dev->fw_version, src_dev->capability,
+			src_dev->sdma_fw_version);
+	pr_err("========================================================================\n");
+}
+
+/* Parse the local system topology and compare it with the checkpointed devices so we can build
+ * the set of gpu_id maps used to pick local target GPUs */
+int set_restore_gpu_maps(struct gpu_id_maps *gpu_maps, DevinfoEntry *src_devs[],
+                               uint32_t num_devices, struct tp_system *topo)
+{
+	int i,j;
+	bool matched_devices[NUM_OF_SUPPORTED_GPUS];
+
+	if (parse_topology(topo))
+		return -EFAULT;
+
+	if (topo->num_nodes != num_devices) {
+		pr_err("Number of devices mismatch (local:%d checkpointed:%d)\n",
+						topo->num_nodes, num_devices);
+		return -EINVAL;
+	}
+
+	memset(matched_devices, 0, sizeof(matched_devices));
+	gpu_maps->num_devices = num_devices;
+
+	for (i = 0; i < num_devices; i++) {
+		for (j = 0; j < num_devices; j++) {
+			if (matched_devices[j])
+				continue;
+
+			if (device_match(src_devs[i], &topo->devs[j])) {
+				matched_devices[j] = true;
+				gpu_maps->maps[i].src = src_devs[i]->gpu_id;
+				gpu_maps->maps[i].dest = topo->devs[j].gpu_id;
+				pr_info("Matched gpu 0x%04x->0x%04x\n", gpu_maps->maps[i].src,
+						gpu_maps->maps[i].dest);
+				break;
+			}
+		}
+
+		if (j < num_devices)
+			continue;
+
+		pr_err("No matching destination GPU for gpu_id = 0x%04x\n", src_devs[i]->gpu_id);
+		print_required_properties(src_devs[i]);
+
+		return -ENOTSUP;
+	}
+	return 0;
+}
+
 int open_drm_render_device(int minor)
 {
 	char path[128];
@@ -581,6 +712,9 @@ int kfd_plugin_init(int stage)
 						CR_PLUGIN_DESC.name);
 
 	memset(&src_topology, 0, sizeof(src_topology));
+	memset(&dest_topology, 0, sizeof(dest_topology));
+	memset(&checkpoint_maps, 0, sizeof(checkpoint_maps));
+	memset(&restore_maps, 0, sizeof(restore_maps));
 	return 0;
 }
 
@@ -599,12 +733,10 @@ int kfd_plugin_dump_file(int fd, int id)
 	struct kfd_criu_bo_buckets *bo_bucket_ptr;
 	struct kfd_criu_q_bucket *q_bucket_ptr;
 	struct kfd_criu_ev_bucket *ev_buckets_ptr = NULL;
-	int ret, drm_fd;
+	int ret;
 	char img_path[PATH_MAX];
 	struct stat st, st_kfd;
 	unsigned char *buf;
-	char fd_path[128];
-	void *addr;
 	size_t len;
 
 	printf("kfd_plugin: Enter cr_plugin_dump_file()- ID = 0x%x\n", id);
@@ -623,21 +755,28 @@ int kfd_plugin_dump_file(int fd, int id)
 	}
 
 	if (parse_topology(&src_topology))
-		return HSAKMT_STATUS_ERROR;
+		return -1;
 
 	/* Check whether this plugin was called for kfd or render nodes */
 	if (major(st.st_rdev) != major(st_kfd.st_rdev) ||
 		 minor(st.st_rdev) != 0) {
-		/* This is RenderD dumper plugin, for now just save renderD
-		 * minor number to be used during restore. In later phases this
-		 * needs to save more data for video decode etc.
-		 */
-
+		/* This is the RenderD dumper plugin; save the gpu_id to be used during restore */
 		CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
-		pr_info("kfd_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id);
+		struct tp_device *tp_dev;
 
-		rd.minor_number = minor(st.st_rdev);
-		snprintf(img_path, sizeof(img_path), "renderDXXX.%d.img", id);
+		pr_info("kfd_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n",
+			minor(st.st_rdev), fd, id);
+
+		tp_dev = get_tp_device_by_render_minor(&src_topology, minor(st.st_rdev));
+		if (!tp_dev) {
+			pr_err("kfd_plugin: Failed to find a device with minor number = %d\n",
+				minor(st.st_rdev));
+
+			return -EFAULT;
+		}
+
+		if (get_gpu_map(&checkpoint_maps, tp_dev->gpu_id, &rd.gpu_id))
+			return -EFAULT;
 
 		len = criu_render_node__get_packed_size(&rd);
 		buf = xmalloc(len);
@@ -645,14 +784,17 @@ int kfd_plugin_dump_file(int fd, int id)
 			return -ENOMEM;
 
 		criu_render_node__pack(&rd, buf);
+
+		snprintf(img_path, sizeof(img_path), "renderDXXX.%d.img", id);
 		ret = write_file(img_path,  buf, len);
-		if (ret)
-			ret = -1;
+		if (ret) {
+			xfree(buf);
+			return ret;
+		}
 
 		xfree(buf);
 
-		/* Need to return success here so that criu can call plugins for
-		 * renderD nodes */
+		/* Need to return success here so that criu can call plugins for renderD nodes */
 		return ret;
 	}
 
@@ -746,15 +888,13 @@ int kfd_plugin_dump_file(int fd, int id)
 	/* When checkpointing on a node where there was already a checkpoint-restore before, the
 	 * user_gpu_id and actual_gpu_id will be different.
 	 *
-	 * For now, we assume the user_gpu_id and actual_gpu_id is the same. Once we support
-	 * restoring on a different node, then we will have a user_gpu_id to actual_gpu_id mapping.
-	 */
+	 * We store the user_gpu_id in the image files so that the stored images always have
+	 * the gpu_ids of the node where the application was first launched. */
+
+	checkpoint_maps.num_devices = args.num_of_devices;
 	for (int i = 0; i < args.num_of_devices; i++) {
-		e->devinfo_entries[i]->gpu_id = devinfo_bucket_ptr[i].user_gpu_id;
-		if (devinfo_bucket_ptr[i].user_gpu_id != devinfo_bucket_ptr[i].actual_gpu_id) {
-			pr_err("Checkpoint-Restore on different node not supported yet\n");
-			ret = -ENOTSUP;
-		}
+		checkpoint_maps.maps[i].src = devinfo_bucket_ptr[i].actual_gpu_id;
+		checkpoint_maps.maps[i].dest = devinfo_bucket_ptr[i].user_gpu_id;
 	}
 
 	/* Store local topology information */
@@ -777,7 +917,6 @@ int kfd_plugin_dump_file(int fd, int id)
 		e->devinfo_entries[i]->simd_id_base = dev->simd_id_base;
 		e->devinfo_entries[i]->max_waves_per_simd = dev->max_waves_per_simd;
 		e->devinfo_entries[i]->lds_size_in_kb = dev->lds_size_in_kb;
-		e->devinfo_entries[i]->gds_size_in_kb = dev->gds_size_in_kb;
 		e->devinfo_entries[i]->num_gws = dev->num_gws;
 		e->devinfo_entries[i]->wave_front_size = dev->wave_front_size;
 		e->devinfo_entries[i]->array_count = dev->array_count;
@@ -806,23 +945,21 @@ int kfd_plugin_dump_file(int fd, int id)
 	if (ret)
 		return -1;
 
-	sprintf(fd_path, "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE);
-	drm_fd = open(fd_path, O_RDWR | O_CLOEXEC);
-	if (drm_fd < 0) {
-		pr_perror("kfd_plugin: failed to open drm fd for %s\n", fd_path);
-		return -1;
-	}
-
 	for (int i = 0; i < helper_args.num_of_bos; i++)
 	{
 		(e->bo_info_test[i])->bo_addr = (bo_bucket_ptr)[i].bo_addr;
 		(e->bo_info_test[i])->bo_size = (bo_bucket_ptr)[i].bo_size;
 		(e->bo_info_test[i])->bo_offset = (bo_bucket_ptr)[i].bo_offset;
-		(e->bo_info_test[i])->gpu_id = (bo_bucket_ptr)[i].gpu_id;
 		(e->bo_info_test[i])->bo_alloc_flags = (bo_bucket_ptr)[i].bo_alloc_flags;
 		(e->bo_info_test[i])->idr_handle = (bo_bucket_ptr)[i].idr_handle;
 		(e->bo_info_test[i])->user_addr = (bo_bucket_ptr)[i].user_addr;
 
+		if (get_gpu_map(&checkpoint_maps, bo_bucket_ptr[i].gpu_id,
+			&e->bo_info_test[i]->gpu_id)) {
+			ret = -EFAULT;
+			goto failed;
+		}
+
 		if ((bo_bucket_ptr)[i].bo_alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
 			pr_info("VRAM BO Found\n");
 		}
@@ -840,7 +977,24 @@ int kfd_plugin_dump_file(int fd, int id)
 
 			if ((e->bo_info_test[i])->bo_alloc_flags &
 			    KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) {
-				pr_info("kfd_plugin: large bar read possible\n");
+				int drm_fd;
+				void *addr;
+				struct tp_device *dev;
+
+				plugin_log_msg("kfd_plugin: large bar read possible\n");
+
+				dev = get_tp_device_by_gpu_id(&src_topology, bo_bucket_ptr[i].gpu_id);
+				if (!dev) {
+					ret = -EFAULT;
+					goto failed;
+				}
+
+				drm_fd = open_drm_render_device(dev->drm_render_minor);
+				if (drm_fd < 0) {
+					ret = -EFAULT;
+					goto failed;
+				}
+
 				addr = mmap(NULL,
 					    (bo_bucket_ptr)[i].bo_size,
 					    PROT_READ,
@@ -858,9 +1012,9 @@ int kfd_plugin_dump_file(int fd, int id)
 				memcpy((e->bo_info_test[i])->bo_rawdata.data,
 				       addr, bo_bucket_ptr[i].bo_size);
 				munmap(addr, bo_bucket_ptr[i].bo_size);
-
+				close(drm_fd);
 			} else {
-				pr_info("Now try reading BO contents with /proc/pid/mem");
+				plugin_log_msg("Now try reading BO contents with /proc/pid/mem");
 				if (asprintf (&fname, PROCPIDMEM, e->pid) < 0) {
 					pr_perror("failed in asprintf, %s\n", fname);
 					ret = -1;
@@ -896,8 +1050,6 @@ int kfd_plugin_dump_file(int fd, int id)
 			} /* PROCPIDMEM read done */
 		}
 	}
-	close(drm_fd);
-
 	e->num_of_bos = helper_args.num_of_bos;
 
 	plugin_log_msg("Dumping bo_info_test \n");
@@ -943,7 +1095,13 @@ int kfd_plugin_dump_file(int fd, int id)
 			q_bucket_ptr[i].q_id,
 			q_bucket_ptr[i].q_address);
 
-		e->q_entries[i]->gpu_id = q_bucket_ptr[i].gpu_id;
+		if (get_gpu_map(&checkpoint_maps, q_bucket_ptr[i].gpu_id,
+				&e->q_entries[i]->gpu_id)) {
+
+			ret = -EFAULT;
+			goto failed;
+		}
+
 		e->q_entries[i]->type = q_bucket_ptr[i].type;
 		e->q_entries[i]->format = q_bucket_ptr[i].format;
 		e->q_entries[i]->q_id = q_bucket_ptr[i].q_id;
@@ -996,8 +1154,14 @@ int kfd_plugin_dump_file(int fd, int id)
 					ev_buckets_ptr[i].memory_exception_data.failure.NoExecute;
 				e->ev_entries[i]->mem_exc_va =
 					ev_buckets_ptr[i].memory_exception_data.va;
-				e->ev_entries[i]->mem_exc_gpu_id =
-					ev_buckets_ptr[i].memory_exception_data.gpu_id;
+				if (ev_buckets_ptr[i].memory_exception_data.gpu_id) {
+					if (get_gpu_map(&checkpoint_maps,
+							ev_buckets_ptr[i].memory_exception_data.gpu_id,
+							&e->ev_entries[i]->mem_exc_gpu_id)) {
+						ret = -EFAULT;
+						goto failed;
+					}
+				}
 			} else if (e->ev_entries[i]->type == KFD_IOC_EVENT_HW_EXCEPTION) {
 				e->ev_entries[i]->hw_exc_reset_type =
 					ev_buckets_ptr[i].hw_exception_data.reset_type;
@@ -1005,8 +1169,14 @@ int kfd_plugin_dump_file(int fd, int id)
 					ev_buckets_ptr[i].hw_exception_data.reset_cause;
 				e->ev_entries[i]->hw_exc_memory_lost =
 					ev_buckets_ptr[i].hw_exception_data.memory_lost;
-				e->ev_entries[i]->hw_exc_gpu_id =
-					ev_buckets_ptr[i].hw_exception_data.gpu_id;
+				if (ev_buckets_ptr[i].hw_exception_data.gpu_id) {
+					if (get_gpu_map(&checkpoint_maps,
+							ev_buckets_ptr[i].hw_exception_data.gpu_id,
+							&e->ev_entries[i]->hw_exc_gpu_id)) {
+						ret = -EFAULT;
+						goto failed;
+					}
+				}
 			}
 		}
 	}
@@ -1067,14 +1237,15 @@ int kfd_plugin_restore_file(int id)
 	snprintf(img_path, sizeof(img_path), "kfd.%d.img", id);
 
 	if (stat(img_path, &filestat) == -1) {
+		struct tp_device *tp_dev;
+		uint32_t target_gpu_id;
+
 		pr_perror("open(%s)", img_path);
 
-		/* This is restorer plugin for renderD nodes. Since criu doesn't
-		 * gurantee that they will be called before the plugin is called
-		 * for kfd file descriptor, we need to make sure we open the render
-		 * nodes only once and before /dev/kfd is open, the render nodes
-		 * are open too. Generally, it is seen that during checkpoint and
-		 * restore both, the kfd plugin gets called first.
+		/* This is the restorer plugin for renderD nodes. CRIU doesn't guarantee that they
+		 * will be called before the plugin is called for the kfd file descriptor.
+		 * TODO: Currently, this code only works if this function is called for /dev/kfd
+		 * first, as we assume restore_maps is already filled. Need to fix this later.
 		 */
 		snprintf(img_path, sizeof(img_path), "renderDXXX.%d.img", id);
 
@@ -1100,12 +1271,30 @@ int kfd_plugin_restore_file(int id)
 		rd = criu_render_node__unpack(NULL, filestat.st_size, buf);
 		if (rd == NULL) {
 			pr_perror("Unable to parse the KFD message %d", id);
-			xfree(buf);
-			return -1;
+			fd = -EBADFD;
+			goto fail;
+		}
+
+		pr_info("kfd_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id);
+
+		if (get_gpu_map(&restore_maps, rd->gpu_id, &target_gpu_id)) {
+			fd = -EBADFD;
+			goto fail;
 		}
 
-		pr_info("kfd_plugin: render node minor num = %d\n", rd->minor_number);
-		fd = open_drm_render_device(rd->minor_number);
+		tp_dev = get_tp_device_by_gpu_id(&dest_topology, target_gpu_id);
+		if (!tp_dev) {
+			fd = -EBADFD;
+			goto fail;
+		}
+
+		pr_info("kfd_plugin: render node destination gpu_id = 0x%04x\n", tp_dev->gpu_id);
+
+		fd = open_drm_render_device(tp_dev->drm_render_minor);
+		if (fd < 0)
+			pr_err("kfd_plugin: Failed to open render device (minor:%d)\n",
+									tp_dev->drm_render_minor);
+fail:
 		criu_render_node__free_unpacked(rd,  NULL);
 		xfree(buf);
 		return fd;
@@ -1139,6 +1328,8 @@ int kfd_plugin_restore_file(int id)
 
 	plugin_log_msg("kfd_plugin: read image file data\n");
 
+	args.num_of_devices = e->num_of_devices;
+
 	devinfo_bucket_ptr = xmalloc(e->num_of_devices * sizeof(struct kfd_criu_devinfo_bucket));
 	if (!devinfo_bucket_ptr) {
 		fd = -EBADFD;
@@ -1146,19 +1337,36 @@ int kfd_plugin_restore_file(int id)
 	}
 	args.kfd_criu_devinfo_buckets_ptr = (uintptr_t)devinfo_bucket_ptr;
 
+	/* set_restore_gpu_maps will parse local topology and fill dest_topology */
+	if (set_restore_gpu_maps(&restore_maps, e->devinfo_entries, e->num_of_devices, &dest_topology)) {
+		fd = -EBADFD;
+		goto clean;
+	}
+
 	for (int i = 0; i < e->num_of_devices; i++) {
+		struct tp_device *tp_dev;
+		int drm_fd;
 		devinfo_bucket_ptr[i].user_gpu_id = e->devinfo_entries[i]->gpu_id;
 
-		// for now always bind the VMA to /dev/dri/renderD128
-		// this should allow us later to restore BO on a different GPU node.
-		devinfo_bucket_ptr[i].drm_fd = open_drm_render_device(i + DRM_FIRST_RENDER_NODE);
-		if (!devinfo_bucket_ptr[i].drm_fd) {
-			pr_perror("kfd_plugin: Can't pass NULL drm render fd to driver\n");
+		if (get_gpu_map(&restore_maps, e->devinfo_entries[i]->gpu_id,
+			&devinfo_bucket_ptr[i].actual_gpu_id)) {
+
+			fd = -EBADFD;
+			goto clean;
+		}
+
+		tp_dev = get_tp_device_by_gpu_id(&dest_topology,
+							devinfo_bucket_ptr[i].actual_gpu_id);
+		if (!tp_dev) {
 			fd = -EBADFD;
 			goto clean;
-		} else {
-			pr_info("kfd_plugin: passing drm render fd = %d to driver\n", devinfo_bucket_ptr[i].drm_fd);
 		}
+		drm_fd = open_drm_render_device(tp_dev->drm_render_minor);
+		if (drm_fd < 0) {
+			fd = -EBADFD;
+			goto clean;
+		}
+		devinfo_bucket_ptr[i].drm_fd = drm_fd;
 	}
 
 	for (int i = 0; i < e->num_of_bos; i++ )
@@ -1188,10 +1396,15 @@ int kfd_plugin_restore_file(int id)
 		(bo_bucket_ptr)[i].bo_addr = (e->bo_info_test[i])->bo_addr;
 		(bo_bucket_ptr)[i].bo_size = (e->bo_info_test[i])->bo_size;
 		(bo_bucket_ptr)[i].bo_offset = (e->bo_info_test[i])->bo_offset;
-		(bo_bucket_ptr)[i].gpu_id = (e->bo_info_test[i])->gpu_id;
 		(bo_bucket_ptr)[i].bo_alloc_flags = (e->bo_info_test[i])->bo_alloc_flags;
 		(bo_bucket_ptr)[i].idr_handle = (e->bo_info_test[i])->idr_handle;
 		(bo_bucket_ptr)[i].user_addr = (e->bo_info_test[i])->user_addr;
+
+		if (get_gpu_map(&restore_maps, e->bo_info_test[i]->gpu_id,
+						&bo_bucket_ptr[i].gpu_id)) {
+			fd = -EBADFD;
+			goto clean;
+		}
 	}
 
 	args.num_of_bos = e->num_of_bos;
@@ -1204,7 +1417,6 @@ int kfd_plugin_restore_file(int id)
 	}
 
 	args.restored_bo_array_ptr = (uint64_t)restored_bo_offsets_array;
-	args.num_of_devices = 1; /* Only support 1 gpu for now */
 
 	q_bucket_ptr = xmalloc(e->num_of_queues * sizeof(struct kfd_criu_q_bucket));
         if (!q_bucket_ptr) {
@@ -1246,7 +1458,10 @@ int kfd_plugin_restore_file(int id)
 			e->q_entries[i]->mqd.len,
 			e->q_entries[i]->ctl_stack.len);
 
-		q_bucket_ptr[i].gpu_id = e->q_entries[i]->gpu_id;
+		if (get_gpu_map(&restore_maps, e->q_entries[i]->gpu_id, &q_bucket_ptr[i].gpu_id)) {
+			fd = -EBADFD;
+			goto clean;
+		}
 		q_bucket_ptr[i].type = e->q_entries[i]->type;
 		q_bucket_ptr[i].format = e->q_entries[i]->format;
 		q_bucket_ptr[i].q_id = e->q_entries[i]->q_id;
@@ -1319,9 +1534,14 @@ int kfd_plugin_restore_file(int id)
 						e->ev_entries[i]->mem_exc_fail_no_execute;
 				ev_bucket_ptr[i].memory_exception_data.va =
 						e->ev_entries[i]->mem_exc_va;
-				ev_bucket_ptr[i].memory_exception_data.gpu_id =
-						e->ev_entries[i]->mem_exc_gpu_id;
+				if (e->ev_entries[i]->mem_exc_gpu_id) {
+					if (get_gpu_map(&restore_maps, e->ev_entries[i]->mem_exc_gpu_id,
+							&ev_bucket_ptr[i].memory_exception_data.gpu_id)) {
 
+						fd = -EBADFD;
+						goto clean;
+					}
+				}
 			} else if (e->ev_entries[i]->type == KFD_IOC_EVENT_HW_EXCEPTION) {
 				ev_bucket_ptr[i].hw_exception_data.reset_type =
 					e->ev_entries[i]->hw_exc_reset_type;
@@ -1329,8 +1549,14 @@ int kfd_plugin_restore_file(int id)
 					e->ev_entries[i]->hw_exc_reset_cause;
 				ev_bucket_ptr[i].hw_exception_data.memory_lost =
 					e->ev_entries[i]->hw_exc_memory_lost;
-				ev_bucket_ptr[i].hw_exception_data.gpu_id =
-					e->ev_entries[i]->hw_exc_gpu_id;
+
+				if (e->ev_entries[i]->hw_exc_gpu_id) {
+					if (get_gpu_map(&restore_maps, e->ev_entries[i]->hw_exc_gpu_id,
+							&ev_bucket_ptr[i].hw_exception_data.gpu_id)) {
+						fd = -EBADFD;
+						goto clean;
+					}
+				}
 			}
 		}
 
@@ -1352,31 +1578,59 @@ int kfd_plugin_restore_file(int id)
 			 KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP |
 			 KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) {
 
+			struct tp_device *tp_dev;
 			struct vma_metadata *vma_md;
 			vma_md = xmalloc(sizeof(*vma_md));
 			if (!vma_md)
 				return -ENOMEM;
 
-			vma_md->old_pgoff = (e->bo_info_test[i])->bo_offset;
-			vma_md->vma_entry = (e->bo_info_test[i])->bo_addr;
+			memset(vma_md, 0, sizeof(*vma_md));
+
+			vma_md->old_pgoff = bo_bucket_ptr[i].bo_offset;
+			vma_md->vma_entry = bo_bucket_ptr[i].bo_addr;
+
+			tp_dev = get_tp_device_by_gpu_id(&dest_topology, bo_bucket_ptr[i].gpu_id);
+			vma_md->new_minor = tp_dev->drm_render_minor;
+
 			vma_md->new_pgoff = restored_bo_offsets_array[i];
+
+			plugin_log_msg("kfd_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx "
+					"new_off:0x%lx new_minor:%d\n", vma_md->vma_entry,
+					vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor);
+
 			list_add_tail(&vma_md->list, &update_vma_info_list);
 		}
 
 		if (e->bo_info_test[i]->bo_alloc_flags &
 			(KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
 
-			pr_info("kfd_plugin: Trying mmap in stage 2\n");
+			int j;
+			int drm_render_fd = -EBADFD;
+
+			for (j = 0; j < e->num_of_devices; j++) {
+				if (devinfo_bucket_ptr[j].actual_gpu_id == bo_bucket_ptr[i].gpu_id) {
+					drm_render_fd = devinfo_bucket_ptr[j].drm_fd;
+					break;
+				}
+			}
+
+			if (drm_render_fd < 0) {
+				pr_err("kfd_plugin: bad fd for render node\n");
+				fd = -EBADFD;
+				goto clean;
+			}
+
+			plugin_log_msg("kfd_plugin: Trying mmap in stage 2\n");
 			if ((e->bo_info_test[i])->bo_alloc_flags &
 			    KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC ||
 			    (e->bo_info_test[i])->bo_alloc_flags &
 			    KFD_IOC_ALLOC_MEM_FLAGS_GTT ) {
-				pr_info("kfd_plugin: large bar write possible\n");
+				plugin_log_msg("kfd_plugin: large bar write possible\n");
 				addr = mmap(NULL,
 					    (e->bo_info_test[i])->bo_size,
 					    PROT_WRITE,
 					    MAP_SHARED,
-					    devinfo_bucket_ptr[0].drm_fd,
+					    drm_render_fd,
 					    restored_bo_offsets_array[i]);
 				if (addr == MAP_FAILED) {
 					pr_perror("kfd_plugin: mmap failed\n");
@@ -1399,7 +1653,7 @@ int kfd_plugin_restore_file(int id)
 					    (e->bo_info_test[i])->bo_size,
 					    PROT_NONE,
 					    MAP_SHARED,
-					    devinfo_bucket_ptr[0].drm_fd,
+					    drm_render_fd,
 					    restored_bo_offsets_array[i]);
 				if (addr == MAP_FAILED) {
 					pr_perror("kfd_plugin: mmap failed\n");
@@ -1480,19 +1734,50 @@ int kfd_plugin_update_vmamap(const char *old_path, char *new_path, const uint64_
 				const uint64_t old_offset, uint64_t *new_offset)
 {
 	struct vma_metadata *vma_md;
+	char path[PATH_MAX];
+	char *p_begin;
+	char *p_end;
+	bool is_kfd = false, is_renderD = false;
+
 
 	pr_info("kfd_plugin: Enter %s\n", __func__);
 
-	/* Once we support restoring on different nodes, new_path may be different from old_path
-	 * because the restored gpu may have a different minor number.
-	 * For now, we are restoring on the same gpu, so new_path is the same as old_path */
+	strncpy(path, old_path, sizeof(path));
+
+	p_begin = path;
+	p_end = p_begin + strlen(path);
 
-	strcpy(new_path, old_path);
+	/*
+	 * Paths sometimes have double forward slashes (e.g. //dev/dri/renderD*);
+	 * replace all '//' with '/'.
+	 */
+	while (p_begin < p_end - 1) {
+		if (*p_begin == '/' && *(p_begin + 1) == '/')
+			memmove(p_begin, p_begin + 1, p_end - p_begin);
+		else
+			p_begin++;
+	}
+
+	if (!strncmp(path, "/dev/dri/renderD", strlen("/dev/dri/renderD")))
+		is_renderD = true;
+
+	if (!strcmp(path, "/dev/kfd"))
+		is_kfd = true;
+
+	if (!is_renderD && !is_kfd) {
+		pr_info("Skipping unsupported path:%s addr:%lx old_offset:%lx\n", old_path, addr, old_offset);
+		return 0;
+	}
 
 	list_for_each_entry(vma_md, &update_vma_info_list, list) {
 		if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) {
 			*new_offset = vma_md->new_pgoff;
 
+			if (is_renderD)
+				sprintf(new_path, "/dev/dri/renderD%d", vma_md->new_minor);
+			else
+				strcpy(new_path, old_path);
+
 			pr_info("kfd_plugin: old_pgoff= 0x%lx new_pgoff = 0x%lx old_path = %s new_path = %s\n",
 				vma_md->old_pgoff, vma_md->new_pgoff, old_path, new_path);
 
-- 
2.17.1


