[CRIU] [RFC PATCH 18/20] criu/plugin: Add parameters to override mapping
Felix Kuehling
Felix.Kuehling at amd.com
Sat May 1 04:58:43 MSK 2021
From: David Yat Sin <david.yatsin at amd.com>
Add optional parameters to override default behavior during restore.
These parameters are passed in as environment variables before executing
CRIU.
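List of parameters (the *_CHECK options are disabled by setting them to 0 or NO):
KFD_DESTINATION_GPUS - override target GPUs
KFD_TOPOLOGY_CHECK - disable all topology checks when matching devices
KFD_FW_VER_CHECK - disable firmware version check
KFD_CACHES_COUNT_CHECK - disable caches count check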
Signed-off-by: David Yat Sin <david.yatsin at amd.com>
---
Documentation/kfd_plugin.txt | 34 ++++++++++
test/others/ext-kfd/kfd_plugin.c | 105 ++++++++++++++++++++++++++++++-
2 files changed, 137 insertions(+), 2 deletions(-)
diff --git a/Documentation/kfd_plugin.txt b/Documentation/kfd_plugin.txt
index 9dce736fc..85c5416d5 100644
--- a/Documentation/kfd_plugin.txt
+++ b/Documentation/kfd_plugin.txt
@@ -35,6 +35,40 @@ Dependencies
This work is rebased on latest criu release available at this time.
+OPTIONS
+-------
+Optional parameters can be passed in as environment variables before
+executing criu command.
+
+*KFD_DESTINATION_GPUS*::
+ Override GPUs on local restore node. GPUs can be specified as gpu_id
+ (hex or decimal) or minor number. Default:<None>
+
+ E.g:
+
+ KFD_DESTINATION_GPUS=0xff31,0x90db
+
+ KFD_DESTINATION_GPUS=65329,37083
+
+ KFD_DESTINATION_GPUS=renderD129,renderD128
+
+*KFD_FW_VER_CHECK*::
+ Enable or disable firmware version check.
+ If enabled, the firmware version on the restored GPU needs to be greater than
+ or equal to the firmware version on the checkpointed GPU. This option is only
+ valid if KFD_DESTINATION_GPUS is used. Default:Enabled
+
+ E.g:
+ KFD_FW_VER_CHECK=0
+
+*KFD_CACHES_COUNT_CHECK*::
+ Enable or disable caches count check. If enabled, the caches count on the
+ restored GPU needs to be greater than or equal to the caches count on the
+ checkpointed GPU. Default:Enabled
+
+ E.g:
+ KFD_CACHES_COUNT_CHECK=0
+
AUTHOR
------
The AMDKFD team.
diff --git a/test/others/ext-kfd/kfd_plugin.c b/test/others/ext-kfd/kfd_plugin.c
index 5e6247492..a03b228f1 100644
--- a/test/others/ext-kfd/kfd_plugin.c
+++ b/test/others/ext-kfd/kfd_plugin.c
@@ -45,6 +45,16 @@
#define plugin_log_msg(fmt, ...) {}
#endif
+/* User override options, read from the environment by kfd_plugin_init() on restore */
+/* Forces gpu mapping to specific gpu list (KFD_DESTINATION_GPUS); NULL when not set */
+char *kfd_gpu_override = NULL;
+/* When false, device_match() accepts any device - meant to be used with kfd_gpu_override */
+bool kfd_topology_check = true;
+/* When false, the firmware version check in device_match() is skipped */
+bool kfd_fw_version_check = true;
+/* When false, the caches count check in device_match() is skipped */
+bool kfd_caches_count_check = true;
+
struct vma_metadata {
struct list_head list;
uint64_t old_pgoff;
@@ -372,6 +382,9 @@ int get_gpu_map(struct gpu_id_maps *gpu_maps, uint32_t src, uint32_t *dest)
bool device_match(DevinfoEntry *src_dev, struct tp_device *tp_dev)
{
+ if (!kfd_topology_check)
+ return true;
+
if (src_dev->cpu_cores_count == tp_dev->cpu_cores_count &&
src_dev->simd_count == tp_dev->simd_count &&
src_dev->mem_banks_count == tp_dev->mem_banks_count &&
@@ -393,8 +406,8 @@ bool device_match(DevinfoEntry *src_dev, struct tp_device *tp_dev)
src_dev->num_cp_queues == tp_dev->num_cp_queues &&
src_dev->capability == tp_dev->capability &&
src_dev->sdma_fw_version == tp_dev->sdma_fw_version &&
- src_dev->caches_count <= tp_dev->caches_count &&
- src_dev->fw_version <= tp_dev->fw_version) {
+ (!kfd_caches_count_check || (src_dev->caches_count <= tp_dev->caches_count)) &&
+ (!kfd_fw_version_check || (src_dev->fw_version <= tp_dev->fw_version))) {
return true;
}
@@ -426,6 +439,61 @@ void print_required_properties(DevinfoEntry *src_dev)
pr_err("========================================================================\n");
}
+/* Build gpu_maps from the KFD_DESTINATION_GPUS list held in kfd_gpu_override, e.g.
+ *   0xff31,0x90db   65329,37083   renderD129,renderD128
+ * Entry N is the destination for src_devs[N]; extra entries are ignored; the list is not modified.
+ * return 1 if gpu override is set
+ * return 0 if no gpu override
+ * return -1 if failed to set gpu override (non-zero as well: callers must test for < 0) */
+int get_user_gpu_override(struct gpu_id_maps *gpu_maps, DevinfoEntry *src_devs[],
+				uint32_t num_devices, struct tp_system *topology)
+{
+	uint32_t index = 0;
+	const char *gpu_str = kfd_gpu_override;
+	const char *token = gpu_str;
+
+	if (!gpu_str)
+		return 0;
+
+	pr_info("kfd_plugin: Destination GPU's override:%s\n", gpu_str);
+	/* Skip ',' separators; stop at the end of the list or once every source gpu is mapped */
+	while (index < num_devices && *(token += strspn(token, ","))) {
+		struct tp_device *tp_dev = NULL; /* reset per entry: never reuse a previous match */
+		uint32_t dev_minor = 0, gpu_id = 0;
+
+		if (sscanf(token, "renderD%u", &dev_minor) == 1) {
+			tp_dev = get_tp_device_by_render_minor(topology, dev_minor);
+			if (tp_dev)
+				gpu_id = tp_dev->gpu_id;
+		} else if (sscanf(token, "0x%x", &gpu_id) == 1 || sscanf(token, "%u", &gpu_id) == 1) {
+			tp_dev = get_tp_device_by_gpu_id(topology, gpu_id);
+		}
+		/* Unparsable entry, or no local device with that minor/gpu_id */
+		if (!tp_dev) {
+			pr_err("kfd_plugin:Failed to parse destination GPU's: %s\n", gpu_str);
+			return -1;
+		}
+		if (!device_match(src_devs[index], tp_dev)) {
+			pr_err("Local gpu_id = 0x%04x not compatible\n", gpu_id);
+			print_required_properties(src_devs[index]);
+			return -1;
+		}
+
+		gpu_maps->maps[index].src = src_devs[index]->gpu_id;
+		gpu_maps->maps[index].dest = gpu_id;
+		pr_info("Matched gpu 0x%04x->0x%04x\n", gpu_maps->maps[index].src,
+						gpu_maps->maps[index].dest);
+		index++;
+		token += strcspn(token, ","); /* step over the entry just consumed */
+	}
+	if (index < num_devices) {
+		pr_err("kfd_plugin: Only %u of %u destination GPU's specified: %s\n", index, num_devices, gpu_str);
+		return -1;
+	}
+	gpu_maps->num_devices = num_devices;
+	return 1;
+}
+
/* Parse local system topology and compare with checkpointed devices so we can build a set of gpu
* maps that is used for local target gpu's */
int set_restore_gpu_maps(struct gpu_id_maps *gpu_maps, DevinfoEntry *src_devs[],
@@ -443,6 +511,10 @@ int set_restore_gpu_maps(struct gpu_id_maps *gpu_maps, DevinfoEntry *src_devs[],
return -EINVAL;
}
+ /* For debugging purposes, we can override destination GPU's */
+ if (get_user_gpu_override(gpu_maps, src_devs, num_devices, topo))
+ return 0;
+
memset(matched_devices, 0, sizeof(matched_devices));
gpu_maps->num_devices = num_devices;
@@ -708,6 +780,7 @@ static int allocate_ev_entries(CriuKfd *e, int num_events)
}
int kfd_plugin_init(int stage)
{
+ char *opt_param = NULL;
pr_info("kfd_plugin: initialized: %s (AMDGPU/KFD)\n",
CR_PLUGIN_DESC.name);
@@ -715,6 +788,34 @@ int kfd_plugin_init(int stage)
memset(&dest_topology, 0, sizeof(dest_topology));
memset(&checkpoint_maps, 0, sizeof(checkpoint_maps));
memset(&restore_maps, 0, sizeof(restore_maps));
+
+ if (stage == CR_PLUGIN_STAGE__RESTORE) {
+ /* Expected destination gpu format:
+ * KFD_DESTINATION_GPUS=0xff31,0x90db
+ * KFD_DESTINATION_GPUS=65329,37083
+ * KFD_DESTINATION_GPUS=renderD129,renderD128
+ */
+ kfd_gpu_override = getenv("KFD_DESTINATION_GPUS");
+ pr_info("param: KFD_DESTINATION_GPUS:%s\n", kfd_gpu_override ? kfd_gpu_override : "None");
+
+ if ((opt_param = getenv("KFD_TOPOLOGY_CHECK"))) {
+ if (!strcmp(opt_param, "0") || !strcmp(opt_param, "NO"))
+ kfd_topology_check = false;
+ }
+ pr_info("param: KFD_TOPOLOGY_CHECK:%s\n", kfd_topology_check ? "Y" : "N");
+
+ if ((opt_param = getenv("KFD_FW_VER_CHECK"))) {
+ if (!strcmp(opt_param, "0") || !strcmp(opt_param, "NO"))
+ kfd_fw_version_check = false;
+ }
+ pr_info("param: KFD_FW_VERSION_CHECK:%s\n", kfd_fw_version_check ? "Y" : "N");
+
+ if ((opt_param = getenv("KFD_CACHES_COUNT_CHECK"))) {
+ if (!strcmp(opt_param, "0") || !strcmp(opt_param, "NO"))
+ kfd_caches_count_check = false;
+ }
+ pr_info("param: KFD_CACHES_COUNT_CHECK:%s\n", kfd_caches_count_check ? "Y" : "N");
+ }
return 0;
}
--
2.17.1
More information about the CRIU
mailing list