Fix vulkan PCI ID and ID handling (#12775)

* Fix Vulkan PCI ID and ID handling

  Intel GPUs may not report PCI IDs, which was leading to incorrect overlap detection. Switch to using the existing PCI IDs; however, AMD GPUs claim not to report PCI IDs but actually do, so try anyway, as this is required for ADLX to find the GPUs on Windows. Numeric IDs lead to scheduling problems, so this also switches Vulkan to use UUID-based IDs. The GPU discovery patches have been squashed into a single patch to simplify future rebases.

* Address review comments
2025-12-21 22:33:56 +00:00 · 2025-10-28 15:15:35 -07:00
parent 29f63f37c8
commit 14977a9350
15 changed files with 418 additions and 447 deletions
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
 		if props.library != nil {
 			info.Library = C.GoString(props.library)
 		}
-		info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
+		if props.device_id != nil {
+			info.PCIID = C.GoString(props.device_id)
+		}
 		info.LibraryPath = ggml.LibPaths()
 		if props.numeric_id != nil {
 			info.FilteredID = C.GoString(props.numeric_id)
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -174,9 +174,6 @@ extern "C" {
        int compute_major;
        int compute_minor;
        int integrated;
-        int pci_bus_id;
-        int pci_device_id;
-        int pci_domain_id;
        const char *library;
        // number with which the devices are accessed (Vulkan)
        const char *numeric_id;
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context {
    int driver_major;
    int driver_minor;
    int integrated;
-    int pciBusID;
-    int pciDeviceID;
-    int pciDomainID;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *

 #if defined(GGML_USE_HIP)
    if (ggml_hip_mgmt_init() == 0) {
-        int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
        if (status == 0) {
-            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
            ggml_hip_mgmt_release();
            return;
        }
@@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
    if (ggml_nvml_init() == 0) {
        int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
        if (status == 0) {
-            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
            ggml_nvml_release();
            return;
        }
@@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
    props->driver_major = ctx->driver_major;
    props->driver_minor = ctx->driver_minor;
    props->integrated = ctx->integrated;
-    props->pci_bus_id = ctx->pciBusID;
-    props->pci_device_id = ctx->pciDeviceID;
-    props->pci_domain_id = ctx->pciDomainID;
    props->library = GGML_CUDA_NAME;

    bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                dev_ctx->driver_major = driverVersion / 1000;
                dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
                dev_ctx->integrated = prop.integrated;
-                dev_ctx->pciBusID = prop.pciBusID;
-                dev_ctx->pciDeviceID = prop.pciDeviceID;
-                dev_ctx->pciDomainID = prop.pciDomainID;
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
                    /* .reg     = */ &reg,
--- a/ml/backend/ggml/ggml/src/ggml-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-impl.h
@@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init();
 GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
 GGML_API void ggml_nvml_release();
 GGML_API int ggml_hip_mgmt_init();
-GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
 GGML_API void ggml_hip_mgmt_release();

 #ifdef __cplusplus
--- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -231,6 +231,7 @@ class vk_memory_logger;
 #endif
 class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static std::string ggml_vk_get_device_id(int device);

 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) {
    const auto& uuid = deviceIDProps.deviceUUID;
    char id[64];
    snprintf(id, sizeof(id),
-        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        uuid[0], uuid[1], uuid[2], uuid[3],
        uuid[4], uuid[5],
        uuid[6], uuid[7],
@@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context {
    std::string pci_id;
    std::string id;
    std::string uuid;
+    std::string numeric_id;
    int major;
    int minor;
    int driver_major;
    int driver_minor;
-    int pci_bus_id;
-    int pci_device_id;
-    int pci_domain_id;
 };

 void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
@@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
        switch (props2.properties.vendorID) {
        case VK_VENDOR_ID_AMD:
            if (ggml_hip_mgmt_init() == 0) {
-                int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+                int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
                if (status == 0) {
-                    GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                    GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
                    ggml_hip_mgmt_release();
                    return;
                }
@@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
            if (ggml_nvml_init() == 0) {
                int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
                if (status == 0) {
-                    GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                    GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
                    ggml_nvml_release();
                    return;
                }
@@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
        }
    }

+    vk::PhysicalDeviceProperties2 props2;
    if (!ext_support) {
-        return "";
+        device.getProperties2(&props2);
+        if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
+            return "";
+        }
+        // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
    }

    vk::PhysicalDeviceProperties2 props = {};
@@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {

    char pci_bus_id[16] = {};
    snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+    if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
+        return "";
+    }

    return std::string(pci_bus_id);
 }
@@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
    props->driver_major = ctx->driver_major;
    props->driver_minor = ctx->driver_minor;
    props->integrated = ctx->is_integrated_gpu;
-    props->pci_bus_id = ctx->pci_bus_id;
-    props->pci_device_id = ctx->pci_device_id;
-    props->pci_domain_id = ctx->pci_domain_id;
    props->library = GGML_VK_NAME;
-    props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
+    props->numeric_id = ctx->numeric_id.c_str();
 }

 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                vk_devices[dev_idx].getProperties2(&props2);
                std::ostringstream oss;
                oss << std::hex << std::setfill('0');
-                oss << "GPU-";
                int byteIdx = 0;
                for (int i = 0; i < 16; ++i, ++byteIdx) {
                    oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
@@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                    }
                }
                ctx->uuid = oss.str();
-                ctx->pci_bus_id = pci_bus_props.pciBus;
-                ctx->pci_device_id = pci_bus_props.pciDevice;
-                ctx->pci_domain_id = pci_bus_props.pciDomain;
-                ctx->id = std::to_string(i);
                ctx->major = 0;
                ctx->minor = 0;
                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
                ctx->driver_major = 0;
                ctx->driver_minor = 0;
+                ctx->numeric_id = std::to_string(i);
            }
            initialized = true;
        }
--- a/ml/backend/ggml/ggml/src/mem_hip.cpp
+++ b/ml/backend/ggml/ggml/src/mem_hip.cpp
@@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() {
    if (gpus != NULL) gpus->pVtbl->Release(gpus); \
    if (gpu != NULL) gpu->pVtbl->Release(gpu)

-int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
    if (adlx.handle == NULL) {
        GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
    IADLXGPU* gpu = NULL;
    IADLXGPUMetrics *gpuMetrics = NULL;
    ADLX_RESULT status;
-    // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs 
-    adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);

+    uint32_t pci_domain, pci_bus, pci_device, pci_function;
+    if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
+        // TODO - parse other formats?
+        GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
+        return ADLX_NOT_FOUND;
+    }
    status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
    if (ADLX_FAILED(status)) {
        GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
            GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
            continue;
        }
-        adlx_int id;
-        status = gpu->pVtbl->UniqueId(gpu, &id);
+        adlx_int uniqueID;
+        status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
        if (ADLX_FAILED(status)) {
            GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
            gpu->pVtbl->Release(gpu);
            gpu = NULL;
            continue;
        }
-        if (id != target) {
-            GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+        if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
            gpu->pVtbl->Release(gpu);
            gpu = NULL;
            continue;
@@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() {
    return -1;
 }
 void ggml_hip_mgmt_release() {}
-int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
    return -1;
 }