Fix Vulkan PCI ID and ID handling (#12775)

* Fix Vulkan PCI ID and ID handling

Intel GPUs may not report PCI IDs, which was leading to incorrect overlap
detection. Switch to using the existing PCI IDs. AMD GPUs claim not to report
PCI IDs but actually do, so try anyway, as this is required for ADLX to find
the GPUs on Windows. Numeric IDs lead to scheduling problems, so this also
switches Vulkan to use UUID-based IDs. The GPU discovery patches have been
squashed into a single patch to simplify future rebases.

* review comments
This commit is contained in:
Daniel Hiltgen
2025-10-28 15:15:35 -07:00
committed by GitHub
parent 29f63f37c8
commit 14977a9350
15 changed files with 418 additions and 447 deletions

View File

@@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
if props.library != nil {
info.Library = C.GoString(props.library)
}
info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
if props.device_id != nil {
info.PCIID = C.GoString(props.device_id)
}
info.LibraryPath = ggml.LibPaths()
if props.numeric_id != nil {
info.FilteredID = C.GoString(props.numeric_id)

View File

@@ -174,9 +174,6 @@ extern "C" {
int compute_major;
int compute_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
const char *library;
// number with which the devices are accessed (Vulkan)
const char *numeric_id;

View File

@@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context {
int driver_major;
int driver_minor;
int integrated;
int pciBusID;
int pciDeviceID;
int pciDomainID;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
#if defined(GGML_USE_HIP)
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
ggml_hip_mgmt_release();
return;
}
@@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
if (ggml_nvml_init() == 0) {
int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
ggml_nvml_release();
return;
}
@@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->driver_major = ctx->driver_major;
props->driver_minor = ctx->driver_minor;
props->integrated = ctx->integrated;
props->pci_bus_id = ctx->pciBusID;
props->pci_device_id = ctx->pciDeviceID;
props->pci_domain_id = ctx->pciDomainID;
props->library = GGML_CUDA_NAME;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->driver_major = driverVersion / 1000;
dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
dev_ctx->integrated = prop.integrated;
dev_ctx->pciBusID = prop.pciBusID;
dev_ctx->pciDeviceID = prop.pciDeviceID;
dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,

View File

@@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init();
GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
GGML_API void ggml_nvml_release();
GGML_API int ggml_hip_mgmt_init();
GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
GGML_API void ggml_hip_mgmt_release();
#ifdef __cplusplus

View File

@@ -231,6 +231,7 @@ class vk_memory_logger;
#endif
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static std::string ggml_vk_get_device_id(int device);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) {
const auto& uuid = deviceIDProps.deviceUUID;
char id[64];
snprintf(id, sizeof(id),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5],
uuid[6], uuid[7],
@@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
std::string numeric_id;
int major;
int minor;
int driver_major;
int driver_minor;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
};
void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
@@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
switch (props2.properties.vendorID) {
case VK_VENDOR_ID_AMD:
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
ggml_hip_mgmt_release();
return;
}
@@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
if (ggml_nvml_init() == 0) {
int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
ggml_nvml_release();
return;
}
@@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
}
}
vk::PhysicalDeviceProperties2 props2;
if (!ext_support) {
return "";
device.getProperties2(&props2);
if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
return "";
}
// AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
}
vk::PhysicalDeviceProperties2 props = {};
@@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
return "";
}
return std::string(pci_bus_id);
}
@@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->driver_major = ctx->driver_major;
props->driver_minor = ctx->driver_minor;
props->integrated = ctx->is_integrated_gpu;
props->pci_bus_id = ctx->pci_bus_id;
props->pci_device_id = ctx->pci_device_id;
props->pci_domain_id = ctx->pci_domain_id;
props->library = GGML_VK_NAME;
props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
props->numeric_id = ctx->numeric_id.c_str();
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
vk_devices[dev_idx].getProperties2(&props2);
std::ostringstream oss;
oss << std::hex << std::setfill('0');
oss << "GPU-";
int byteIdx = 0;
for (int i = 0; i < 16; ++i, ++byteIdx) {
oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
@@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();
ctx->pci_bus_id = pci_bus_props.pciBus;
ctx->pci_device_id = pci_bus_props.pciDevice;
ctx->pci_domain_id = pci_bus_props.pciDomain;
ctx->id = std::to_string(i);
ctx->major = 0;
ctx->minor = 0;
// TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
ctx->driver_major = 0;
ctx->driver_minor = 0;
ctx->numeric_id = std::to_string(i);
}
initialized = true;
}

View File

@@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() {
if (gpus != NULL) gpus->pVtbl->Release(gpus); \
if (gpu != NULL) gpu->pVtbl->Release(gpu)
int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
std::lock_guard<std::mutex> lock(ggml_adlx_lock);
if (adlx.handle == NULL) {
GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
IADLXGPU* gpu = NULL;
IADLXGPUMetrics *gpuMetrics = NULL;
ADLX_RESULT status;
// The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
uint32_t pci_domain, pci_bus, pci_device, pci_function;
if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
// TODO - parse other formats?
GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
return ADLX_NOT_FOUND;
}
status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
if (ADLX_FAILED(status)) {
GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
continue;
}
adlx_int id;
status = gpu->pVtbl->UniqueId(gpu, &id);
adlx_int uniqueID;
status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
if (ADLX_FAILED(status)) {
GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
gpu->pVtbl->Release(gpu);
gpu = NULL;
continue;
}
if (id != target) {
GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
gpu->pVtbl->Release(gpu);
gpu = NULL;
continue;
@@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() {
return -1;
}
void ggml_hip_mgmt_release() {}
int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
return -1;
}