From 14977a93506b853f07cc06fa50c211570435877a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 28 Oct 2025 15:15:35 -0700 Subject: [PATCH] Fix vulkan PCI ID and ID handling (#12775) * Fix vulkan PCI ID and ID handling Intel GPUs may not report PCI IDs which was leading to incorrect overlap detection. Switch to using the existing PCI IDs, however AMD GPUs claim not to report PCI IDs, but actually do, so try anyway, as this is required for ADLX to find the GPUs on Windows. Numeric IDs lead to scheduling problems, so this also switches Vulkan to use UUID based IDs. The GPU discovery patches have been squashed into a single patch to simplify future rebases. * review comments --- discover/runner.go | 27 +- discover/types.go | 3 + .../0026-GPU-discovery-enhancements.patch | 397 ++++++++++++++++-- ...ML-fallback-for-unified-memory-GPUs.patch} | 2 +- ...027-vulkan-get-GPU-ID-ollama-v0.11.5.patch | 95 ----- ...-CUDA-scheduling-strategy-to-spin-1.patch} | 2 +- .../patches/0028-vulkan-pci-and-memory.patch | 254 ----------- ...=> 0029-report-LoadLibrary-failures.patch} | 0 ml/backend/ggml/ggml.go | 4 +- ml/backend/ggml/ggml/include/ggml-backend.h | 3 - .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 15 +- ml/backend/ggml/ggml/src/ggml-impl.h | 2 +- .../ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp | 34 +- ml/backend/ggml/ggml/src/mem_hip.cpp | 19 +- ml/device.go | 8 +- 15 files changed, 418 insertions(+), 447 deletions(-) rename llama/patches/{0029-NVML-fallback-for-unified-memory-GPUs.patch => 0027-NVML-fallback-for-unified-memory-GPUs.patch} (99%) delete mode 100644 llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch rename llama/patches/{0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch => 0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch} (97%) delete mode 100644 llama/patches/0028-vulkan-pci-and-memory.patch rename llama/patches/{0031-report-LoadLibrary-failures.patch => 0029-report-LoadLibrary-failures.patch} (100%) diff --git 
a/discover/runner.go b/discover/runner.go index cbaba3c6..e74050d0 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -117,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. // In the second pass, we more deeply initialize the GPUs to weed out devices that // aren't supported by a given library. We run this phase in parallel to speed up discovery. - slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices)) + slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices)) ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() var wg sync.WaitGroup @@ -129,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. if devices[i].Library == "Metal" { continue } - slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID) + slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID) wg.Add(1) go func(i int) { defer wg.Done() @@ -155,6 +155,12 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. envVar: id, // Filter to just this one GPU } if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { + slog.Debug("filtering device which didn't fully initialize", + "id", devices[i].ID, + "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], + "pci_id", devices[i].PCIID, + "library", devices[i].Library, + ) needsDelete[i] = true } else { supportedMu.Lock() @@ -170,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
}(i) } wg.Wait() - logutil.Trace("supported GPU library combinations", "supported", supported) + logutil.Trace("supported GPU library combinations before filtering", "supported", supported) filterOutVulkanThatAreSupportedByOtherGPU(needsDelete) @@ -372,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) { } if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] { needsDelete[i] = true - slog.Debug("dropping Vulkan duplicate by PCI ID", - "vulkan_id", devices[i].ID, - "vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], + slog.Debug("filtering device with duplicate PCI ID", + "id", devices[i].ID, + "library", devices[i].Library, + "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "pci_id", devices[i].PCIID, - "kept_library", devices[j].Library, "kept_id", devices[j].ID, + "kept_library", devices[j].Library, ) break } @@ -422,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need } for dev, i := range byLibDirs[libDir] { if _, found := byLibDirs[newest][dev]; found { + slog.Debug("filtering device with overlapping libraries", + "id", dev, + "library", libDir, + "delete_index", i, + "kept_library", newest, + ) needsDelete[i] = true } } diff --git a/discover/types.go b/discover/types.go index b34bafd2..b1f622f4 100644 --- a/discover/types.go +++ b/discover/types.go @@ -3,6 +3,7 @@ package discover import ( "log/slog" "path/filepath" + "sort" "strings" "github.com/ollama/ollama/format" @@ -26,6 +27,7 @@ type CPU struct { } func LogDetails(devices []ml.DeviceInfo) { + sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference for _, dev := range devices { var libs []string for _, dir := range dev.LibraryPath { @@ -39,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) { } slog.Info("inference compute", "id", dev.ID, + "filtered_id", dev.FilteredID, "library", dev.Library, "compute", 
dev.Compute(), "name", dev.Name, diff --git a/llama/patches/0026-GPU-discovery-enhancements.patch b/llama/patches/0026-GPU-discovery-enhancements.patch index 82513e34..807a4689 100644 --- a/llama/patches/0026-GPU-discovery-enhancements.patch +++ b/llama/patches/0026-GPU-discovery-enhancements.patch @@ -5,24 +5,33 @@ Subject: [PATCH] GPU discovery enhancements Expose more information about the devices through backend props, and leverage management libraries for more accurate VRAM usage reporting if available. + +vulkan: get GPU ID (ollama v0.11.5) + +Signed-off-by: Xiaodong Ye + +Vulkan PCI and Memory + +fix vulkan PCI ID and ID handling --- - ggml/include/ggml-backend.h | 11 + - ggml/src/CMakeLists.txt | 2 + - ggml/src/ggml-cuda/ggml-cuda.cu | 74 +++++ - ggml/src/ggml-cuda/vendors/hip.h | 3 + - ggml/src/ggml-impl.h | 8 + - ggml/src/ggml-metal/ggml-metal.cpp | 2 + - ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++ - ggml/src/mem_nvml.cpp | 209 ++++++++++++++ - 8 files changed, 758 insertions(+) + ggml/include/ggml-backend.h | 8 + + ggml/src/CMakeLists.txt | 2 + + ggml/src/ggml-cuda/ggml-cuda.cu | 65 ++++ + ggml/src/ggml-cuda/vendors/hip.h | 3 + + ggml/src/ggml-impl.h | 8 + + ggml/src/ggml-metal/ggml-metal.cpp | 2 + + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++-- + ggml/src/mem_hip.cpp | 452 +++++++++++++++++++++++++++ + ggml/src/mem_nvml.cpp | 209 +++++++++++++ + 9 files changed, 931 insertions(+), 30 deletions(-) create mode 100644 ggml/src/mem_hip.cpp create mode 100644 ggml/src/mem_nvml.cpp diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index ba181d09d..094fc3c82 100644 +index ba181d09d..809835243 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h -@@ -169,6 +169,17 @@ extern "C" { +@@ -169,6 +169,14 @@ extern "C" { const char * device_id; // device capabilities struct ggml_backend_dev_caps caps; @@ -31,9 +40,6 @@ index ba181d09d..094fc3c82 100644 + int compute_major; + int compute_minor; + int 
integrated; -+ int pci_bus_id; -+ int pci_device_id; -+ int pci_domain_id; + const char *library; + // number with which the devices are accessed (Vulkan) + const char *numeric_id; @@ -54,7 +60,7 @@ index 0609c6503..aefe43bdd 100644 target_include_directories(ggml-base PRIVATE .) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 87c6c34a4..816597d2f 100644 +index 87c6c34a4..b075a18be 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -86,7 +92,7 @@ index 87c6c34a4..816597d2f 100644 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ggml_cuda_parse_uuid(prop, id).c_str()); -@@ -3484,6 +3499,14 @@ struct ggml_backend_cuda_device_context { +@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context { std::string description; std::string pci_bus_id; std::string id; @@ -95,22 +101,19 @@ index 87c6c34a4..816597d2f 100644 + int driver_major; + int driver_minor; + int integrated; -+ int pciBusID; -+ int pciDeviceID; -+ int pciDomainID; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -3504,6 +3527,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { +@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); + +#if defined(GGML_USE_HIP) + if (ggml_hip_mgmt_init() == 0) { -+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total); ++ int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total); + if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing ADLX memory 
reporting free: %zu total: %zu\n", __func__, *free, *total); ++ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total); + ggml_hip_mgmt_release(); + return; + } @@ -120,7 +123,7 @@ index 87c6c34a4..816597d2f 100644 + if (ggml_nvml_init() == 0) { + int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total); + if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); ++ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total); + ggml_nvml_release(); + return; + } @@ -130,7 +133,7 @@ index 87c6c34a4..816597d2f 100644 CUDA_CHECK(cudaMemGetInfo(free, total)); } -@@ -3512,6 +3557,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend +@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -138,7 +141,7 @@ index 87c6c34a4..816597d2f 100644 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; -@@ -3525,6 +3571,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back +@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back // If you need the memory data, call ggml_backend_dev_memory() explicitly. 
props->memory_total = props->memory_free = 0; @@ -153,15 +156,12 @@ index 87c6c34a4..816597d2f 100644 + props->driver_major = ctx->driver_major; + props->driver_minor = ctx->driver_minor; + props->integrated = ctx->integrated; -+ props->pci_bus_id = ctx->pciBusID; -+ props->pci_device_id = ctx->pciDeviceID; -+ props->pci_domain_id = ctx->pciDomainID; + props->library = GGML_CUDA_NAME; + bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; #ifdef GGML_CUDA_NO_PEER_COPY bool events = false; -@@ -4087,6 +4149,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { std::lock_guard lock(mutex); if (!initialized) { ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; @@ -169,7 +169,7 @@ index 87c6c34a4..816597d2f 100644 for (int i = 0; i < ggml_cuda_info().device_count; i++) { ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; -@@ -4102,6 +4165,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); dev_ctx->pci_bus_id = pci_bus_id; @@ -181,9 +181,6 @@ index 87c6c34a4..816597d2f 100644 + dev_ctx->driver_major = driverVersion / 1000; + dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10; + dev_ctx->integrated = prop.integrated; -+ dev_ctx->pciBusID = prop.pciBusID; -+ dev_ctx->pciDeviceID = prop.pciDeviceID; -+ dev_ctx->pciDomainID = prop.pciDomainID; ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ ®, @@ -209,7 +206,7 @@ index 1f06be80e..2f9ef2dc0 100644 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h -index d0fb3bcca..80597b6ea 100644 
+index d0fb3bcca..b63edd0c1 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx @@ -221,7 +218,7 @@ index d0fb3bcca..80597b6ea 100644 +GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total); +GGML_API void ggml_nvml_release(); +GGML_API int ggml_hip_mgmt_init(); -+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total); ++GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total); +GGML_API void ggml_hip_mgmt_release(); + #ifdef __cplusplus @@ -247,12 +244,319 @@ index f2ff9f322..f356e4a0a 100644 props->caps = { /* .async = */ true, /* .host_buffer = */ false, +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index ed83236f4..0bbcecd01 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -231,6 +231,7 @@ class vk_memory_logger; + #endif + class vk_perf_logger; + static void ggml_vk_destroy_buffer(vk_buffer& buf); ++static std::string ggml_vk_get_device_id(int device); + + static constexpr uint32_t mul_mat_vec_max_cols = 8; + static constexpr uint32_t p021_max_gqa_ratio = 8; +@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_ + snprintf(description, description_size, "%s", props.deviceName.data()); + } + ++static std::string ggml_vk_get_device_id(int device) { ++ ggml_vk_instance_init(); ++ ++ std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); ++ ++ vk::PhysicalDeviceProperties2 props; ++ vk::PhysicalDeviceIDProperties deviceIDProps; ++ props.pNext = &deviceIDProps; ++ devices[device].getProperties2(&props); ++ ++ const auto& uuid = deviceIDProps.deviceUUID; ++ char id[64]; ++ snprintf(id, sizeof(id), ++ "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", ++ uuid[0], 
uuid[1], uuid[2], uuid[3], ++ uuid[4], uuid[5], ++ uuid[6], uuid[7], ++ uuid[8], uuid[9], ++ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15] ++ ); ++ return std::string(id); ++} ++ + // backend interface + + #define UNUSED GGML_UNUSED +@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size + ggml_vk_get_device_description(dev_idx, description, description_size); + } + +-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { ++std::string ggml_backend_vk_get_device_id(int device) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); +- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); ++ int dev_idx = vk_instance.device_indices[device]; ++ return ggml_vk_get_device_id(dev_idx); ++} ++ ++////////////////////////// ++ ++struct ggml_backend_vk_device_context { ++ size_t device; ++ std::string name; ++ std::string description; ++ bool is_integrated_gpu; ++ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function) ++ std::string pci_id; ++ std::string id; ++ std::string uuid; ++ std::string numeric_id; ++ int major; ++ int minor; ++ int driver_major; ++ int driver_minor; ++}; ++ ++void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) { ++ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size()); ++ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size()); ++ ++ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]]; + +- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; +- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops; +- vk::PhysicalDeviceMemoryProperties2 memprops = {}; +- bool membudget_supported = vk_instance.device_supports_membudget[device]; ++ vk::PhysicalDeviceMemoryProperties memprops = 
vkdev.getMemoryProperties(); ++ vk::PhysicalDeviceProperties2 props2; ++ vkdev.getProperties2(&props2); + +- if (membudget_supported) { +- memprops.pNext = &budgetprops; ++ if (!ctx->is_integrated_gpu) ++ { ++ // Use vendor specific management libraries for best VRAM reporting if available ++ switch (props2.properties.vendorID) { ++ case VK_VENDOR_ID_AMD: ++ if (ggml_hip_mgmt_init() == 0) { ++ int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total); ++ if (status == 0) { ++ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total); ++ ggml_hip_mgmt_release(); ++ return; ++ } ++ ggml_hip_mgmt_release(); ++ } ++ break; ++ case VK_VENDOR_ID_NVIDIA: ++ if (ggml_nvml_init() == 0) { ++ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total); ++ if (status == 0) { ++ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total); ++ ggml_nvml_release(); ++ return; ++ } ++ ggml_nvml_release(); ++ } ++ break; ++ } + } +- vkdev.getMemoryProperties2(&memprops); ++ // else fallback to memory budget if supported + +- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) { +- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i]; ++ *total = 0; ++ *free = 0; ++ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; ++ vk::PhysicalDeviceMemoryProperties2 memprops2; ++ memprops2.pNext = &mem_budget_props; ++ vkdev.getMemoryProperties2(&memprops2); ++ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { ++ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { ++ *total += memprops2.memoryProperties.memoryHeaps[i].size; ++ } else if (ctx->is_integrated_gpu) { ++ // Include shared memory on iGPUs ++ *total += 
memprops2.memoryProperties.memoryHeaps[i].size; ++ } ++ } ++ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { ++ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { ++ *free += mem_budget_props.heapBudget[i]; ++ } else if (ctx->is_integrated_gpu) { ++ *free += mem_budget_props.heapBudget[i]; ++ } ++ } ++ if (*total > 0 && *free > 0) { ++ return; ++ } else if (*total > 0) { ++ *free = *total; ++ return; ++ } + ++ // else just report the physical memory ++ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) { + if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + *total = heap.size; +- +- if (membudget_supported && i < budgetprops.heapUsage.size()) { +- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i]; +- } else { +- *free = heap.size; +- } ++ *free = heap.size; + break; + } + } +@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { + } + } + ++ vk::PhysicalDeviceProperties2 props2; + if (!ext_support) { +- return ""; ++ device.getProperties2(&props2); ++ if (props2.properties.vendorID != VK_VENDOR_ID_AMD) { ++ return ""; ++ } ++ // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero + } + + vk::PhysicalDeviceProperties2 props = {}; +@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { + + char pci_bus_id[16] = {}; + snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); ++ if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) { ++ return ""; ++ } + + return std::string(pci_bus_id); + } + +-////////////////////////// +- +-struct ggml_backend_vk_device_context { +- size_t device; +- std::string name; +- std::string description; +- bool is_integrated_gpu; +- std::string pci_bus_id; +-}; ++static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int 
*domain, int *bus, int *device) { ++ if (id.empty()) return false; ++ unsigned int d = 0, b = 0, dev = 0, func = 0; ++ // Expected format: dddd:bb:dd.f (all hex) ++ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func); ++ if (n < 4) return false; ++ if (domain) *domain = (int) d; ++ if (bus) *bus = (int) b; ++ if (device) *device = (int) dev; ++ return true; ++} + + static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; +@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de + return ctx->description.c_str(); + } + ++static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) { ++ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; ++ return ctx->id.c_str(); ++} ++ + static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; +- ggml_backend_vk_get_device_memory(ctx->device, free, total); ++ ggml_backend_vk_get_device_memory(ctx, free, total); + } + + static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { +@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml + + props->name = ggml_backend_vk_device_get_name(dev); + props->description = ggml_backend_vk_device_get_description(dev); ++ props->id = ggml_backend_vk_device_get_id(dev); + props->type = ggml_backend_vk_device_get_type(dev); +- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); ++ props->device_id = ctx->pci_id.empty() ? 
nullptr : ctx->pci_id.c_str(); + ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, +@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; ++ ++ props->compute_major = ctx->major; ++ props->compute_minor = ctx->minor; ++ props->driver_major = ctx->driver_major; ++ props->driver_minor = ctx->driver_minor; ++ props->integrated = ctx->is_integrated_gpu; ++ props->library = GGML_VK_NAME; ++ props->numeric_id = ctx->numeric_id.c_str(); + } + + static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { +@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { ++ std::vector vk_devices = vk_instance.instance.enumeratePhysicalDevices(); ++ + for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { + ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; + char desc[256]; +@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, + ctx->name = GGML_VK_NAME + std::to_string(i); + ctx->description = desc; + ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; +- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); ++ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i); ++ ctx->id = ggml_backend_vk_get_device_id(i); + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_vk_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); ++ ++ // Gather additional information about the device ++ int dev_idx = vk_instance.device_indices[i]; ++ vk::PhysicalDeviceProperties props1; ++ vk_devices[dev_idx].getProperties(&props1); ++ vk::PhysicalDeviceProperties2 props2; ++ 
vk::PhysicalDeviceIDProperties device_id_props; ++ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props; ++ vk::PhysicalDeviceDriverProperties driver_props; ++ props2.pNext = &device_id_props; ++ device_id_props.pNext = &pci_bus_props; ++ pci_bus_props.pNext = &driver_props; ++ vk_devices[dev_idx].getProperties2(&props2); ++ std::ostringstream oss; ++ oss << std::hex << std::setfill('0'); ++ int byteIdx = 0; ++ for (int i = 0; i < 16; ++i, ++byteIdx) { ++ oss << std::setw(2) << static_cast(device_id_props.deviceUUID[i]); ++ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) { ++ oss << '-'; ++ } ++ } ++ ctx->uuid = oss.str(); ++ ctx->major = 0; ++ ctx->minor = 0; ++ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string ++ ctx->driver_major = 0; ++ ctx->driver_minor = 0; ++ ctx->numeric_id = std::to_string(i); + } + initialized = true; + } diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp new file mode 100644 -index 000000000..8ef19b8cf +index 000000000..5a7f5d465 --- /dev/null +++ b/ggml/src/mem_hip.cpp -@@ -0,0 +1,449 @@ +@@ -0,0 +1,452 @@ +#include "ggml.h" + +#ifdef _WIN32 @@ -586,7 +890,7 @@ index 000000000..8ef19b8cf + if (gpus != NULL) gpus->pVtbl->Release(gpus); \ + if (gpu != NULL) gpu->pVtbl->Release(gpu) + -+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { ++int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { + std::lock_guard lock(ggml_adlx_lock); + if (adlx.handle == NULL) { + GGML_LOG_INFO("%s ADLX was not initialized\n", __func__); @@ -598,9 +902,13 @@ index 000000000..8ef19b8cf + IADLXGPU* gpu = NULL; + IADLXGPUMetrics *gpuMetrics = NULL; + ADLX_RESULT status; -+ // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs -+ adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff); + ++ uint32_t pci_domain, pci_bus, pci_device, pci_function; ++ if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, 
&pci_function) != 4) { ++ // TODO - parse other formats? ++ GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id); ++ return ADLX_NOT_FOUND; ++ } + status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices); + if (ADLX_FAILED(status)) { + GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status); @@ -623,16 +931,15 @@ index 000000000..8ef19b8cf + GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status); + continue; + } -+ adlx_int id; -+ status = gpu->pVtbl->UniqueId(gpu, &id); ++ adlx_int uniqueID; ++ status = gpu->pVtbl->UniqueId(gpu, &uniqueID); + if (ADLX_FAILED(status)) { + GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status); + gpu->pVtbl->Release(gpu); + gpu = NULL; + continue; + } -+ if (id != target) { -+ GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id); ++ if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) { + gpu->pVtbl->Release(gpu); + gpu = NULL; + continue; @@ -695,7 +1002,7 @@ index 000000000..8ef19b8cf + return -1; +} +void ggml_hip_mgmt_release() {} -+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { ++int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { + return -1; +} + diff --git a/llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch similarity index 99% rename from llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch rename to llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch index 9ba11168..ec3fdbaa 100644 --- a/llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch +++ b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch @@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs 1 file changed, 68 insertions(+), 3 deletions(-) diff --git 
a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp -index c9073cef..f473a2a2 100644 +index c9073cef0..f473a2a2c 100644 --- a/ggml/src/mem_nvml.cpp +++ b/ggml/src/mem_nvml.cpp @@ -13,6 +13,7 @@ diff --git a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch b/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch deleted file mode 100644 index 997dd386..00000000 --- a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch +++ /dev/null @@ -1,95 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Xiaodong Ye -Date: Mon, 18 Aug 2025 12:48:07 +0800 -Subject: [PATCH] vulkan: get GPU ID (ollama v0.11.5) - -Signed-off-by: Xiaodong Ye ---- - ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 ++++++++++++++++++++++++++++ - 1 file changed, 37 insertions(+) - -diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 061cd078..adea7783 100644 ---- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp -+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -11588,6 +11588,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_ - snprintf(description, description_size, "%s", props.deviceName.data()); - } - -+static std::string ggml_vk_get_device_id(int device) { -+ ggml_vk_instance_init(); -+ -+ std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); -+ -+ vk::PhysicalDeviceProperties2 props; -+ vk::PhysicalDeviceIDProperties deviceIDProps; -+ props.pNext = &deviceIDProps; -+ devices[device].getProperties2(&props); -+ -+ const auto& uuid = deviceIDProps.deviceUUID; -+ char id[64]; -+ snprintf(id, sizeof(id), -+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", -+ uuid[0], uuid[1], uuid[2], uuid[3], -+ uuid[4], uuid[5], -+ uuid[6], uuid[7], -+ uuid[8], uuid[9], -+ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15] -+ ); -+ return std::string(id); -+} -+ - // backend interface - - #define UNUSED GGML_UNUSED -@@ -12394,6 +12417,12 @@ void 
ggml_backend_vk_get_device_description(int device, char * description, size - ggml_vk_get_device_description(dev_idx, description, description_size); - } - -+std::string ggml_backend_vk_get_device_id(int device) { -+ GGML_ASSERT(device < (int) vk_instance.device_indices.size()); -+ int dev_idx = vk_instance.device_indices[device]; -+ return ggml_vk_get_device_id(dev_idx); -+} -+ - void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { - GGML_ASSERT(device < (int) vk_instance.device_indices.size()); - GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); -@@ -12481,6 +12510,7 @@ struct ggml_backend_vk_device_context { - std::string description; - bool is_integrated_gpu; - std::string pci_bus_id; -+ std::string id; - }; - - static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { -@@ -12493,6 +12523,11 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de - return ctx->description.c_str(); - } - -+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) { -+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; -+ return ctx->id.c_str(); -+} -+ - static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; - ggml_backend_vk_get_device_memory(ctx->device, free, total); -@@ -12519,6 +12554,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml - - props->name = ggml_backend_vk_device_get_name(dev); - props->description = ggml_backend_vk_device_get_description(dev); -+ props->id = ggml_backend_vk_device_get_id(dev); - props->type = ggml_backend_vk_device_get_type(dev); - props->device_id = ctx->pci_bus_id.empty() ? 
nullptr : ctx->pci_bus_id.c_str(); - ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); -@@ -12965,6 +13001,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, - ctx->description = desc; - ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; - ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); -+ ctx->id = ggml_backend_vk_get_device_id(i); - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_vk_device_i, - /* .reg = */ reg, --- -2.51.0 \ No newline at end of file diff --git a/llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch similarity index 97% rename from llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch rename to llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch index c3c7fedf..f5861a8c 100644 --- a/llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch +++ b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch @@ -28,7 +28,7 @@ Co-authored-by: Johannes Gäßler 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 6a278b5e9..87941f872 100644 +index b075a18be..d62f412d6 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() { diff --git a/llama/patches/0028-vulkan-pci-and-memory.patch b/llama/patches/0028-vulkan-pci-and-memory.patch deleted file mode 100644 index c20ccf5c..00000000 --- a/llama/patches/0028-vulkan-pci-and-memory.patch +++ /dev/null @@ -1,254 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Daniel Hiltgen -Date: Fri Sep 5 08:25:03 2025 -0700 -Subject: [PATCH] Vulkan PCI and Memory - ---- - 
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 176 ++++++++++++++++++++++----- - 1 file changed, 145 insertions(+), 31 deletions(-) - -diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index adea7783..fb7204ce 100644 ---- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp -+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -12423,31 +12423,99 @@ std::string ggml_backend_vk_get_device_id(int device) { - return ggml_vk_get_device_id(dev_idx); - } - --void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { -- GGML_ASSERT(device < (int) vk_instance.device_indices.size()); -- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); -+////////////////////////// -+ -+struct ggml_backend_vk_device_context { -+ size_t device; -+ std::string name; -+ std::string description; -+ bool is_integrated_gpu; -+ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function) -+ std::string pci_id; -+ std::string id; -+ std::string uuid; -+ int major; -+ int minor; -+ int driver_major; -+ int driver_minor; -+ int pci_bus_id; -+ int pci_device_id; -+ int pci_domain_id; -+}; -+ -+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) { -+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size()); -+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size()); -+ -+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]]; - -- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; -- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops; -- vk::PhysicalDeviceMemoryProperties2 memprops = {}; -- bool membudget_supported = vk_instance.device_supports_membudget[device]; -+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); -+ vk::PhysicalDeviceProperties2 props2; -+ vkdev.getProperties2(&props2); - 
-- if (membudget_supported) { -- memprops.pNext = &budgetprops; -+ if (!ctx->is_integrated_gpu) -+ { -+ // Use vendor specific management libraries for best VRAM reporting if available -+ switch (props2.properties.vendorID) { -+ case VK_VENDOR_ID_AMD: -+ if (ggml_hip_mgmt_init() == 0) { -+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); -+ if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); -+ ggml_hip_mgmt_release(); -+ return; -+ } -+ ggml_hip_mgmt_release(); -+ } -+ break; -+ case VK_VENDOR_ID_NVIDIA: -+ if (ggml_nvml_init() == 0) { -+ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total); -+ if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); -+ ggml_nvml_release(); -+ return; -+ } -+ ggml_nvml_release(); -+ } -+ break; -+ } - } -- vkdev.getMemoryProperties2(&memprops); -+ // else fallback to memory budget if supported - -- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) { -- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i]; -+ *total = 0; -+ *free = 0; -+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; -+ vk::PhysicalDeviceMemoryProperties2 memprops2; -+ memprops2.pNext = &mem_budget_props; -+ vkdev.getMemoryProperties2(&memprops2); -+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { -+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { -+ *total += memprops2.memoryProperties.memoryHeaps[i].size; -+ } else if (ctx->is_integrated_gpu) { -+ // Include shared memory on iGPUs -+ *total += memprops2.memoryProperties.memoryHeaps[i].size; -+ } -+ } -+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { -+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { -+ *free += 
mem_budget_props.heapBudget[i]; -+ } else if (ctx->is_integrated_gpu) { -+ *free += mem_budget_props.heapBudget[i]; -+ } -+ } -+ if (*total > 0 && *free > 0) { -+ return; -+ } else if (*total > 0) { -+ *free = *total; -+ return; -+ } - -+ // else just report the physical memory -+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) { - if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { - *total = heap.size; -- -- if (membudget_supported && i < budgetprops.heapUsage.size()) { -- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i]; -- } else { -- *free = heap.size; -- } -+ *free = heap.size; - break; - } - } -@@ -12502,16 +12570,17 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { - return std::string(pci_bus_id); - } - --////////////////////////// -- --struct ggml_backend_vk_device_context { -- size_t device; -- std::string name; -- std::string description; -- bool is_integrated_gpu; -- std::string pci_bus_id; -- std::string id; --}; -+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) { -+ if (id.empty()) return false; -+ unsigned int d = 0, b = 0, dev = 0, func = 0; -+ // Expected format: dddd:bb:dd.f (all hex) -+ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func); -+ if (n < 4) return false; -+ if (domain) *domain = (int) d; -+ if (bus) *bus = (int) b; -+ if (device) *device = (int) dev; -+ return true; -+} - - static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; -@@ -12530,7 +12599,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) { - - static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; -- ggml_backend_vk_get_device_memory(ctx->device, free, total); 
-+ ggml_backend_vk_get_device_memory(ctx, free, total); - } - - static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { -@@ -12556,7 +12625,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml - props->description = ggml_backend_vk_device_get_description(dev); - props->id = ggml_backend_vk_device_get_id(dev); - props->type = ggml_backend_vk_device_get_type(dev); -- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); -+ props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str(); - ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, -@@ -12564,6 +12633,17 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -+ -+ props->compute_major = ctx->major; -+ props->compute_minor = ctx->minor; -+ props->driver_major = ctx->driver_major; -+ props->driver_minor = ctx->driver_minor; -+ props->integrated = ctx->is_integrated_gpu; -+ props->pci_bus_id = ctx->pci_bus_id; -+ props->pci_device_id = ctx->pci_device_id; -+ props->pci_domain_id = ctx->pci_domain_id; -+ props->library = GGML_VK_NAME; -+ props->numeric_id = ctx->id.empty() ? 
nullptr : ctx->id.c_str(); - } - - static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { -@@ -12992,6 +13071,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, - static std::mutex mutex; - std::lock_guard<std::mutex> lock(mutex); - if (!initialized) { -+ std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices(); -+ - for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { - ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; - char desc[256]; -@@ -13000,13 +13081,46 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, - ctx->name = GGML_VK_NAME + std::to_string(i); - ctx->description = desc; - ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; -- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); -+ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i); - ctx->id = ggml_backend_vk_get_device_id(i); - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_vk_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); -+ -+ // Gather additional information about the device -+ int dev_idx = vk_instance.device_indices[i]; -+ vk::PhysicalDeviceProperties props1; -+ vk_devices[dev_idx].getProperties(&props1); -+ vk::PhysicalDeviceProperties2 props2; -+ vk::PhysicalDeviceIDProperties device_id_props; -+ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props; -+ vk::PhysicalDeviceDriverProperties driver_props; -+ props2.pNext = &device_id_props; -+ device_id_props.pNext = &pci_bus_props; -+ pci_bus_props.pNext = &driver_props; -+ vk_devices[dev_idx].getProperties2(&props2); -+ std::ostringstream oss; -+ oss << std::hex << std::setfill('0'); -+ oss << "GPU-"; -+ int byteIdx = 0; -+ for (int i = 0; i < 16; ++i, ++byteIdx) { -+ oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]); -+ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
-+ oss << '-'; -+ } -+ } -+ ctx->uuid = oss.str(); -+ ctx->pci_bus_id = pci_bus_props.pciBus; -+ ctx->pci_device_id = pci_bus_props.pciDevice; -+ ctx->pci_domain_id = pci_bus_props.pciDomain; -+ ctx->id = std::to_string(i); -+ ctx->major = 0; -+ ctx->minor = 0; -+ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string -+ ctx->driver_major = 0; -+ ctx->driver_minor = 0; - } - initialized = true; - } --- -2.51.0 \ No newline at end of file diff --git a/llama/patches/0031-report-LoadLibrary-failures.patch b/llama/patches/0029-report-LoadLibrary-failures.patch similarity index 100% rename from llama/patches/0031-report-LoadLibrary-failures.patch rename to llama/patches/0029-report-LoadLibrary-failures.patch diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 64aae141..3feb5b5d 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo { if props.library != nil { info.Library = C.GoString(props.library) } - info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id) + if props.device_id != nil { + info.PCIID = C.GoString(props.device_id) + } info.LibraryPath = ggml.LibPaths() if props.numeric_id != nil { info.FilteredID = C.GoString(props.numeric_id) diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 094fc3c8..80983524 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -174,9 +174,6 @@ extern "C" { int compute_major; int compute_minor; int integrated; - int pci_bus_id; - int pci_device_id; - int pci_domain_id; const char *library; // number with which the devices are accessed (Vulkan) const char *numeric_id; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index f9cf2d4f..d62f412d 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context { int driver_major; int driver_minor; int integrated; - int pciBusID; - int pciDeviceID; - int pciDomainID; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { @@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * #if defined(GGML_USE_HIP) if (ggml_hip_mgmt_init() == 0) { - int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total); + int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total); ggml_hip_mgmt_release(); return; } @@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * if (ggml_nvml_init() == 0) { int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total); ggml_nvml_release(); return; } @@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back props->driver_major = ctx->driver_major; props->driver_minor = ctx->driver_minor; props->integrated = ctx->integrated; - props->pci_bus_id = ctx->pciBusID; - props->pci_device_id = ctx->pciDeviceID; - props->pci_domain_id = ctx->pciDomainID; props->library = GGML_CUDA_NAME; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; @@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { 
dev_ctx->driver_major = driverVersion / 1000; dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10; dev_ctx->integrated = prop.integrated; - dev_ctx->pciBusID = prop.pciBusID; - dev_ctx->pciDeviceID = prop.pciDeviceID; - dev_ctx->pciDomainID = prop.pciDomainID; ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ &reg, diff --git a/ml/backend/ggml/ggml/src/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h index 80597b6e..b63edd0c 100644 --- a/ml/backend/ggml/ggml/src/ggml-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-impl.h @@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init(); GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total); GGML_API void ggml_nvml_release(); GGML_API int ggml_hip_mgmt_init(); -GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total); +GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total); GGML_API void ggml_hip_mgmt_release(); #ifdef __cplusplus diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 564bc4a7..0bbcecd0 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -231,6 +231,7 @@ class vk_memory_logger; #endif class vk_perf_logger; static void ggml_vk_destroy_buffer(vk_buffer& buf); +static std::string ggml_vk_get_device_id(int device); static constexpr uint32_t mul_mat_vec_max_cols = 8; static constexpr uint32_t p021_max_gqa_ratio = 8; @@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) { const auto& uuid = deviceIDProps.deviceUUID; char id[64]; snprintf(id, sizeof(id), - "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
uuid[6], uuid[7], @@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context { std::string pci_id; std::string id; std::string uuid; + std::string numeric_id; int major; int minor; int driver_major; int driver_minor; - int pci_bus_id; - int pci_device_id; - int pci_domain_id; }; void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) { @@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size switch (props2.properties.vendorID) { case VK_VENDOR_ID_AMD: if (ggml_hip_mgmt_init() == 0) { - int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); + int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? 
ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total); ggml_hip_mgmt_release(); return; } @@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size if (ggml_nvml_init() == 0) { int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total); ggml_nvml_release(); return; } @@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { } } + vk::PhysicalDeviceProperties2 props2; if (!ext_support) { - return ""; + device.getProperties2(&props2); + if (props2.properties.vendorID != VK_VENDOR_ID_AMD) { + return ""; + } + // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero } vk::PhysicalDeviceProperties2 props = {}; @@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { char pci_bus_id[16] = {}; snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); + if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) { + return ""; + } return std::string(pci_bus_id); } @@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml props->driver_major = ctx->driver_major; props->driver_minor = ctx->driver_minor; props->integrated = ctx->is_integrated_gpu; - props->pci_bus_id = ctx->pci_bus_id; - props->pci_device_id = ctx->pci_device_id; - props->pci_domain_id = ctx->pci_domain_id; props->library = GGML_VK_NAME; - props->numeric_id = ctx->id.empty() ? 
nullptr : ctx->id.c_str(); + props->numeric_id = ctx->numeric_id.c_str(); } static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { @@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, vk_devices[dev_idx].getProperties2(&props2); std::ostringstream oss; oss << std::hex << std::setfill('0'); - oss << "GPU-"; int byteIdx = 0; for (int i = 0; i < 16; ++i, ++byteIdx) { oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]); @@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, } } ctx->uuid = oss.str(); - ctx->pci_bus_id = pci_bus_props.pciBus; - ctx->pci_device_id = pci_bus_props.pciDevice; - ctx->pci_domain_id = pci_bus_props.pciDomain; - ctx->id = std::to_string(i); ctx->major = 0; ctx->minor = 0; // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string ctx->driver_major = 0; ctx->driver_minor = 0; + ctx->numeric_id = std::to_string(i); } initialized = true; } diff --git a/ml/backend/ggml/ggml/src/mem_hip.cpp b/ml/backend/ggml/ggml/src/mem_hip.cpp index 8ef19b8c..5a7f5d46 100644 --- a/ml/backend/ggml/ggml/src/mem_hip.cpp +++ b/ml/backend/ggml/ggml/src/mem_hip.cpp @@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() { if (gpus != NULL) gpus->pVtbl->Release(gpus); \ if (gpu != NULL) gpu->pVtbl->Release(gpu) -int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { +int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { std::lock_guard<std::mutex> lock(ggml_adlx_lock); if (adlx.handle == NULL) { GGML_LOG_INFO("%s ADLX was not initialized\n", __func__); @@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, IADLXGPU* gpu = NULL; IADLXGPUMetrics *gpuMetrics = NULL; ADLX_RESULT status; - // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs - adlx_int target = (pci_bus_id << 8) | (pci_device_id &
0xff); + uint32_t pci_domain, pci_bus, pci_device, pci_function; + if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) { + // TODO - parse other formats? + GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id); + return ADLX_NOT_FOUND; + } status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices); if (ADLX_FAILED(status)) { GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status); @@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status); continue; } - adlx_int id; - status = gpu->pVtbl->UniqueId(gpu, &id); + adlx_int uniqueID; + status = gpu->pVtbl->UniqueId(gpu, &uniqueID); if (ADLX_FAILED(status)) { GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status); gpu->pVtbl->Release(gpu); gpu = NULL; continue; } - if (id != target) { - GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id); + if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) { gpu->pVtbl->Release(gpu); gpu = NULL; continue; @@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() { return -1; } void ggml_hip_mgmt_release() {} -int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { +int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { return -1; } diff --git a/ml/device.go b/ml/device.go index 39fba7d1..57c3976b 100644 --- a/ml/device.go +++ b/ml/device.go @@ -391,6 +391,10 @@ func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison { if a.PCIID != b.PCIID { return UniqueDevice } + // If PCIID is empty, we have to use ID + library for uniqueness + if a.PCIID == "" && a.DeviceID != b.DeviceID { + return UniqueDevice + } if a.Library == b.Library { return SameBackendDevice } @@ -454,13 +458,13 @@ func (d 
DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { var envVar string switch d.Library { case "ROCm": + // ROCm must be filtered as it can crash the runner on unsupported devices envVar = "ROCR_VISIBLE_DEVICES" if runtime.GOOS != "linux" { envVar = "HIP_VISIBLE_DEVICES" } - case "Vulkan": - envVar = "GGML_VK_VISIBLE_DEVICES" default: + // CUDA and Vulkan are not filtered via env var, but via scheduling decisions return } v, existing := env[envVar]