diff --git a/discover/runner.go b/discover/runner.go index cbaba3c6..e74050d0 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -117,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. // In the second pass, we more deeply initialize the GPUs to weed out devices that // aren't supported by a given library. We run this phase in parallel to speed up discovery. - slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices)) + slog.Debug("evaluating which if any devices to filter out", "initial_count", len(devices)) ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() var wg sync.WaitGroup @@ -129,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. if devices[i].Library == "Metal" { continue } - slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID) + slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID) wg.Add(1) go func(i int) { defer wg.Done() @@ -155,6 +155,12 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. envVar: id, // Filter to just this one GPU } if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { + slog.Debug("filtering device which didn't fully initialize", + "id", devices[i].ID, + "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], + "pci_id", devices[i].PCIID, + "library", devices[i].Library, + ) needsDelete[i] = true } else { supportedMu.Lock() @@ -170,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
}(i) } wg.Wait() - logutil.Trace("supported GPU library combinations", "supported", supported) + logutil.Trace("supported GPU library combinations before filtering", "supported", supported) filterOutVulkanThatAreSupportedByOtherGPU(needsDelete) @@ -372,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) { } if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] { needsDelete[i] = true - slog.Debug("dropping Vulkan duplicate by PCI ID", - "vulkan_id", devices[i].ID, - "vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], + slog.Debug("filtering device with duplicate PCI ID", + "id", devices[i].ID, + "library", devices[i].Library, + "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "pci_id", devices[i].PCIID, - "kept_library", devices[j].Library, "kept_id", devices[j].ID, + "kept_library", devices[j].Library, ) break } @@ -422,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need } for dev, i := range byLibDirs[libDir] { if _, found := byLibDirs[newest][dev]; found { + slog.Debug("filtering device with overlapping libraries", + "id", dev, + "library", libDir, + "delete_index", i, + "kept_library", newest, + ) needsDelete[i] = true } } diff --git a/discover/types.go b/discover/types.go index b34bafd2..b1f622f4 100644 --- a/discover/types.go +++ b/discover/types.go @@ -3,6 +3,7 @@ package discover import ( "log/slog" "path/filepath" + "sort" "strings" "github.com/ollama/ollama/format" @@ -26,6 +27,7 @@ type CPU struct { } func LogDetails(devices []ml.DeviceInfo) { + sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference for _, dev := range devices { var libs []string for _, dir := range dev.LibraryPath { @@ -39,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) { } slog.Info("inference compute", "id", dev.ID, + "filtered_id", dev.FilteredID, "library", dev.Library, "compute", 
dev.Compute(), "name", dev.Name, diff --git a/llama/patches/0026-GPU-discovery-enhancements.patch b/llama/patches/0026-GPU-discovery-enhancements.patch index 82513e34..807a4689 100644 --- a/llama/patches/0026-GPU-discovery-enhancements.patch +++ b/llama/patches/0026-GPU-discovery-enhancements.patch @@ -5,24 +5,33 @@ Subject: [PATCH] GPU discovery enhancements Expose more information about the devices through backend props, and leverage management libraries for more accurate VRAM usage reporting if available. + +vulkan: get GPU ID (ollama v0.11.5) + +Signed-off-by: Xiaodong Ye + +Vulkan PCI and Memory + +fix vulkan PCI ID and ID handling --- - ggml/include/ggml-backend.h | 11 + - ggml/src/CMakeLists.txt | 2 + - ggml/src/ggml-cuda/ggml-cuda.cu | 74 +++++ - ggml/src/ggml-cuda/vendors/hip.h | 3 + - ggml/src/ggml-impl.h | 8 + - ggml/src/ggml-metal/ggml-metal.cpp | 2 + - ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++ - ggml/src/mem_nvml.cpp | 209 ++++++++++++++ - 8 files changed, 758 insertions(+) + ggml/include/ggml-backend.h | 8 + + ggml/src/CMakeLists.txt | 2 + + ggml/src/ggml-cuda/ggml-cuda.cu | 65 ++++ + ggml/src/ggml-cuda/vendors/hip.h | 3 + + ggml/src/ggml-impl.h | 8 + + ggml/src/ggml-metal/ggml-metal.cpp | 2 + + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++-- + ggml/src/mem_hip.cpp | 452 +++++++++++++++++++++++++++ + ggml/src/mem_nvml.cpp | 209 +++++++++++++ + 9 files changed, 931 insertions(+), 30 deletions(-) create mode 100644 ggml/src/mem_hip.cpp create mode 100644 ggml/src/mem_nvml.cpp diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index ba181d09d..094fc3c82 100644 +index ba181d09d..809835243 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h -@@ -169,6 +169,17 @@ extern "C" { +@@ -169,6 +169,14 @@ extern "C" { const char * device_id; // device capabilities struct ggml_backend_dev_caps caps; @@ -31,9 +40,6 @@ index ba181d09d..094fc3c82 100644 + int compute_major; + int compute_minor; + int 
integrated; -+ int pci_bus_id; -+ int pci_device_id; -+ int pci_domain_id; + const char *library; + // number with which the devices are accessed (Vulkan) + const char *numeric_id; @@ -54,7 +60,7 @@ index 0609c6503..aefe43bdd 100644 target_include_directories(ggml-base PRIVATE .) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 87c6c34a4..816597d2f 100644 +index 87c6c34a4..b075a18be 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -86,7 +92,7 @@ index 87c6c34a4..816597d2f 100644 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ggml_cuda_parse_uuid(prop, id).c_str()); -@@ -3484,6 +3499,14 @@ struct ggml_backend_cuda_device_context { +@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context { std::string description; std::string pci_bus_id; std::string id; @@ -95,22 +101,19 @@ index 87c6c34a4..816597d2f 100644 + int driver_major; + int driver_minor; + int integrated; -+ int pciBusID; -+ int pciDeviceID; -+ int pciDomainID; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -3504,6 +3527,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { +@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); + +#if defined(GGML_USE_HIP) + if (ggml_hip_mgmt_init() == 0) { -+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total); ++ int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total); + if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing ADLX memory 
reporting free: %zu total: %zu\n", __func__, *free, *total); ++ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total); + ggml_hip_mgmt_release(); + return; + } @@ -120,7 +123,7 @@ index 87c6c34a4..816597d2f 100644 + if (ggml_nvml_init() == 0) { + int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total); + if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); ++ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total); + ggml_nvml_release(); + return; + } @@ -130,7 +133,7 @@ index 87c6c34a4..816597d2f 100644 CUDA_CHECK(cudaMemGetInfo(free, total)); } -@@ -3512,6 +3557,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend +@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -138,7 +141,7 @@ index 87c6c34a4..816597d2f 100644 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; -@@ -3525,6 +3571,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back +@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back // If you need the memory data, call ggml_backend_dev_memory() explicitly. 
props->memory_total = props->memory_free = 0; @@ -153,15 +156,12 @@ index 87c6c34a4..816597d2f 100644 + props->driver_major = ctx->driver_major; + props->driver_minor = ctx->driver_minor; + props->integrated = ctx->integrated; -+ props->pci_bus_id = ctx->pciBusID; -+ props->pci_device_id = ctx->pciDeviceID; -+ props->pci_domain_id = ctx->pciDomainID; + props->library = GGML_CUDA_NAME; + bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; #ifdef GGML_CUDA_NO_PEER_COPY bool events = false; -@@ -4087,6 +4149,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { std::lock_guard lock(mutex); if (!initialized) { ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; @@ -169,7 +169,7 @@ index 87c6c34a4..816597d2f 100644 for (int i = 0; i < ggml_cuda_info().device_count; i++) { ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; -@@ -4102,6 +4165,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); dev_ctx->pci_bus_id = pci_bus_id; @@ -181,9 +181,6 @@ index 87c6c34a4..816597d2f 100644 + dev_ctx->driver_major = driverVersion / 1000; + dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10; + dev_ctx->integrated = prop.integrated; -+ dev_ctx->pciBusID = prop.pciBusID; -+ dev_ctx->pciDeviceID = prop.pciDeviceID; -+ dev_ctx->pciDomainID = prop.pciDomainID; ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ &reg, @@ -209,7 +206,7 @@ index 1f06be80e..2f9ef2dc0 100644 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h -index d0fb3bcca..80597b6ea 100644 
+index d0fb3bcca..b63edd0c1 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx @@ -221,7 +218,7 @@ index d0fb3bcca..80597b6ea 100644 +GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total); +GGML_API void ggml_nvml_release(); +GGML_API int ggml_hip_mgmt_init(); -+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total); ++GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total); +GGML_API void ggml_hip_mgmt_release(); + #ifdef __cplusplus @@ -247,12 +244,319 @@ index f2ff9f322..f356e4a0a 100644 props->caps = { /* .async = */ true, /* .host_buffer = */ false, +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index ed83236f4..0bbcecd01 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -231,6 +231,7 @@ class vk_memory_logger; + #endif + class vk_perf_logger; + static void ggml_vk_destroy_buffer(vk_buffer& buf); ++static std::string ggml_vk_get_device_id(int device); + + static constexpr uint32_t mul_mat_vec_max_cols = 8; + static constexpr uint32_t p021_max_gqa_ratio = 8; +@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_ + snprintf(description, description_size, "%s", props.deviceName.data()); + } + ++static std::string ggml_vk_get_device_id(int device) { ++ ggml_vk_instance_init(); ++ ++ std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); ++ ++ vk::PhysicalDeviceProperties2 props; ++ vk::PhysicalDeviceIDProperties deviceIDProps; ++ props.pNext = &deviceIDProps; ++ devices[device].getProperties2(&props); ++ ++ const auto& uuid = deviceIDProps.deviceUUID; ++ char id[64]; ++ snprintf(id, sizeof(id), ++ "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", ++ uuid[0], 
uuid[1], uuid[2], uuid[3], ++ uuid[4], uuid[5], ++ uuid[6], uuid[7], ++ uuid[8], uuid[9], ++ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15] ++ ); ++ return std::string(id); ++} ++ + // backend interface + + #define UNUSED GGML_UNUSED +@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size + ggml_vk_get_device_description(dev_idx, description, description_size); + } + +-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { ++std::string ggml_backend_vk_get_device_id(int device) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); +- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); ++ int dev_idx = vk_instance.device_indices[device]; ++ return ggml_vk_get_device_id(dev_idx); ++} ++ ++////////////////////////// ++ ++struct ggml_backend_vk_device_context { ++ size_t device; ++ std::string name; ++ std::string description; ++ bool is_integrated_gpu; ++ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function) ++ std::string pci_id; ++ std::string id; ++ std::string uuid; ++ std::string numeric_id; ++ int major; ++ int minor; ++ int driver_major; ++ int driver_minor; ++}; ++ ++void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) { ++ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size()); ++ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size()); ++ ++ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]]; + +- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; +- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops; +- vk::PhysicalDeviceMemoryProperties2 memprops = {}; +- bool membudget_supported = vk_instance.device_supports_membudget[device]; ++ vk::PhysicalDeviceMemoryProperties memprops = 
vkdev.getMemoryProperties(); ++ vk::PhysicalDeviceProperties2 props2; ++ vkdev.getProperties2(&props2); + +- if (membudget_supported) { +- memprops.pNext = &budgetprops; ++ if (!ctx->is_integrated_gpu) ++ { ++ // Use vendor specific management libraries for best VRAM reporting if available ++ switch (props2.properties.vendorID) { ++ case VK_VENDOR_ID_AMD: ++ if (ggml_hip_mgmt_init() == 0) { ++ int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total); ++ if (status == 0) { ++ GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total); ++ ggml_hip_mgmt_release(); ++ return; ++ } ++ ggml_hip_mgmt_release(); ++ } ++ break; ++ case VK_VENDOR_ID_NVIDIA: ++ if (ggml_nvml_init() == 0) { ++ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total); ++ if (status == 0) { ++ GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total); ++ ggml_nvml_release(); ++ return; ++ } ++ ggml_nvml_release(); ++ } ++ break; ++ } + } +- vkdev.getMemoryProperties2(&memprops); ++ // else fallback to memory budget if supported + +- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) { +- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i]; ++ *total = 0; ++ *free = 0; ++ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; ++ vk::PhysicalDeviceMemoryProperties2 memprops2; ++ memprops2.pNext = &mem_budget_props; ++ vkdev.getMemoryProperties2(&memprops2); ++ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { ++ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { ++ *total += memprops2.memoryProperties.memoryHeaps[i].size; ++ } else if (ctx->is_integrated_gpu) { ++ // Include shared memory on iGPUs ++ *total += 
memprops2.memoryProperties.memoryHeaps[i].size; ++ } ++ } ++ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { ++ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { ++ *free += mem_budget_props.heapBudget[i]; ++ } else if (ctx->is_integrated_gpu) { ++ *free += mem_budget_props.heapBudget[i]; ++ } ++ } ++ if (*total > 0 && *free > 0) { ++ return; ++ } else if (*total > 0) { ++ *free = *total; ++ return; ++ } + ++ // else just report the physical memory ++ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) { + if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + *total = heap.size; +- +- if (membudget_supported && i < budgetprops.heapUsage.size()) { +- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i]; +- } else { +- *free = heap.size; +- } ++ *free = heap.size; + break; + } + } +@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { + } + } + ++ vk::PhysicalDeviceProperties2 props2; + if (!ext_support) { +- return ""; ++ device.getProperties2(&props2); ++ if (props2.properties.vendorID != VK_VENDOR_ID_AMD) { ++ return ""; ++ } ++ // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero + } + + vk::PhysicalDeviceProperties2 props = {}; +@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { + + char pci_bus_id[16] = {}; + snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); ++ if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) { ++ return ""; ++ } + + return std::string(pci_bus_id); + } + +-////////////////////////// +- +-struct ggml_backend_vk_device_context { +- size_t device; +- std::string name; +- std::string description; +- bool is_integrated_gpu; +- std::string pci_bus_id; +-}; ++static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int 
*domain, int *bus, int *device) { ++ if (id.empty()) return false; ++ unsigned int d = 0, b = 0, dev = 0, func = 0; ++ // Expected format: dddd:bb:dd.f (all hex) ++ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func); ++ if (n < 4) return false; ++ if (domain) *domain = (int) d; ++ if (bus) *bus = (int) b; ++ if (device) *device = (int) dev; ++ return true; ++} + + static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; +@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de + return ctx->description.c_str(); + } + ++static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) { ++ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; ++ return ctx->id.c_str(); ++} ++ + static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; +- ggml_backend_vk_get_device_memory(ctx->device, free, total); ++ ggml_backend_vk_get_device_memory(ctx, free, total); + } + + static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { +@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml + + props->name = ggml_backend_vk_device_get_name(dev); + props->description = ggml_backend_vk_device_get_description(dev); ++ props->id = ggml_backend_vk_device_get_id(dev); + props->type = ggml_backend_vk_device_get_type(dev); +- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); ++ props->device_id = ctx->pci_id.empty() ? 
nullptr : ctx->pci_id.c_str(); + ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, +@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; ++ ++ props->compute_major = ctx->major; ++ props->compute_minor = ctx->minor; ++ props->driver_major = ctx->driver_major; ++ props->driver_minor = ctx->driver_minor; ++ props->integrated = ctx->is_integrated_gpu; ++ props->library = GGML_VK_NAME; ++ props->numeric_id = ctx->numeric_id.c_str(); + } + + static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { +@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { ++ std::vector vk_devices = vk_instance.instance.enumeratePhysicalDevices(); ++ + for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { + ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; + char desc[256]; +@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, + ctx->name = GGML_VK_NAME + std::to_string(i); + ctx->description = desc; + ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; +- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); ++ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i); ++ ctx->id = ggml_backend_vk_get_device_id(i); + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_vk_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); ++ ++ // Gather additional information about the device ++ int dev_idx = vk_instance.device_indices[i]; ++ vk::PhysicalDeviceProperties props1; ++ vk_devices[dev_idx].getProperties(&props1); ++ vk::PhysicalDeviceProperties2 props2; ++ 
vk::PhysicalDeviceIDProperties device_id_props; ++ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props; ++ vk::PhysicalDeviceDriverProperties driver_props; ++ props2.pNext = &device_id_props; ++ device_id_props.pNext = &pci_bus_props; ++ pci_bus_props.pNext = &driver_props; ++ vk_devices[dev_idx].getProperties2(&props2); ++ std::ostringstream oss; ++ oss << std::hex << std::setfill('0'); ++ int byteIdx = 0; ++ for (int i = 0; i < 16; ++i, ++byteIdx) { ++ oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]); ++ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) { ++ oss << '-'; ++ } ++ } ++ ctx->uuid = oss.str(); ++ ctx->major = 0; ++ ctx->minor = 0; ++ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string ++ ctx->driver_major = 0; ++ ctx->driver_minor = 0; ++ ctx->numeric_id = std::to_string(i); + } + initialized = true; + } diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp new file mode 100644 -index 000000000..8ef19b8cf +index 000000000..5a7f5d465 --- /dev/null +++ b/ggml/src/mem_hip.cpp -@@ -0,0 +1,449 @@ +@@ -0,0 +1,452 @@ +#include "ggml.h" + +#ifdef _WIN32 @@ -586,7 +890,7 @@ index 000000000..8ef19b8cf + if (gpus != NULL) gpus->pVtbl->Release(gpus); \ + if (gpu != NULL) gpu->pVtbl->Release(gpu) + -+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { ++int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { + std::lock_guard lock(ggml_adlx_lock); + if (adlx.handle == NULL) { + GGML_LOG_INFO("%s ADLX was not initialized\n", __func__); @@ -598,9 +902,13 @@ index 000000000..8ef19b8cf + IADLXGPU* gpu = NULL; + IADLXGPUMetrics *gpuMetrics = NULL; + ADLX_RESULT status; -+ // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs -+ adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff); + ++ uint32_t pci_domain, pci_bus, pci_device, pci_function; ++ if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, 
&pci_function) != 4) { ++ // TODO - parse other formats? ++ GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id); ++ return ADLX_NOT_FOUND; ++ } + status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices); + if (ADLX_FAILED(status)) { + GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status); @@ -623,16 +931,15 @@ index 000000000..8ef19b8cf + GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status); + continue; + } -+ adlx_int id; -+ status = gpu->pVtbl->UniqueId(gpu, &id); ++ adlx_int uniqueID; ++ status = gpu->pVtbl->UniqueId(gpu, &uniqueID); + if (ADLX_FAILED(status)) { + GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status); + gpu->pVtbl->Release(gpu); + gpu = NULL; + continue; + } -+ if (id != target) { -+ GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id); ++ if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) { + gpu->pVtbl->Release(gpu); + gpu = NULL; + continue; @@ -695,7 +1002,7 @@ index 000000000..8ef19b8cf + return -1; +} +void ggml_hip_mgmt_release() {} -+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { ++int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { + return -1; +} + diff --git a/llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch similarity index 99% rename from llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch rename to llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch index 9ba11168..ec3fdbaa 100644 --- a/llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch +++ b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch @@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs 1 file changed, 68 insertions(+), 3 deletions(-) diff --git 
a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp -index c9073cef..f473a2a2 100644 +index c9073cef0..f473a2a2c 100644 --- a/ggml/src/mem_nvml.cpp +++ b/ggml/src/mem_nvml.cpp @@ -13,6 +13,7 @@ diff --git a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch b/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch deleted file mode 100644 index 997dd386..00000000 --- a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch +++ /dev/null @@ -1,95 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Xiaodong Ye -Date: Mon, 18 Aug 2025 12:48:07 +0800 -Subject: [PATCH] vulkan: get GPU ID (ollama v0.11.5) - -Signed-off-by: Xiaodong Ye ---- - ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 ++++++++++++++++++++++++++++ - 1 file changed, 37 insertions(+) - -diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 061cd078..adea7783 100644 ---- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp -+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -11588,6 +11588,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_ - snprintf(description, description_size, "%s", props.deviceName.data()); - } - -+static std::string ggml_vk_get_device_id(int device) { -+ ggml_vk_instance_init(); -+ -+ std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); -+ -+ vk::PhysicalDeviceProperties2 props; -+ vk::PhysicalDeviceIDProperties deviceIDProps; -+ props.pNext = &deviceIDProps; -+ devices[device].getProperties2(&props); -+ -+ const auto& uuid = deviceIDProps.deviceUUID; -+ char id[64]; -+ snprintf(id, sizeof(id), -+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", -+ uuid[0], uuid[1], uuid[2], uuid[3], -+ uuid[4], uuid[5], -+ uuid[6], uuid[7], -+ uuid[8], uuid[9], -+ uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15] -+ ); -+ return std::string(id); -+} -+ - // backend interface - - #define UNUSED GGML_UNUSED -@@ -12394,6 +12417,12 @@ void 
ggml_backend_vk_get_device_description(int device, char * description, size - ggml_vk_get_device_description(dev_idx, description, description_size); - } - -+std::string ggml_backend_vk_get_device_id(int device) { -+ GGML_ASSERT(device < (int) vk_instance.device_indices.size()); -+ int dev_idx = vk_instance.device_indices[device]; -+ return ggml_vk_get_device_id(dev_idx); -+} -+ - void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { - GGML_ASSERT(device < (int) vk_instance.device_indices.size()); - GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); -@@ -12481,6 +12510,7 @@ struct ggml_backend_vk_device_context { - std::string description; - bool is_integrated_gpu; - std::string pci_bus_id; -+ std::string id; - }; - - static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { -@@ -12493,6 +12523,11 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de - return ctx->description.c_str(); - } - -+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) { -+ ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; -+ return ctx->id.c_str(); -+} -+ - static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; - ggml_backend_vk_get_device_memory(ctx->device, free, total); -@@ -12519,6 +12554,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml - - props->name = ggml_backend_vk_device_get_name(dev); - props->description = ggml_backend_vk_device_get_description(dev); -+ props->id = ggml_backend_vk_device_get_id(dev); - props->type = ggml_backend_vk_device_get_type(dev); - props->device_id = ctx->pci_bus_id.empty() ? 
nullptr : ctx->pci_bus_id.c_str(); - ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); -@@ -12965,6 +13001,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, - ctx->description = desc; - ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; - ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); -+ ctx->id = ggml_backend_vk_get_device_id(i); - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_vk_device_i, - /* .reg = */ reg, --- -2.51.0 \ No newline at end of file diff --git a/llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch similarity index 97% rename from llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch rename to llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch index c3c7fedf..f5861a8c 100644 --- a/llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch +++ b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch @@ -28,7 +28,7 @@ Co-authored-by: Johannes Gäßler 1 file changed, 9 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 6a278b5e9..87941f872 100644 +index b075a18be..d62f412d6 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() { diff --git a/llama/patches/0028-vulkan-pci-and-memory.patch b/llama/patches/0028-vulkan-pci-and-memory.patch deleted file mode 100644 index c20ccf5c..00000000 --- a/llama/patches/0028-vulkan-pci-and-memory.patch +++ /dev/null @@ -1,254 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Daniel Hiltgen -Date: Fri Sep 5 08:25:03 2025 -0700 -Subject: [PATCH] Vulkan PCI and Memory - ---- - 
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 176 ++++++++++++++++++++++----- - 1 file changed, 145 insertions(+), 31 deletions(-) - -diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index adea7783..fb7204ce 100644 ---- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp -+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -12423,31 +12423,99 @@ std::string ggml_backend_vk_get_device_id(int device) { - return ggml_vk_get_device_id(dev_idx); - } - --void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { -- GGML_ASSERT(device < (int) vk_instance.device_indices.size()); -- GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size()); -+////////////////////////// -+ -+struct ggml_backend_vk_device_context { -+ size_t device; -+ std::string name; -+ std::string description; -+ bool is_integrated_gpu; -+ // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function) -+ std::string pci_id; -+ std::string id; -+ std::string uuid; -+ int major; -+ int minor; -+ int driver_major; -+ int driver_minor; -+ int pci_bus_id; -+ int pci_device_id; -+ int pci_domain_id; -+}; -+ -+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) { -+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size()); -+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size()); -+ -+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]]; - -- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; -- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops; -- vk::PhysicalDeviceMemoryProperties2 memprops = {}; -- bool membudget_supported = vk_instance.device_supports_membudget[device]; -+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); -+ vk::PhysicalDeviceProperties2 props2; -+ vkdev.getProperties2(&props2); - 
-- if (membudget_supported) { -- memprops.pNext = &budgetprops; -+ if (!ctx->is_integrated_gpu) -+ { -+ // Use vendor specific management libraries for best VRAM reporting if available -+ switch (props2.properties.vendorID) { -+ case VK_VENDOR_ID_AMD: -+ if (ggml_hip_mgmt_init() == 0) { -+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); -+ if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); -+ ggml_hip_mgmt_release(); -+ return; -+ } -+ ggml_hip_mgmt_release(); -+ } -+ break; -+ case VK_VENDOR_ID_NVIDIA: -+ if (ggml_nvml_init() == 0) { -+ int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total); -+ if (status == 0) { -+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); -+ ggml_nvml_release(); -+ return; -+ } -+ ggml_nvml_release(); -+ } -+ break; -+ } - } -- vkdev.getMemoryProperties2(&memprops); -+ // else fallback to memory budget if supported - -- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) { -- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i]; -+ *total = 0; -+ *free = 0; -+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; -+ vk::PhysicalDeviceMemoryProperties2 memprops2; -+ memprops2.pNext = &mem_budget_props; -+ vkdev.getMemoryProperties2(&memprops2); -+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { -+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { -+ *total += memprops2.memoryProperties.memoryHeaps[i].size; -+ } else if (ctx->is_integrated_gpu) { -+ // Include shared memory on iGPUs -+ *total += memprops2.memoryProperties.memoryHeaps[i].size; -+ } -+ } -+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) { -+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { -+ *free += 
mem_budget_props.heapBudget[i]; -+ } else if (ctx->is_integrated_gpu) { -+ *free += mem_budget_props.heapBudget[i]; -+ } -+ } -+ if (*total > 0 && *free > 0) { -+ return; -+ } else if (*total > 0) { -+ *free = *total; -+ return; -+ } - -+ // else just report the physical memory -+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) { - if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { - *total = heap.size; -- -- if (membudget_supported && i < budgetprops.heapUsage.size()) { -- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i]; -- } else { -- *free = heap.size; -- } -+ *free = heap.size; - break; - } - } -@@ -12502,16 +12570,17 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { - return std::string(pci_bus_id); - } - --////////////////////////// -- --struct ggml_backend_vk_device_context { -- size_t device; -- std::string name; -- std::string description; -- bool is_integrated_gpu; -- std::string pci_bus_id; -- std::string id; --}; -+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) { -+ if (id.empty()) return false; -+ unsigned int d = 0, b = 0, dev = 0, func = 0; -+ // Expected format: dddd:bb:dd.f (all hex) -+ int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func); -+ if (n < 4) return false; -+ if (domain) *domain = (int) d; -+ if (bus) *bus = (int) b; -+ if (device) *device = (int) dev; -+ return true; -+} - - static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; -@@ -12530,7 +12599,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) { - - static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; -- ggml_backend_vk_get_device_memory(ctx->device, free, total); 
-+ ggml_backend_vk_get_device_memory(ctx, free, total); - } - - static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { -@@ -12556,7 +12625,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml - props->description = ggml_backend_vk_device_get_description(dev); - props->id = ggml_backend_vk_device_get_id(dev); - props->type = ggml_backend_vk_device_get_type(dev); -- props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); -+ props->device_id = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str(); - ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, -@@ -12564,6 +12633,17 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -+ -+ props->compute_major = ctx->major; -+ props->compute_minor = ctx->minor; -+ props->driver_major = ctx->driver_major; -+ props->driver_minor = ctx->driver_minor; -+ props->integrated = ctx->is_integrated_gpu; -+ props->pci_bus_id = ctx->pci_bus_id; -+ props->pci_device_id = ctx->pci_device_id; -+ props->pci_domain_id = ctx->pci_domain_id; -+ props->library = GGML_VK_NAME; -+ props->numeric_id = ctx->id.empty() ? 
nullptr : ctx->id.c_str(); - } - - static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { -@@ -12992,6 +13071,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { -+ std::vector vk_devices = vk_instance.instance.enumeratePhysicalDevices(); -+ - for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { - ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; - char desc[256]; -@@ -13000,13 +13081,46 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, - ctx->name = GGML_VK_NAME + std::to_string(i); - ctx->description = desc; - ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; -- ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i); -+ ctx->pci_id = ggml_backend_vk_get_device_pci_id(i); - ctx->id = ggml_backend_vk_get_device_id(i); - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_vk_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); -+ -+ // Gather additional information about the device -+ int dev_idx = vk_instance.device_indices[i]; -+ vk::PhysicalDeviceProperties props1; -+ vk_devices[dev_idx].getProperties(&props1); -+ vk::PhysicalDeviceProperties2 props2; -+ vk::PhysicalDeviceIDProperties device_id_props; -+ vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_props; -+ vk::PhysicalDeviceDriverProperties driver_props; -+ props2.pNext = &device_id_props; -+ device_id_props.pNext = &pci_bus_props; -+ pci_bus_props.pNext = &driver_props; -+ vk_devices[dev_idx].getProperties2(&props2); -+ std::ostringstream oss; -+ oss << std::hex << std::setfill('0'); -+ oss << "GPU-"; -+ int byteIdx = 0; -+ for (int i = 0; i < 16; ++i, ++byteIdx) { -+ oss << std::setw(2) << static_cast(device_id_props.deviceUUID[i]); -+ if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) { 
-+ oss << '-'; -+ } -+ } -+ ctx->uuid = oss.str(); -+ ctx->pci_bus_id = pci_bus_props.pciBus; -+ ctx->pci_device_id = pci_bus_props.pciDevice; -+ ctx->pci_domain_id = pci_bus_props.pciDomain; -+ ctx->id = std::to_string(i); -+ ctx->major = 0; -+ ctx->minor = 0; -+ // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string -+ ctx->driver_major = 0; -+ ctx->driver_minor = 0; - } - initialized = true; - } --- -2.51.0 \ No newline at end of file diff --git a/llama/patches/0031-report-LoadLibrary-failures.patch b/llama/patches/0029-report-LoadLibrary-failures.patch similarity index 100% rename from llama/patches/0031-report-LoadLibrary-failures.patch rename to llama/patches/0029-report-LoadLibrary-failures.patch diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 64aae141..3feb5b5d 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -725,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo { if props.library != nil { info.Library = C.GoString(props.library) } - info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id) + if props.device_id != nil { + info.PCIID = C.GoString(props.device_id) + } info.LibraryPath = ggml.LibPaths() if props.numeric_id != nil { info.FilteredID = C.GoString(props.numeric_id) diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 094fc3c8..80983524 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -174,9 +174,6 @@ extern "C" { int compute_major; int compute_minor; int integrated; - int pci_bus_id; - int pci_device_id; - int pci_domain_id; const char *library; // number with which the devices are accessed (Vulkan) const char *numeric_id; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index f9cf2d4f..d62f412d 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context { int driver_major; int driver_minor; int integrated; - int pciBusID; - int pciDeviceID; - int pciDomainID; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { @@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * #if defined(GGML_USE_HIP) if (ggml_hip_mgmt_init() == 0) { - int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total); + int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total); ggml_hip_mgmt_release(); return; } @@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * if (ggml_nvml_init() == 0) { int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total); ggml_nvml_release(); return; } @@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back props->driver_major = ctx->driver_major; props->driver_minor = ctx->driver_minor; props->integrated = ctx->integrated; - props->pci_bus_id = ctx->pciBusID; - props->pci_device_id = ctx->pciDeviceID; - props->pci_domain_id = ctx->pciDomainID; props->library = GGML_CUDA_NAME; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; @@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { 
dev_ctx->driver_major = driverVersion / 1000; dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10; dev_ctx->integrated = prop.integrated; - dev_ctx->pciBusID = prop.pciBusID; - dev_ctx->pciDeviceID = prop.pciDeviceID; - dev_ctx->pciDomainID = prop.pciDomainID; ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ ®, diff --git a/ml/backend/ggml/ggml/src/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h index 80597b6e..b63edd0c 100644 --- a/ml/backend/ggml/ggml/src/ggml-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-impl.h @@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init(); GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total); GGML_API void ggml_nvml_release(); GGML_API int ggml_hip_mgmt_init(); -GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total); +GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total); GGML_API void ggml_hip_mgmt_release(); #ifdef __cplusplus diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 564bc4a7..0bbcecd0 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -231,6 +231,7 @@ class vk_memory_logger; #endif class vk_perf_logger; static void ggml_vk_destroy_buffer(vk_buffer& buf); +static std::string ggml_vk_get_device_id(int device); static constexpr uint32_t mul_mat_vec_max_cols = 8; static constexpr uint32_t p021_max_gqa_ratio = 8; @@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) { const auto& uuid = deviceIDProps.deviceUUID; char id[64]; snprintf(id, sizeof(id), - "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], 
uuid[6], uuid[7], @@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context { std::string pci_id; std::string id; std::string uuid; + std::string numeric_id; int major; int minor; int driver_major; int driver_minor; - int pci_bus_id; - int pci_device_id; - int pci_domain_id; }; void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) { @@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size switch (props2.properties.vendorID) { case VK_VENDOR_ID_AMD: if (ggml_hip_mgmt_init() == 0) { - int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); + int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? 
ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total); ggml_hip_mgmt_release(); return; } @@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size if (ggml_nvml_init() == 0) { int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total); if (status == 0) { - GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total); ggml_nvml_release(); return; } @@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { } } + vk::PhysicalDeviceProperties2 props2; if (!ext_support) { - return ""; + device.getProperties2(&props2); + if (props2.properties.vendorID != VK_VENDOR_ID_AMD) { + return ""; + } + // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero } vk::PhysicalDeviceProperties2 props = {}; @@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { char pci_bus_id[16] = {}; snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); + if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) { + return ""; + } return std::string(pci_bus_id); } @@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml props->driver_major = ctx->driver_major; props->driver_minor = ctx->driver_minor; props->integrated = ctx->is_integrated_gpu; - props->pci_bus_id = ctx->pci_bus_id; - props->pci_device_id = ctx->pci_device_id; - props->pci_domain_id = ctx->pci_domain_id; props->library = GGML_VK_NAME; - props->numeric_id = ctx->id.empty() ? 
nullptr : ctx->id.c_str(); + props->numeric_id = ctx->numeric_id.c_str(); } static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { @@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, vk_devices[dev_idx].getProperties2(&props2); std::ostringstream oss; oss << std::hex << std::setfill('0'); - oss << "GPU-"; int byteIdx = 0; for (int i = 0; i < 16; ++i, ++byteIdx) { oss << std::setw(2) << static_cast(device_id_props.deviceUUID[i]); @@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, } } ctx->uuid = oss.str(); - ctx->pci_bus_id = pci_bus_props.pciBus; - ctx->pci_device_id = pci_bus_props.pciDevice; - ctx->pci_domain_id = pci_bus_props.pciDomain; - ctx->id = std::to_string(i); ctx->major = 0; ctx->minor = 0; // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string ctx->driver_major = 0; ctx->driver_minor = 0; + ctx->numeric_id = std::to_string(i); } initialized = true; } diff --git a/ml/backend/ggml/ggml/src/mem_hip.cpp b/ml/backend/ggml/ggml/src/mem_hip.cpp index 8ef19b8c..5a7f5d46 100644 --- a/ml/backend/ggml/ggml/src/mem_hip.cpp +++ b/ml/backend/ggml/ggml/src/mem_hip.cpp @@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() { if (gpus != NULL) gpus->pVtbl->Release(gpus); \ if (gpu != NULL) gpu->pVtbl->Release(gpu) -int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { +int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { std::lock_guard lock(ggml_adlx_lock); if (adlx.handle == NULL) { GGML_LOG_INFO("%s ADLX was not initialized\n", __func__); @@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, IADLXGPU* gpu = NULL; IADLXGPUMetrics *gpuMetrics = NULL; ADLX_RESULT status; - // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs - adlx_int target = (pci_bus_id << 8) | (pci_device_id & 
0xff); + uint32_t pci_domain, pci_bus, pci_device, pci_function; + if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) { + // TODO - parse other formats? + GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id); + return ADLX_NOT_FOUND; + } status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices); if (ADLX_FAILED(status)) { GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status); @@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status); continue; } - adlx_int id; - status = gpu->pVtbl->UniqueId(gpu, &id); + adlx_int uniqueID; + status = gpu->pVtbl->UniqueId(gpu, &uniqueID); if (ADLX_FAILED(status)) { GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status); gpu->pVtbl->Release(gpu); gpu = NULL; continue; } - if (id != target) { - GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id); + if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) { gpu->pVtbl->Release(gpu); gpu = NULL; continue; @@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() { return -1; } void ggml_hip_mgmt_release() {} -int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) { +int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) { return -1; } diff --git a/ml/device.go b/ml/device.go index 39fba7d1..57c3976b 100644 --- a/ml/device.go +++ b/ml/device.go @@ -391,6 +391,10 @@ func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison { if a.PCIID != b.PCIID { return UniqueDevice } + // If PCIID is empty, we have to use ID + library for uniqueness + if a.PCIID == "" && a.DeviceID != b.DeviceID { + return UniqueDevice + } if a.Library == b.Library { return SameBackendDevice } @@ -454,13 +458,13 @@ func (d 
DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { var envVar string switch d.Library { case "ROCm": + // ROCm must be filtered as it can crash the runner on unsupported devices envVar = "ROCR_VISIBLE_DEVICES" if runtime.GOOS != "linux" { envVar = "HIP_VISIBLE_DEVICES" } - case "Vulkan": - envVar = "GGML_VK_VISIBLE_DEVICES" default: + // CUDA and Vulkan are not filtered via env var, but via scheduling decisions return } v, existing := env[envVar]