vulkan: Add memory detection for Intel GPU using DXGI+PDH (#12664)

* PDH free memory skeleton

* Add PDH printing

* Add LUID support for Vulkan

* wire luid from ggml-vulkan to mem-dxgi-pdh file

* Fix to ggml-impl

* Continue skeleton

* Implemented ggml_dxgi_pdh_get_device_memory

* fix comments

* Fix - change value GB to bytes

* add ifdefs to only support windows and not linux

* modify error codes

* Finished ggml_dxgi_pdh_init() function

* completed ggml_dxgi_pdh_release()

* Formatting changes, add static to functions

* fix build errors

* fix go build error

* fix luid - now should match between dxgi and vulkan

* Fix the free memory reporting (was using copy by value, change to reference)

* keep only dxgi1_2.h

* Modifications based on PR feedback

* fix merge conflicts (2) and fix desc1.description printout

* move dxgi + pdh api calls to before the vendor specific library calls

* change from 3 samples to 1 sample for PDH

* modify when old_mode is set

* add fix for building MacOS

* fix release and returns for other vendors

* add patch file
This commit is contained in:
virajwad
2025-11-04 15:11:55 -07:00
committed by GitHub
parent d3b4b9970a
commit 220e133fca
5 changed files with 747 additions and 3 deletions

View File

@@ -211,6 +211,7 @@ add_library(ggml-base
ggml-quants.h
mem_hip.cpp
mem_nvml.cpp
mem_dxgi_pdh.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)

View File

@@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release();
GGML_API int ggml_hip_mgmt_init();
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
GGML_API void ggml_hip_mgmt_release();
GGML_API int ggml_dxgi_pdh_init();
GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu);
GGML_API void ggml_dxgi_pdh_release();
#ifdef __cplusplus
}

View File

@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16"
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000)
#define VK_LUID_SIZE_KHR VK_LUID_SIZE
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
VkStructureType sType;
@@ -12433,6 +12434,7 @@ struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
std::string luid;
std::string numeric_id;
int major;
int minor;
@@ -12449,8 +12451,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
vk::PhysicalDeviceProperties2 props2;
vkdev.getProperties2(&props2);
GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str());
GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: luid %s\n", ctx->luid.c_str());
if (!ctx->is_integrated_gpu)
// Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic)
if (ggml_dxgi_pdh_init() == 0) {
GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n");
int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total);
ggml_dxgi_pdh_release();
return;
}
ggml_dxgi_pdh_release();
}
if (!ctx->is_integrated_gpu)
{
// Use vendor specific management libraries for best VRAM reporting if available
switch (props2.properties.vendorID) {
@@ -12478,8 +12494,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
break;
}
}
// else fallback to memory budget if supported
// else fallback to memory budget if supported
*total = 0;
*free = 0;
vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
@@ -13091,7 +13107,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
/* .reg = */ reg,
/* .context = */ ctx,
});
// Gather additional information about the device
int dev_idx = vk_instance.device_indices[i];
vk::PhysicalDeviceProperties props1;
@@ -13114,6 +13129,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();
const auto& luid = device_id_props.deviceLUID;
char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars
snprintf(luid_str, sizeof(luid_str), // high part + low part
"0x%02x%02x%02x%02x%02x%02x%02x%02x",
luid[7], luid[6], luid[5], luid[4],
luid[3], luid[2], luid[1], luid[0]
);
ctx->luid = std::string(luid_str);
ctx->major = 0;
ctx->minor = 0;
// TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string

View File

@@ -0,0 +1,297 @@
// DXGI and PDH Performance Counters Library
// This Windows-only (10/11) library provides accurate VRAM reporting
#include "ggml.h"
#include "ggml-impl.h"
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
#include <windows.h>
#include <pdh.h>
#include <dxgi1_2.h>
#include <algorithm>
#include <filesystem>
#include <iomanip>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
// Short alias for std::filesystem, used when building the DLL paths in ggml_dxgi_pdh_init().
namespace fs = std::filesystem;
// Serializes access to dll_functions: held for the duration of ggml_dxgi_pdh_init(),
// ggml_dxgi_pdh_release() and ggml_dxgi_pdh_get_device_memory().
static std::mutex ggml_dxgi_pdh_lock;
/*
Per-adapter record built while enumerating adapters with DXGI and later
completed with the PDH usage counters.
All memory quantities are byte counts stored as double.
*/
struct GpuInfo {
    std::wstring description;     // adapter description copied from DXGI_ADAPTER_DESC1 (debug field)
    LUID luid{};                  // adapter LUID; value-initialized so a default GpuInfo never holds garbage
    std::wstring pdhInstance;     // PDH instance-name prefix derived from the LUID
    double dedicatedTotal = 0.0;  // DXGI DedicatedVideoMemory
    double sharedTotal = 0.0;     // DXGI SharedSystemMemory
    double dedicatedUsage = 0.0;  // PDH "Dedicated Usage" counter value
    double sharedUsage = 0.0;     // PDH "Shared Usage" counter value
};
/*
DLL Function Pointers
*/
struct {
void *dxgi_dll_handle;
void *pdh_dll_handle;
// DXGI Functions
HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory);
// PDH functions
PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery);
PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter);
PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery);
PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue);
PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery);
} dll_functions {
nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr
};
/*
Build the PDH instance-name prefix for an adapter LUID:
"luid_0x" + HighPart + "_0x" + LowPart, each part written as upper-case hex
zero-padded to at least 8 digits.
*/
static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) {
    std::wstringstream name;
    // hex, uppercase and the fill character persist across insertions;
    // the field width does not, so it is set again before each number.
    name << std::hex << std::uppercase << std::setfill(L'0');
    name << L"luid_0x" << std::setw(8) << luid.HighPart;
    name << L"_0x" << std::setw(8) << luid.LowPart;
    return name.str();
}
/*
Convert a byte count to GB, where 1 GB is taken as 1024^3 bytes.
Only used to make the log output readable.
*/
template <typename T>
static inline double b_to_gb(T n)
{
    constexpr double bytes_per_gb = 1024.0 * 1024.0 * 1024.0;
    const double bytes = static_cast<double>(n);
    return bytes / bytes_per_gb;
}
/*
Log the adapter's description, LUID and memory sizes, and - when `info` is
non-null - store the DXGI 'dedicated' and 'shared' memory totals (in bytes) in it.
*/
static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) {
    const auto dedicated_bytes = desc.DedicatedVideoMemory;
    const auto shared_bytes = desc.SharedSystemMemory;

    GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicated_bytes), b_to_gb(shared_bytes));

    if (info == nullptr) {
        return; // caller only wanted the log line
    }

    // Totals are kept as raw byte counts
    info->dedicatedTotal = dedicated_bytes;
    info->sharedTotal = shared_bytes;
}
/*
Enumerate over the GPU adapters detected using DXGI and return their information
*/
static std::vector<GpuInfo> get_dxgi_gpu_infos() {
std::vector<GpuInfo> infos;
IDXGIFactory1* pFactory = nullptr;
if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) {
UINT i = 0;
IDXGIAdapter1* pAdapter = nullptr;
while (pFactory->EnumAdapters1(i, &pAdapter) != DXGI_ERROR_NOT_FOUND) {
DXGI_ADAPTER_DESC1 desc;
pAdapter->GetDesc1(&desc);
// Get all the GPU adapter info
GpuInfo info;
fetch_dxgi_adapter_desc1(desc, &info);
info.description = std::wstring(desc.Description);
info.luid = desc.AdapterLuid;
info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid);
infos.push_back(info);
pAdapter->Release();
++i;
}
pFactory->Release();
}
return infos;
}
/*
Query PDH for the current 'Dedicated Usage' and 'Shared Usage' of the adapter
identified by gpu.pdhInstance and store them in `gpu`.

Returns true only when both counters were read. On any failure `gpu` is left
unchanged and false is returned, so the caller never mistakes "usage unknown"
for "usage is zero" (which would report the whole adapter as free).
*/
static bool get_gpu_memory_usage(GpuInfo& gpu) {
    PDH_HQUERY query = NULL;
    if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) {
        return false;
    }

    // The trailing '*' lets the path match the full instance name, of which
    // gpu.pdhInstance is only the LUID-derived prefix.
    const std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage";
    const std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage";

    PDH_HCOUNTER dedicatedCounter = NULL;
    PDH_HCOUNTER sharedCounter = NULL;
    if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &dedicatedCounter) != ERROR_SUCCESS ||
        dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &sharedCounter) != ERROR_SUCCESS) {
        GGML_LOG_ERROR("Failed to add PDH counters for GPU %ls\n", gpu.pdhInstance.c_str());
        dll_functions.PdhCloseQuery(query);
        return false;
    }

    // Sample the data
    if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) {
        dll_functions.PdhCloseQuery(query);
        return false;
    }

    // Read one counter as a double; reports success instead of silently leaving the output untouched.
    const auto read_counter = [](PDH_HCOUNTER counter, double& out) -> bool {
        PDH_FMT_COUNTERVALUE value = {};
        if (dll_functions.PdhGetFormattedCounterValue(counter, PDH_FMT_DOUBLE, NULL, &value) != ERROR_SUCCESS) {
            return false;
        }
        out = value.doubleValue;
        return true;
    };

    double dedicatedUsage = 0.0;
    double sharedUsage = 0.0;
    const bool dedicatedOk = read_counter(dedicatedCounter, dedicatedUsage);
    const bool sharedOk = read_counter(sharedCounter, sharedUsage);

    // The counters belong to the query, so it is closed only after both reads.
    dll_functions.PdhCloseQuery(query);

    if (!dedicatedOk || !sharedOk) {
        GGML_LOG_ERROR("Failed to read PDH counters for GPU %ls\n", gpu.pdhInstance.c_str());
        return false;
    }

    gpu.dedicatedUsage = dedicatedUsage;
    gpu.sharedUsage = sharedUsage;
    return true;
}
extern "C" {
int ggml_dxgi_pdh_init() {
GGML_LOG_DEBUG("%s called\n", __func__);
std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) {
// Already initialized as we have both DLL handles
return ERROR_SUCCESS;
}
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
fs::path libPath_dxgi = fs::path("\\Windows") / fs::path("System32") / fs::path("dxgi.dll");
fs::path libPath_pdh = fs::path("\\Windows") / fs::path("System32") / fs::path("pdh.dll");
// Call LoadLibraryW on both DLLs to ensure they are loaded
void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str());
void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str());
if(dxgi == NULL || pdh == NULL) {
if (dxgi != NULL) {
FreeLibrary((HMODULE)(dxgi));
}
if (pdh != NULL) {
FreeLibrary((HMODULE)(pdh));
}
SetErrorMode(old_mode);
return ERROR_DLL_NOT_FOUND;
}
else {
// save the dll handles
dll_functions.dxgi_dll_handle = dxgi;
dll_functions.pdh_dll_handle = pdh;
}
// Get pointers to the library functions loaded by the DLLs
dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1");
dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW");
dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW");
dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData");
dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue");
dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery");
SetErrorMode(old_mode); // set old mode before any return
// Check if any function pointers are NULL (not found)
if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) {
GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll", __func__);
FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
dll_functions.dxgi_dll_handle = NULL;
dll_functions.pdh_dll_handle = NULL;
return ERROR_PROC_NOT_FOUND;
}
// No other initializations needed, successfully loaded the libraries and functions!
return ERROR_SUCCESS;
}
void ggml_dxgi_pdh_release() {
std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
if (dll_functions.dxgi_dll_handle == NULL && dll_functions.pdh_dll_handle == NULL) {
// Already freed the DLLs
return;
}
// Call FreeLibrary on both DLLs
FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
dll_functions.dxgi_dll_handle = NULL;
dll_functions.pdh_dll_handle = NULL;
return; // successfully released
}
int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
// Enumerate GPUs using DXGI and find the matching LUID
// This also fetches the total memory info for each of the enumerated GPUs
std::vector<GpuInfo> gpus = get_dxgi_gpu_infos();
GpuInfo *targetGpu = nullptr;
for (auto& gpu : gpus) {
char luid_buffer[32]; // "0x" + 16 hex digits + null terminator
snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", gpu.luid.HighPart, gpu.luid.LowPart);
std::string gpu_luid_str(luid_buffer);
if (gpu_luid_str == std::string(luid)) {
targetGpu = &gpu;
break;
}
}
if (!targetGpu) {
GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid);
return ERROR_NOT_FOUND;
}
// Get the current memory usage for the target GPU
int status = get_gpu_memory_usage(*targetGpu);
if (!status) {
GGML_LOG_ERROR("Failed to get GPU memory usage.\n");
return ERROR_DEVICE_NOT_AVAILABLE;
}
// Calculate the free memory based on whether it's an integrated or discrete GPU
if (is_integrated_gpu) {
// IGPU free = SharedTotal - SharedUsage
GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
*free = (targetGpu->sharedTotal - targetGpu->sharedUsage) + (targetGpu->dedicatedTotal - targetGpu->dedicatedUsage); // Some IGPUs also have dedicated memory, which can be used along with the IGPU's shared memory
*total = targetGpu->sharedTotal + targetGpu->dedicatedTotal;
}
else {
// DGPU free = DedicatedTotal - DedicatedUsage
GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
*free = targetGpu->dedicatedTotal - targetGpu->dedicatedUsage;
*total = targetGpu->dedicatedTotal;
}
return ERROR_SUCCESS;
}
} // extern "C"
#else // #ifdef _WIN32
extern "C" {
// Stubs for every non-Windows build: DXGI and PDH are not available there,
// so initialization and queries always report failure.

// Always fails with -1.
int ggml_dxgi_pdh_init() {
    return -1;
}

// Nothing is ever loaded on this path, so there is nothing to release.
void ggml_dxgi_pdh_release() {
}

// Always fails with -1; `free` and `total` are never written.
int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
    (void) luid;
    (void) free;
    (void) total;
    (void) is_integrated_gpu;
    return -1;
}
} // extern "C"
#endif // #ifdef _WIN32