bring back sysfs based VRAM information for AMD (#12871)

* build: optimize dockerfile context for iterating

This moves the copy of the source into the layer AFTER
doing software installs so we don't have to go through
the RPM install for cuda, etc. every time you touch a
source file.

* amd: implement linux sysfs based VRAM lookup

This adds a C++ implementation of sysfs DRM VRAM discovery
for more accurate free VRAM data on linux for AMD GPUs.
This commit is contained in:
Daniel Hiltgen
2025-11-17 15:40:58 -08:00
committed by GitHub
parent 399eacf486
commit 2f36d769aa
5 changed files with 186 additions and 22 deletions

View File

@@ -3513,7 +3513,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing AMD specific memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
ggml_hip_mgmt_release();
return;
}

View File

@@ -13212,7 +13212,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
GGML_LOG_DEBUG("%s device %s utilizing AMD specific memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
ggml_hip_mgmt_release();
return;
}

View File

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "ggml-impl.h"
#ifdef _WIN32
// AMD Device Library eXtra (ADLX)
@@ -16,7 +17,6 @@
// Unused function parameters are commented out to avoid unnecessary type
// definitions.
#include "ggml-impl.h"
#include <filesystem>
#include <mutex>
@@ -436,15 +436,92 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
#else // #ifdef _WIN32
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <filesystem>
#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <glob.h>
namespace fs = std::filesystem;
extern "C" {
// Linux implementation of VRAM reporting, based on the DRM sysfs nodes
// Prepares the AMD memory-reporting backend for use.
//
// This non-Windows branch performs no setup work, so the call always reports
// success. Returns 0 on success; callers treat any other value as "not
// available" and skip ggml_hip_get_device_memory().
int ggml_hip_mgmt_init() {
    return 0;
}
// Counterpart to ggml_hip_mgmt_init(). The body is intentionally empty: this
// function performs no work in the non-Windows branch.
void ggml_hip_mgmt_release() {
}
// Looks up total and free VRAM for a GPU through the Linux DRM sysfs nodes.
//
// Every /sys/class/drm/card*/device/uevent file is scanned for a
// PCI_SLOT_NAME= entry equal to `id`. For the matching device, the
// mem_info_vram_total and mem_info_vram_used nodes in the same directory are
// read and the result is reported through the output parameters.
//
//   id    - PCI slot name compared against the uevent PCI_SLOT_NAME value
//   free  - out: mem_info_vram_total minus mem_info_vram_used (clamped at 0)
//   total - out: value read from mem_info_vram_total
//
// Values are passed through in whatever unit sysfs reports (presumably bytes
// -- confirm against the amdgpu driver documentation).
//
// Returns 0 on success. Returns 1 when an argument is null, the glob fails,
// no device matches, or a memory node cannot be read; in that case `free` and
// `total` are left unmodified.
int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
    if (id == nullptr || free == nullptr || total == nullptr) {
        return 1;
    }
    GGML_LOG_INFO("%s searching for device %s\n", __func__, id);

    const std::string drmDeviceGlob = "/sys/class/drm/card*/device/uevent";
    const std::string drmTotalMemoryFile = "mem_info_vram_total";
    const std::string drmUsedMemoryFile = "mem_info_vram_used";
    const std::string drmUeventPCISlotLabel = "PCI_SLOT_NAME=";
    const std::string wantedSlot(id);

    // Owns the glob result so that every exit path releases it exactly once.
    struct GlobMatches {
        glob_t g{};
        GlobMatches() = default;
        GlobMatches(const GlobMatches &) = delete;
        GlobMatches & operator=(const GlobMatches &) = delete;
        ~GlobMatches() { globfree(&g); }
    };
    GlobMatches matches;
    if (glob(drmDeviceGlob.c_str(), GLOB_NOSORT, nullptr, &matches.g) != 0) {
        // Covers GLOB_NOMATCH as well as read/allocation errors.
        GGML_LOG_DEBUG("%s unable to find matching device\n", __func__);
        return 1;
    }

    // __func__ inside the lambda would name the closure's call operator, so
    // capture the enclosing function's name for the log messages.
    const char * funcName = __func__;
    const auto readValue = [funcName](const std::string & path, uint64_t & value) -> bool {
        std::ifstream stream(path);
        // The extraction is checked too: an empty or malformed node would
        // otherwise leave `value` indeterminate.
        if (!stream.is_open() || !(stream >> value)) {
            GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", funcName, path.c_str());
            return false;
        }
        return true;
    };

    for (size_t i = 0; i < matches.g.gl_pathc; ++i) {
        const char * ueventPath = matches.g.gl_pathv[i];
        std::ifstream uevent(ueventPath);
        if (!uevent.is_open()) {
            // One unreadable node must not hide the remaining cards.
            GGML_LOG_DEBUG("%s Failed to read sysfs node %s\n", __func__, ueventPath);
            continue;
        }

        std::string line;
        while (std::getline(uevent, line)) {
            // Only lines that start with the PCI_SLOT_NAME label are of interest.
            if (line.compare(0, drmUeventPCISlotLabel.size(), drmUeventPCISlotLabel) != 0) {
                continue;
            }
            std::istringstream iss(line.substr(drmUeventPCISlotLabel.size()));
            std::string pciSlot;
            iss >> pciSlot;
            if (pciSlot != wantedSlot) {
                continue;
            }

            const std::string dir = std::filesystem::path(ueventPath).parent_path().string();
            uint64_t memoryTotal = 0;
            uint64_t memoryUsed = 0;
            if (!readValue(dir + "/" + drmTotalMemoryFile, memoryTotal) ||
                !readValue(dir + "/" + drmUsedMemoryFile, memoryUsed)) {
                return 1;
            }

            // Outputs are written only once both reads succeeded. The
            // subtraction is clamped so an inconsistent snapshot cannot wrap
            // around to a huge "free" value.
            *total = static_cast<size_t>(memoryTotal);
            *free = static_cast<size_t>(memoryUsed > memoryTotal ? 0 : memoryTotal - memoryUsed);
            return 0;
        }
    }

    GGML_LOG_DEBUG("%s unable to find matching device\n", __func__);
    return 1;
}
} // extern "C"