discovery: only retry AMD GPUs (#12894)

* discovery: only retry AMD GPUs

CUDA and Vulkan don't crash on unsupported devices, so retry isn't necessary.
This also refactors the code to shift the library-specific logic into the ml package.

* review comments
Author: Daniel Hiltgen
Date: 2025-11-04 15:33:46 -08:00
Committed by: GitHub
Parent: 220e133fca
Commit: 27f1fde413
9 changed files with 96 additions and 137 deletions
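
The commit message above notes that only ROCm devices need a second validation pass, since rocblas can crash on unsupported GPUs while CUDA and Vulkan fail gracefully. As a rough illustration of how a discovery loop could consume the new ml helpers shown in the diff below, here is a minimal, self-contained Go sketch. It is not code from this commit: DeviceInfo is trimmed to the fields the helpers touch, and runProbe plus the surrounding loop are hypothetical stand-ins for the real bootstrap/subprocess plumbing.

package main

import "fmt"

// DeviceInfo mirrors just the fields this sketch needs from ml.DeviceInfo.
type DeviceInfo struct {
    ID      string
    Library string
}

// NeedsInitValidation reports whether a device requires a deeper init pass;
// per this commit, only ROCm devices can crash at inference time.
func (d DeviceInfo) NeedsInitValidation() bool {
    return d.Library == "ROCm"
}

// AddInitValidation sets the environment that forces deep initialization so an
// unsupported GPU crashes during discovery instead of at inference time.
func (d DeviceInfo) AddInitValidation(env map[string]string) {
    env["GGML_CUDA_INIT"] = "1"
}

// runProbe is a hypothetical stand-in for relaunching the runner subprocess
// against one device and reporting whether it initialized cleanly.
func runProbe(d DeviceInfo, env map[string]string) bool {
    fmt.Printf("probing %s (%s) with env %v\n", d.ID, d.Library, env)
    return true // assume success for the sketch
}

func main() {
    devices := []DeviceInfo{
        {ID: "GPU-0", Library: "CUDA"},
        {ID: "GPU-1", Library: "ROCm"},
    }
    var supported []DeviceInfo
    for _, d := range devices {
        // CUDA and Vulkan don't crash on unsupported devices, so they are
        // accepted without a second pass.
        if !d.NeedsInitValidation() {
            supported = append(supported, d)
            continue
        }
        env := map[string]string{}
        d.AddInitValidation(env)
        if runProbe(d, env) {
            supported = append(supported, d)
        }
    }
    fmt.Println("supported devices:", supported)
}

The key point is that AddInitValidation is only applied to devices for which NeedsInitValidation returns true, so CUDA and Vulkan devices are accepted without the extra, crash-prone pass.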

View File

@@ -730,10 +730,6 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
             info.PCIID = C.GoString(props.device_id)
         }
         info.LibraryPath = ggml.LibPaths()
-        if props.numeric_id != nil {
-            info.FilteredID = C.GoString(props.numeric_id)
-        }
         C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total)
         info.TotalMemory = (uint64)(props.memory_total)
         info.FreeMemory = (uint64)(props.memory_free)

View File

@@ -175,8 +175,6 @@ extern "C" {
         int compute_minor;
         int integrated;
         const char *library;
-        // number with which the devices are accessed (Vulkan)
-        const char *numeric_id;
     };

     GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);

View File

@@ -12435,7 +12435,6 @@ struct ggml_backend_vk_device_context {
     std::string id;
     std::string uuid;
     std::string luid;
-    std::string numeric_id;
     int major;
     int minor;
     int driver_major;
@@ -12661,7 +12660,6 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->driver_minor = ctx->driver_minor;
     props->integrated = ctx->is_integrated_gpu;
     props->library = GGML_VK_NAME;
-    props->numeric_id = ctx->numeric_id.c_str();
 }

 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13142,7 +13140,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
         // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
         ctx->driver_major = 0;
         ctx->driver_minor = 0;
-        ctx->numeric_id = std::to_string(i);
     }
     initialized = true;
 }

View File

@@ -257,7 +257,7 @@ type DeviceInfo struct {
     // FilterID is populated with the unfiltered device ID if a numeric ID is used
     // so the device can be included.
-    FilteredID string `json:"filtered_id,omitempty"`
+    FilterID string `json:"filter_id,omitempty"`

     // Integrated is set true for integrated GPUs, false for Discrete GPUs
     Integrated bool `json:"integration,omitempty"`
@@ -455,6 +455,35 @@ func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
     return env
 }

+// NeedsInitValidation returns true if the device in question has the potential
+// to crash at inference time and requires deeper validation before we include
+// it in the supported devices list.
+func (d DeviceInfo) NeedsInitValidation() bool {
+    // At this time the only library we know needs a 2nd pass is ROCm since
+    // rocblas will crash on unsupported devices. We want to find those crashes
+    // during bootstrap discovery so we can eliminate those GPUs before the user
+    // tries to run inference on them
+    return d.Library == "ROCm"
+}
+
+// Set the init validation environment variable
+func (d DeviceInfo) AddInitValidation(env map[string]string) {
+    env["GGML_CUDA_INIT"] = "1" // force deep initialization to trigger crash on unsupported GPUs
+}
+
+// PreferredLibrary returns true if this library is preferred over the other input
+// library
+// Used to filter out Vulkan in favor of CUDA or ROCm
+func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
+    // TODO in the future if we find Vulkan is better than ROCm on some devices
+    // that implementation can live here.
+    if d.Library == "CUDA" || d.Library == "ROCm" {
+        return true
+    }
+    return false
+}
+
 func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
     var envVar string
     switch d.Library {
@@ -472,8 +501,8 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
     if existing {
         v = v + ","
     }
-    if d.FilteredID != "" {
-        v = v + d.FilteredID
+    if d.FilterID != "" {
+        v = v + d.FilterID
     } else {
         v = v + d.ID
     }
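
For the PreferredLibrary helper added above, here is a hedged sketch of one possible caller: dropping the Vulkan entry when the same physical GPU is also reported by ROCm or CUDA. The dedup loop and the choice to key on PCIID are assumptions for illustration only; the helper body mirrors the diff.

package main

import "fmt"

// DeviceInfo mirrors just the fields this sketch needs from ml.DeviceInfo.
type DeviceInfo struct {
    ID      string
    PCIID   string
    Library string
}

// PreferredLibrary reports whether this entry's library wins over the other
// entry's library, matching the logic added in the diff above.
func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
    if d.Library == "CUDA" || d.Library == "ROCm" {
        return true
    }
    return false
}

func main() {
    discovered := []DeviceInfo{
        {ID: "0", PCIID: "0000:03:00.0", Library: "ROCm"},
        {ID: "1", PCIID: "0000:03:00.0", Library: "Vulkan"},
    }
    best := map[string]DeviceInfo{} // keep one entry per physical GPU, keyed by PCIID
    for _, d := range discovered {
        existing, ok := best[d.PCIID]
        if !ok || d.PreferredLibrary(existing) {
            best[d.PCIID] = d
        }
    }
    for _, d := range best {
        fmt.Printf("keeping %s via %s\n", d.PCIID, d.Library)
    }
}

With the logic from the diff, a CUDA or ROCm entry always claims the slot and a Vulkan entry never displaces it, regardless of the order in which the backends report the device.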