discovery: only retry AMD GPUs (#12894)

* discovery: only retry AMD GPUs

CUDA and Vulkan don't crash on unsupported devices, so retry isn't necessary.
This also refactors the code to shift the library-specific logic into the ml package.

* review comments
Author: Daniel Hiltgen
Date: 2025-11-04 15:33:46 -08:00
Committed by: GitHub
Parent: 220e133fca
Commit: 27f1fde413
9 changed files with 96 additions and 137 deletions
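
The commit message above notes that only ROCm devices need a second validation pass, since rocblas can crash on unsupported GPUs while CUDA and Vulkan fail gracefully. As a rough illustration of how a discovery loop could consume the new ml helpers shown in the diff below, here is a minimal, self-contained Go sketch. It is not code from this commit: DeviceInfo is trimmed to the fields the helpers touch, and runProbe plus the surrounding loop are hypothetical stand-ins for the real bootstrap/subprocess plumbing.

package main

import "fmt"

// DeviceInfo mirrors just the fields this sketch needs from ml.DeviceInfo.
type DeviceInfo struct {
    ID      string
    Library string
}

// NeedsInitValidation reports whether a device requires a deeper init pass;
// per this commit, only ROCm devices can crash at inference time.
func (d DeviceInfo) NeedsInitValidation() bool {
    return d.Library == "ROCm"
}

// AddInitValidation sets the environment that forces deep initialization so an
// unsupported GPU crashes during discovery instead of at inference time.
func (d DeviceInfo) AddInitValidation(env map[string]string) {
    env["GGML_CUDA_INIT"] = "1"
}

// runProbe is a hypothetical stand-in for relaunching the runner subprocess
// against one device and reporting whether it initialized cleanly.
func runProbe(d DeviceInfo, env map[string]string) bool {
    fmt.Printf("probing %s (%s) with env %v\n", d.ID, d.Library, env)
    return true // assume success for the sketch
}

func main() {
    devices := []DeviceInfo{
        {ID: "GPU-0", Library: "CUDA"},
        {ID: "GPU-1", Library: "ROCm"},
    }
    var supported []DeviceInfo
    for _, d := range devices {
        // CUDA and Vulkan don't crash on unsupported devices, so they are
        // accepted without a second pass.
        if !d.NeedsInitValidation() {
            supported = append(supported, d)
            continue
        }
        env := map[string]string{}
        d.AddInitValidation(env)
        if runProbe(d, env) {
            supported = append(supported, d)
        }
    }
    fmt.Println("supported devices:", supported)
}

The key point is that AddInitValidation is only applied to devices for which NeedsInitValidation returns true, so CUDA and Vulkan devices are accepted without the extra, crash-prone pass.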

View File

@@ -730,10 +730,6 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
             info.PCIID = C.GoString(props.device_id)
         }
         info.LibraryPath = ggml.LibPaths()
-        if props.numeric_id != nil {
-            info.FilteredID = C.GoString(props.numeric_id)
-        }
         C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total)
         info.TotalMemory = (uint64)(props.memory_total)
         info.FreeMemory = (uint64)(props.memory_free)

View File

@@ -175,8 +175,6 @@ extern "C" {
         int compute_minor;
         int integrated;
         const char *library;
-        // number with which the devices are accessed (Vulkan)
-        const char *numeric_id;
     };

     GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);

View File

@@ -12435,7 +12435,6 @@ struct ggml_backend_vk_device_context {
     std::string id;
     std::string uuid;
     std::string luid;
-    std::string numeric_id;
     int major;
     int minor;
     int driver_major;
@@ -12661,7 +12660,6 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->driver_minor = ctx->driver_minor;
     props->integrated = ctx->is_integrated_gpu;
     props->library = GGML_VK_NAME;
-    props->numeric_id = ctx->numeric_id.c_str();
 }

 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13142,7 +13140,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
         // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
         ctx->driver_major = 0;
         ctx->driver_minor = 0;
-        ctx->numeric_id = std::to_string(i);
     }
     initialized = true;
 }

View File

@@ -257,7 +257,7 @@ type DeviceInfo struct {
     // FilterID is populated with the unfiltered device ID if a numeric ID is used
     // so the device can be included.
-    FilteredID string `json:"filtered_id,omitempty"`
+    FilterID string `json:"filter_id,omitempty"`

     // Integrated is set true for integrated GPUs, false for Discrete GPUs
     Integrated bool `json:"integration,omitempty"`
@@ -455,6 +455,35 @@ func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
     return env
 }

+// NeedsInitValidation returns true if the device in question has the potential
+// to crash at inference time and requires deeper validation before we include
+// it in the supported devices list.
+func (d DeviceInfo) NeedsInitValidation() bool {
+    // At this time the only library we know needs a 2nd pass is ROCm since
+    // rocblas will crash on unsupported devices. We want to find those crashes
+    // during bootstrap discovery so we can eliminate those GPUs before the user
+    // tries to run inference on them
+    return d.Library == "ROCm"
+}
+
+// Set the init validation environment variable
+func (d DeviceInfo) AddInitValidation(env map[string]string) {
+    env["GGML_CUDA_INIT"] = "1" // force deep initialization to trigger crash on unsupported GPUs
+}
+
+// PreferredLibrary returns true if this library is preferred over the other input
+// library
+// Used to filter out Vulkan in favor of CUDA or ROCm
+func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
+    // TODO in the future if we find Vulkan is better than ROCm on some devices
+    // that implementation can live here.
+    if d.Library == "CUDA" || d.Library == "ROCm" {
+        return true
+    }
+    return false
+}
+
 func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
     var envVar string
     switch d.Library {
@@ -472,8 +501,8 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
     if existing {
         v = v + ","
     }
-    if d.FilteredID != "" {
-        v = v + d.FilteredID
+    if d.FilterID != "" {
+        v = v + d.FilterID
     } else {
         v = v + d.ID
     }
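
For the PreferredLibrary helper added above, here is a hedged sketch of one possible caller: dropping the Vulkan entry when the same physical GPU is also reported by ROCm or CUDA. The dedup loop and the choice to key on PCIID are assumptions for illustration only; the helper body mirrors the diff.

package main

import "fmt"

// DeviceInfo mirrors just the fields this sketch needs from ml.DeviceInfo.
type DeviceInfo struct {
    ID      string
    PCIID   string
    Library string
}

// PreferredLibrary reports whether this entry's library wins over the other
// entry's library, matching the logic added in the diff above.
func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
    if d.Library == "CUDA" || d.Library == "ROCm" {
        return true
    }
    return false
}

func main() {
    discovered := []DeviceInfo{
        {ID: "0", PCIID: "0000:03:00.0", Library: "ROCm"},
        {ID: "1", PCIID: "0000:03:00.0", Library: "Vulkan"},
    }
    best := map[string]DeviceInfo{} // keep one entry per physical GPU, keyed by PCIID
    for _, d := range discovered {
        existing, ok := best[d.PCIID]
        if !ok || d.PreferredLibrary(existing) {
            best[d.PCIID] = d
        }
    }
    for _, d := range best {
        fmt.Printf("keeping %s via %s\n", d.PCIID, d.Library)
    }
}

With the logic from the diff, a CUDA or ROCm entry always claims the slot and a Vulkan entry never displaces it, regardless of the order in which the backends report the device.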