discovery: only retry AMD GPUs (#12894)

* discovery: only retry AMD GPUs

CUDA and Vulkan don't crash on unsupported devices, so retry isn't necessary.
This also refactors the code to shift the library-specific logic into the ml
package.

* review comments
Daniel Hiltgen authored on 2025-11-04 15:33:46 -08:00, committed by GitHub
parent 220e133fca
commit 27f1fde413
9 changed files with 96 additions and 137 deletions
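
The inline per-library selection of HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, and GGML_VK_VISIBLE_DEVICES that this commit deletes from the discovery package is what the new ml.GetVisibleDevicesEnv call consolidates. The following is a minimal standalone sketch of that selection logic based on the removed code; deviceInfo and visibleDevicesEnv are illustrative stand-ins, not the actual ml API.

```go
package main

import (
	"fmt"
	"runtime"
)

// deviceInfo is a minimal stand-in for ml.DeviceInfo; only the fields used
// for environment-based filtering are included here.
type deviceInfo struct {
	ID       string // numeric or UUID-style device ID
	FilterID string // pre-renumbering ID, for backends that filter by enumeration order
	Library  string // "CUDA", "ROCm", or "Vulkan"
}

// visibleDevicesEnv mirrors the per-library selection this commit removes
// from the discovery package: each backend has its own *_VISIBLE_DEVICES
// variable, and ROCm's differs between Linux and other platforms.
func visibleDevicesEnv(devices []deviceInfo) map[string]string {
	env := map[string]string{}
	for _, d := range devices {
		switch d.Library {
		case "ROCm":
			if runtime.GOOS == "linux" {
				env["ROCR_VISIBLE_DEVICES"] = d.ID
			} else {
				env["HIP_VISIBLE_DEVICES"] = d.ID
			}
		case "CUDA":
			env["CUDA_VISIBLE_DEVICES"] = d.ID
		case "Vulkan":
			// The removed code filtered Vulkan by the pre-renumbering ID
			id := d.ID
			if d.FilterID != "" {
				id = d.FilterID
			}
			env["GGML_VK_VISIBLE_DEVICES"] = id
		}
	}
	return env
}

func main() {
	// Filtering down to a single device, as the second discovery pass does.
	dev := deviceInfo{ID: "0", FilterID: "1", Library: "Vulkan"}
	fmt.Println(visibleDevicesEnv([]deviceInfo{dev})) // map[GGML_VK_VISIBLE_DEVICES:1]
}
```

With a single-element slice, as in the verification pass below, this yields exactly one filter variable for the device under test.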

View File

@@ -27,7 +27,6 @@ var (
deviceMu sync.Mutex
devices []ml.DeviceInfo
libDirs map[string]struct{}
rocmDir string
exe string
bootstrapped bool
)
@@ -61,14 +60,6 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
libDirs[filepath.Dir(file)] = struct{}{}
}
// Our current packaging model places ggml-hip in the main directory
// but keeps rocm in an isolated directory. We have to add it to
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
rocmDir = filepath.Join(ml.LibOllamaPath, "rocm")
if _, err := os.Stat(rocmDir); err != nil {
rocmDir = ""
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
@@ -82,9 +73,20 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
// are enumerated, but not actually supported.
// We run this in serial to avoid potentially initializing a GPU multiple
// times concurrently leading to memory contention
// TODO refactor so we group the lib dirs and do serial per version, but parallel for different libs
for dir := range libDirs {
// Typically bootstrapping takes < 1s, but on some systems, with devices
// in low power/idle mode, initialization can take multiple seconds. We
// set a longer timeout just for bootstrap discovery to reduce the chance
// of giving up too quickly
bootstrapTimeout := 30 * time.Second
if runtime.GOOS == "windows" {
// On Windows with Defender enabled, AV scanning of the DLLs
// takes place sequentially and this can significantly increase
// the time it takes to do the initial discovery pass.
// Subsequent loads will be faster as the scan results are
// cached
bootstrapTimeout = 90 * time.Second
}
var dirs []string
if dir != "" {
if requested != "" && filepath.Base(dir) != requested {
@@ -93,21 +95,11 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
continue
}
}
if dir == "" {
dirs = []string{ml.LibOllamaPath}
} else {
dirs = []string{ml.LibOllamaPath, dir}
} else {
dirs = []string{ml.LibOllamaPath}
}
// ROCm can take a long time on some systems, so give it more time before giving up
if dir != "" && strings.Contains(filepath.Base(dir), "rocm") {
bootstrapTimeout = 60 * time.Second
}
// Typically bootstrapping takes < 1s, but on some systems, with devices
// in low power/idle mode, initialization can take multiple seconds. We
// set a long timeout just for bootstrap discovery to reduce the chance
// of giving up too quickly
ctx1stPass, cancel := context.WithTimeout(ctx, bootstrapTimeout)
defer cancel()
@@ -117,6 +109,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
// In the second pass, we more deeply initialize the GPUs to weed out devices that
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
// Only devices that need verification are included in this pass
slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices))
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
@@ -125,35 +118,16 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
supportedMu := sync.Mutex{}
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
for i := range devices {
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
if devices[i].Library == "Metal" {
if !devices[i].NeedsInitValidation() {
continue
}
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
slog.Debug("verifying device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID)
wg.Add(1)
go func(i int) {
defer wg.Done()
var envVar string
id := devices[i].ID
if devices[i].Library == "ROCm" {
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
} else {
envVar = "ROCR_VISIBLE_DEVICES"
}
} else if devices[i].Library == "CUDA" {
envVar = "CUDA_VISIBLE_DEVICES"
} else if devices[i].Library == "Vulkan" {
id = devices[i].FilteredID
envVar = "GGML_VK_VISIBLE_DEVICES"
} else {
slog.Error("Unknown Library:" + devices[i].Library)
}
extraEnvs := map[string]string{
"GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
envVar: id, // Filter to just this one GPU
}
extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1])
devices[i].AddInitValidation(extraEnvs)
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize",
"id", devices[i].ID,
@@ -178,26 +152,28 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
wg.Wait()
logutil.Trace("supported GPU library combinations before filtering", "supported", supported)
filterOutVulkanThatAreSupportedByOtherGPU(needsDelete)
// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
filterOverlapByLibrary(supported, needsDelete)
// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
rocmID := 0
// Libraries that use numeric IDs need those IDs adjusted to account for any devices filtered out above
postFilteredID := map[string]int{}
for i := 0; i < len(needsDelete); i++ {
if needsDelete[i] {
logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
devices = append(devices[:i], devices[i+1:]...)
needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
i--
} else if devices[i].Library == "ROCm" {
} else {
if _, ok := postFilteredID[devices[i].Library]; !ok {
postFilteredID[devices[i].Library] = 0
}
if _, err := strconv.Atoi(devices[i].ID); err == nil {
// Replace the numeric ID with the post-filtered IDs
devices[i].FilteredID = devices[i].ID
devices[i].ID = strconv.Itoa(rocmID)
slog.Debug("adjusting filtering IDs", "FilterID", devices[i].ID, "new_ID", strconv.Itoa(postFilteredID[devices[i].Library]))
devices[i].FilterID = devices[i].ID
devices[i].ID = strconv.Itoa(postFilteredID[devices[i].Library])
}
rocmID++
postFilteredID[devices[i].Library]++
}
}
@@ -214,7 +190,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
case ml.DuplicateDevice:
// Different library, choose based on priority
var droppedDevice ml.DeviceInfo
if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
if devices[i].PreferredLibrary(devices[j]) {
droppedDevice = devices[j]
} else {
droppedDevice = devices[i]
@@ -363,38 +339,6 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
return devices
}
func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) {
// Filter out Vulkan devices that share a PCI ID with a non-Vulkan device that is not marked for deletion
for i := range devices {
if devices[i].Library != "Vulkan" || needsDelete[i] {
continue
}
if devices[i].PCIID == "" {
continue
}
for j := range devices {
if i == j {
continue
}
if devices[j].PCIID == "" {
continue
}
if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] {
needsDelete[i] = true
slog.Debug("filtering device with duplicate PCI ID",
"id", devices[i].ID,
"library", devices[i].Library,
"libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1],
"pci_id", devices[i].PCIID,
"kept_id", devices[j].ID,
"kept_library", devices[j].Library,
)
break
}
}
}
}
func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
// For multi-GPU systems, use the newest version that supports all the GPUs
for _, byLibDirs := range supported {

View File

@@ -41,7 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) {
}
slog.Info("inference compute",
"id", dev.ID,
"filtered_id", dev.FilteredID,
"filter_id", dev.FilterID,
"library", dev.Library,
"compute", dev.Compute(),
"name", dev.Name,