DRY out the runner lifecycle code (#12540)

* DRY out the runner lifecycle code

  Now that discovery uses the runners as well, this unifies the runner spawning code into a single place. This also unifies GPU discovery types with the newer ml.DeviceInfo.

* win: make incremental builds better

  Place build artifacts in discrete directories so incremental builds don't have to start fresh.

* Adjust sort order to consider iGPUs
* handle cpu inference oom scenarios
* review comments
2025-12-21 14:26:30 +00:00 · 2025-10-23 11:20:02 -07:00
parent 1c093e97af
commit 3258a89b6e
16 changed files with 720 additions and 924 deletions
--- a/discover/cpu_linux_test.go
+++ b/discover/cpu_linux_test.go
@@ -2065,12 +2065,6 @@ power management:
 			cpus := linuxCPUDetails(buf)

 			slog.Info("example", "scenario", k, "cpus", cpus)
-			si := SystemInfo{
-				System: CPUInfo{
-					CPUs: cpus,
-				},
-			}
-			threadCount := si.GetOptimalThreadCount()
 			if len(v.expCPUs) != len(cpus) {
 				t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
 			}
@@ -2085,10 +2079,6 @@ power management:
 					t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
 				}
 			}
-
-			if threadCount != v.expThreadCount {
-				t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
-			}
 		})
 	}
 }
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -1,16 +1,13 @@
 package discover

 import (
-	"context"
 	"log/slog"
 	"os"
-	"path/filepath"
 	"regexp"
 	"runtime"
 	"strconv"
 	"strings"

-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/ml"
 )

@@ -18,159 +15,28 @@ import (
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

-func GetCPUInfo() GpuInfo {
-	mem, err := GetCPUMem()
+// GetSystemInfo reports system memory from GetCPUMem and a thread count of non-efficiency CPU cores, falling back to runtime.NumCPU when that count is zero
+func GetSystemInfo() ml.SystemInfo {
+	memInfo, err := GetCPUMem()
 	if err != nil {
 		slog.Warn("error looking up system memory", "error", err)
 	}
-
-	return GpuInfo{
-		memInfo: mem,
-		DeviceID: ml.DeviceID{
-			Library: "cpu",
-			ID:      "0",
-		},
-	}
-}
-
-func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList {
-	devs := GPUDevices(ctx, runners)
-	return devInfoToInfoList(devs)
-}
-
-func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
-	resp := []GpuInfo{}
-	// Our current packaging model places ggml-hip in the main directory
-	// but keeps rocm in an isolated directory.  We have to add it to
-	// the [LD_LIBRARY_]PATH so ggml-hip will load properly
-	rocmDir := filepath.Join(LibOllamaPath, "rocm")
-	if _, err := os.Stat(rocmDir); err != nil {
-		rocmDir = ""
+	var threadCount int
+	cpus := GetCPUDetails()
+	for _, c := range cpus {
+		threadCount += c.CoreCount - c.EfficiencyCoreCount
 	}

-	for _, dev := range devs {
-		info := GpuInfo{
-			DeviceID: dev.DeviceID,
-			filterID: dev.FilteredID,
-			Name:     dev.Description,
-			memInfo: memInfo{
-				TotalMemory: dev.TotalMemory,
-				FreeMemory:  dev.FreeMemory,
-			},
-			// TODO can we avoid variant
-			DependencyPath: dev.LibraryPath,
-			DriverMajor:    dev.DriverMajor,
-			DriverMinor:    dev.DriverMinor,
-			ComputeMajor:   dev.ComputeMajor,
-			ComputeMinor:   dev.ComputeMinor,
-		}
-		if dev.Library == "CUDA" || dev.Library == "ROCm" {
-			info.MinimumMemory = 457 * format.MebiByte
-		}
-		if dev.Library == "ROCm" && rocmDir != "" {
-			info.DependencyPath = append(info.DependencyPath, rocmDir)
-		}
-		// TODO any special processing of Vulkan devices?
-		resp = append(resp, info)
-	}
-	if len(resp) == 0 {
-		mem, err := GetCPUMem()
-		if err != nil {
-			slog.Warn("error looking up system memory", "error", err)
-		}
-
-		resp = append(resp, GpuInfo{
-			memInfo: mem,
-			DeviceID: ml.DeviceID{
-				Library: "cpu",
-				ID:      "0",
-			},
-		})
-	}
-	return resp
-}
-
-// Given the list of GPUs this instantiation is targeted for,
-// figure out the visible devices environment variable
-//
-// If different libraries are detected, the first one is what we use
-func (l GpuInfoList) GetVisibleDevicesEnv() []string {
-	if len(l) == 0 {
-		return nil
-	}
-	res := []string{}
-	envVar := rocmGetVisibleDevicesEnv(l)
-	if envVar != "" {
-		res = append(res, envVar)
-	}
-	envVar = vkGetVisibleDevicesEnv(l)
-	if envVar != "" {
-		res = append(res, envVar)
-	}
-	return res
-}
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "ROCm" {
-			continue
-		}
-		// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
-		if info.filterID != "" {
-			ids = append(ids, info.filterID)
-		} else {
-			ids = append(ids, info.ID)
-		}
-	}
-	if len(ids) == 0 {
-		return ""
-	}
-	envVar := "ROCR_VISIBLE_DEVICES="
-	if runtime.GOOS != "linux" {
-		envVar = "HIP_VISIBLE_DEVICES="
-	}
-	// There are 3 potential env vars to use to select GPUs.
-	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
-	// HIP_VISIBLE_DEVICES supports numeric IDs only
-	// GPU_DEVICE_ORDINAL supports numeric IDs only
-	return envVar + strings.Join(ids, ",")
-}
-
-func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "Vulkan" {
-			continue
-		}
-		if info.filterID != "" {
-			ids = append(ids, info.filterID)
-		} else {
-			ids = append(ids, info.ID)
-		}
-	}
-	if len(ids) == 0 {
-		return ""
-	}
-	envVar := "GGML_VK_VISIBLE_DEVICES="
-	return envVar + strings.Join(ids, ",")
-}
-
-// GetSystemInfo returns the last cached state of the GPUs on the system
-func GetSystemInfo() SystemInfo {
-	deviceMu.Lock()
-	defer deviceMu.Unlock()
-	gpus := devInfoToInfoList(devices)
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		gpus = []GpuInfo{}
+	if threadCount == 0 {
+		// Fall back to Go's num CPU
+		threadCount = runtime.NumCPU()
 	}

-	return SystemInfo{
-		System: CPUInfo{
-			CPUs:    GetCPUDetails(),
-			GpuInfo: GetCPUInfo(),
-		},
-		GPUs: gpus,
+	return ml.SystemInfo{
+		ThreadCount: threadCount,
+		TotalMemory: memInfo.TotalMemory,
+		FreeMemory:  memInfo.FreeMemory,
+		FreeSwap:    memInfo.FreeSwap,
 	}
 }

--- a/discover/runner.go
+++ b/discover/runner.go
@@ -4,13 +4,8 @@ package discover

 import (
 	"context"
-	"encoding/json"
-	"fmt"
 	"io"
 	"log/slog"
-	"math/rand"
-	"net"
-	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -23,6 +18,7 @@ import (

 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 )
@@ -36,7 +32,7 @@ var (
 	bootstrapped bool
 )

-func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
+func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
 	deviceMu.Lock()
 	defer deviceMu.Unlock()
 	startDiscovery := time.Now()
@@ -154,9 +150,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 					slog.Error("Unknown Library:" + devices[i].Library)
 				}

-				extraEnvs := []string{
-					"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
-					envVar + "=" + id,  // Filter to just this one GPU
+				extraEnvs := map[string]string{
+					"GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
+					envVar:           id,  // Filter to just this one GPU
 				}
 				if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
 					needsDelete[i] = true
@@ -449,100 +445,35 @@ func (r *bootstrapRunner) HasExited() bool {
 	return false
 }

-func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
-	// TODO DRY out with llm/server.go
-	slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
+	var out io.Writer
+	if envconfig.LogLevel() == logutil.LevelTrace {
+		out = os.Stderr
+	}
 	start := time.Now()
 	defer func() {
 		slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
 	}()
-	port := 0
-	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-		var l *net.TCPListener
-		if l, err = net.ListenTCP("tcp", a); err == nil {
-			port = l.Addr().(*net.TCPAddr).Port
-			l.Close()
-		}
-	}
-	if port == 0 {
-		slog.Debug("ResolveTCPAddr failed, using random port")
-		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-	}
-	params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
-	var pathEnv string
-	switch runtime.GOOS {
-	case "windows":
-		pathEnv = "PATH"
-	case "darwin":
-		pathEnv = "DYLD_LIBRARY_PATH"
-	default:
-		pathEnv = "LD_LIBRARY_PATH"
-	}
-	libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
-	if rocmDir != "" {
-		libraryPaths = append(libraryPaths, rocmDir)
-	}
-	// Note: we always put our dependency paths first
-	// since these are the exact version we compiled/linked against
-	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
-	}

-	cmd := exec.Command(exe, params...)
-	cmd.Env = os.Environ()
-	if envconfig.LogLevel() == logutil.LevelTrace {
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-	}
-
-	// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
-	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-	pathNeeded := true
-	ollamaPathNeeded := true
-	extraDone := make([]bool, len(extraEnvs))
-	for i := range cmd.Env {
-		cmp := strings.SplitN(cmd.Env[i], "=", 2)
-		if strings.EqualFold(cmp[0], pathEnv) {
-			cmd.Env[i] = pathEnv + "=" + pathEnvVal
-			pathNeeded = false
-		} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
-			cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator))
-			ollamaPathNeeded = false
-		} else {
-			for j := range extraEnvs {
-				if extraDone[j] {
-					continue
-				}
-				extra := strings.SplitN(extraEnvs[j], "=", 2)
-				if cmp[0] == extra[0] {
-					cmd.Env[i] = extraEnvs[j]
-					extraDone[j] = true
-				}
-			}
-		}
-	}
-	if pathNeeded {
-		cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
-	}
-	if ollamaPathNeeded {
-		cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
-	}
-	for i := range extraDone {
-		if !extraDone[i] {
-			cmd.Env = append(cmd.Env, extraEnvs[i])
-		}
-	}
-	logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
-	if err := cmd.Start(); err != nil {
-		slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
+	logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
+	cmd, port, err := llm.StartRunner(
+		true, // ollama engine
+		"",   // no model
+		ollamaLibDirs,
+		out,
+		extraEnvs,
+	)
+	if err != nil {
+		slog.Debug("failed to start runner to discovery GPUs", "error", err)
 		return nil
 	}
+
 	go func() {
 		cmd.Wait() // exit status ignored
 	}()

 	defer cmd.Process.Kill()
-	devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
+	devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
 	if err != nil {
 		if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
 			// Expected during bootstrapping while we filter out unsupported AMD GPUs
@@ -555,52 +486,3 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []s

 	return devices
 }
-
-func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
-	var moreDevices []ml.DeviceInfo
-	port := runner.GetPort()
-	tick := time.Tick(10 * time.Millisecond)
-	for {
-		select {
-		case <-ctx.Done():
-			return nil, fmt.Errorf("failed to finish discovery before timeout")
-		case <-tick:
-			r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
-			if err != nil {
-				return nil, fmt.Errorf("failed to create request: %w", err)
-			}
-			r.Header.Set("Content-Type", "application/json")
-
-			resp, err := http.DefaultClient.Do(r)
-			if err != nil {
-				// slog.Warn("failed to send request", "error", err)
-				if runner.HasExited() {
-					return nil, fmt.Errorf("runner crashed")
-				}
-				continue
-			}
-			defer resp.Body.Close()
-
-			if resp.StatusCode == http.StatusNotFound {
-				// old runner, fall back to bootstrapping model
-				return nil, fmt.Errorf("llamarunner free vram reporting not supported")
-			}
-
-			body, err := io.ReadAll(resp.Body)
-			if err != nil {
-				slog.Warn("failed to read response", "error", err)
-				continue
-			}
-			if resp.StatusCode != 200 {
-				logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
-				return nil, fmt.Errorf("runner error: %s", string(body))
-			}
-
-			if err := json.Unmarshal(body, &moreDevices); err != nil {
-				slog.Warn("unmarshal encode response", "error", err)
-				continue
-			}
-			return moreDevices, nil
-		}
-	}
-}
--- a/discover/types.go
+++ b/discover/types.go
@@ -1,10 +1,8 @@
 package discover

 import (
-	"context"
 	"log/slog"
 	"path/filepath"
-	"runtime"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -17,50 +15,6 @@ type memInfo struct {
 	FreeSwap    uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
 }

-// Beginning of an `ollama info` command
-type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
-	ml.DeviceID
-	memInfo
-
-	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant"`
-
-	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory uint64 `json:"-"`
-
-	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath []string `json:"lib_path,omitempty"`
-
-	// Set to true if we can NOT reliably discover FreeMemory.  A value of true indicates
-	// the FreeMemory is best effort, and may over or under report actual memory usage
-	// False indicates FreeMemory can generally be trusted on this GPU
-	UnreliableFreeMemory bool
-
-	// GPU information
-	filterID     string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices
-	Name         string `json:"name"`          // user friendly name if available
-	ComputeMajor int    `json:"compute_major"` // Compute Capability or gfx
-	ComputeMinor int    `json:"compute_minor"`
-
-	// Driver Information - TODO no need to put this on each GPU
-	DriverMajor int `json:"driver_major,omitempty"`
-	DriverMinor int `json:"driver_minor,omitempty"`
-
-	// TODO other performance capability info to help in scheduling decisions
-}
-
-func (gpu GpuInfo) RunnerName() string {
-	if gpu.Variant != "" {
-		return gpu.Library + "_" + gpu.Variant
-	}
-	return gpu.Library
-}
-
-type CPUInfo struct {
-	GpuInfo
-	CPUs []CPU
-}
-
 // CPU type represents a CPU Package occupying a socket
 type CPU struct {
 	ID                  string `cpuinfo:"processor"`
@@ -71,32 +25,6 @@ type CPU struct {
 	ThreadCount         int
 }

-type GpuInfoList []GpuInfo
-
-func (l GpuInfoList) ByLibrary() []GpuInfoList {
-	resp := []GpuInfoList{}
-	libs := []string{}
-	for _, info := range l {
-		found := false
-		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
-		}
-		for i, lib := range libs {
-			if lib == requested {
-				resp[i] = append(resp[i], info)
-				found = true
-				break
-			}
-		}
-		if !found {
-			libs = append(libs, requested)
-			resp = append(resp, []GpuInfo{info})
-		}
-	}
-	return resp
-}
-
 func LogDetails(devices []ml.DeviceInfo) {
 	for _, dev := range devices {
 		var libs []string
@@ -141,74 +69,3 @@ func LogDetails(devices []ml.DeviceInfo) {
 		)
 	}
 }
-
-// Sort by Free Space
-type ByFreeMemory []GpuInfo
-
-func (a ByFreeMemory) Len() int           { return len(a) }
-func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
-func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
-
-type SystemInfo struct {
-	System CPUInfo   `json:"system"`
-	GPUs   []GpuInfo `json:"gpus"`
-}
-
-// Return the optimal number of threads to use for inference
-func (si SystemInfo) GetOptimalThreadCount() int {
-	if len(si.System.CPUs) == 0 {
-		// Fall back to Go's num CPU
-		return runtime.NumCPU()
-	}
-
-	coreCount := 0
-	for _, c := range si.System.CPUs {
-		coreCount += c.CoreCount - c.EfficiencyCoreCount
-	}
-
-	return coreCount
-}
-
-// For each GPU, check if it does NOT support flash attention
-func (l GpuInfoList) FlashAttentionSupported() bool {
-	for _, gpu := range l {
-		supportsFA := gpu.Library == "cpu" ||
-			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
-			gpu.Library == "ROCm" ||
-			gpu.Library == "Vulkan"
-
-		if !supportsFA {
-			return false
-		}
-	}
-	return true
-}
-
-type BaseRunner interface {
-	// GetPort returns the localhost port number the runner is running on
-	GetPort() int
-
-	// HasExited indicates if the runner is no longer running.  This can be used during
-	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
-	HasExited() bool
-}
-
-type RunnerDiscovery interface {
-	BaseRunner
-
-	// GetDeviceInfos will perform a query of the underlying device libraries
-	// for device identification and free VRAM information
-	// During bootstrap scenarios, this routine may take seconds to complete
-	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
-}
-
-type FilteredRunnerDiscovery interface {
-	RunnerDiscovery
-
-	// GetActiveDeviceIDs returns the filtered set of devices actively in
-	// use by this runner for running models.  If the runner is a bootstrap runner, no devices
-	// will be active yet so no device IDs are returned.
-	// This routine will not query the underlying device and will return immediately
-	GetActiveDeviceIDs() []ml.DeviceID
-}