DRY out the runner lifecycle code (#12540)

* DRY out the runner lifecycle code

  Now that discovery uses the runners as well, this unifies the runner spawning code into a single place. This also unifies GPU discovery types with the newer ml.DeviceInfo.

* win: make incremental builds better

  Place build artifacts in discrete directories so incremental builds don't have to start fresh.

* Adjust sort order to consider iGPUs

* handle cpu inference oom scenarios

* review comments
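The unifying idea is a single spawn path that discovery and inference both call; the bootstrapDevices hunk below shows the real call site, llm.StartRunner. A minimal standalone sketch of that lifecycle follows. startRunner and the --model flag are illustrative assumptions; only the runner subcommand, --ollama-engine, and --port are visible in the diff.

package main

import (
    "fmt"
    "net"
    "os"
    "os/exec"
    "strconv"
)

// startRunner is an illustrative stand-in for the unified helper
// (llm.StartRunner in the diff): reserve a free port, build the runner
// argument list, and apply per-spawn env overrides before starting.
func startRunner(ollamaEngine bool, modelPath string, extraEnvs map[string]string) (*exec.Cmd, int, error) {
    // Reserve a free localhost port, then release it for the child to bind.
    l, err := net.Listen("tcp", "localhost:0")
    if err != nil {
        return nil, 0, err
    }
    port := l.Addr().(*net.TCPAddr).Port
    l.Close()

    // "runner", "--ollama-engine", and "--port" appear in the diff;
    // "--model" is an assumed flag for the inference case.
    args := []string{"runner", "--port", strconv.Itoa(port)}
    if ollamaEngine {
        args = append(args, "--ollama-engine")
    }
    if modelPath != "" {
        args = append(args, "--model", modelPath)
    }
    cmd := exec.Command("ollama", args...)
    cmd.Env = os.Environ()
    for k, v := range extraEnvs {
        cmd.Env = append(cmd.Env, k+"="+v) // os/exec keeps the last duplicate key
    }
    return cmd, port, cmd.Start()
}

func main() {
    // Discovery and inference share the same spawn path; discovery just
    // passes no model plus a device-filtering environment.
    cmd, port, err := startRunner(true, "", map[string]string{"GGML_CUDA_INIT": "1"})
    if err != nil {
        fmt.Println("spawn failed:", err)
        return
    }
    defer cmd.Process.Kill()
    fmt.Println("discovery runner on port", port)
}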
@@ -2065,12 +2065,6 @@ power management:
 			cpus := linuxCPUDetails(buf)
 			slog.Info("example", "scenario", k, "cpus", cpus)
 
-			si := SystemInfo{
-				System: CPUInfo{
-					CPUs: cpus,
-				},
-			}
-			threadCount := si.GetOptimalThreadCount()
 			if len(v.expCPUs) != len(cpus) {
 				t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
 			}
@@ -2085,10 +2079,6 @@ power management:
 					t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
 				}
 			}
-
-			if threadCount != v.expThreadCount {
-				t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
-			}
 		})
 	}
 }
discover/gpu.go (164 changed lines)
@@ -1,16 +1,13 @@
 package discover
 
 import (
-	"context"
 	"log/slog"
 	"os"
-	"path/filepath"
 	"regexp"
 	"runtime"
-	"strconv"
 	"strings"
 
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/ml"
 )
 
@@ -18,159 +15,28 @@ import (
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
-func GetCPUInfo() GpuInfo {
-	mem, err := GetCPUMem()
+// GetSystemInfo returns the last cached state of the GPUs on the system
+func GetSystemInfo() ml.SystemInfo {
+	memInfo, err := GetCPUMem()
 	if err != nil {
 		slog.Warn("error looking up system memory", "error", err)
 	}
-
-	return GpuInfo{
-		memInfo: mem,
-		DeviceID: ml.DeviceID{
-			Library: "cpu",
-			ID:      "0",
-		},
-	}
-}
-
-func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList {
-	devs := GPUDevices(ctx, runners)
-	return devInfoToInfoList(devs)
-}
-
-func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
-	resp := []GpuInfo{}
-	// Our current packaging model places ggml-hip in the main directory
-	// but keeps rocm in an isolated directory. We have to add it to
-	// the [LD_LIBRARY_]PATH so ggml-hip will load properly
-	rocmDir := filepath.Join(LibOllamaPath, "rocm")
-	if _, err := os.Stat(rocmDir); err != nil {
-		rocmDir = ""
+	var threadCount int
+	cpus := GetCPUDetails()
+	for _, c := range cpus {
+		threadCount += c.CoreCount - c.EfficiencyCoreCount
 	}
 
-	for _, dev := range devs {
-		info := GpuInfo{
-			DeviceID: dev.DeviceID,
-			filterID: dev.FilteredID,
-			Name:     dev.Description,
-			memInfo: memInfo{
-				TotalMemory: dev.TotalMemory,
-				FreeMemory:  dev.FreeMemory,
-			},
-			// TODO can we avoid variant
-			DependencyPath: dev.LibraryPath,
-			DriverMajor:    dev.DriverMajor,
-			DriverMinor:    dev.DriverMinor,
-			ComputeMajor:   dev.ComputeMajor,
-			ComputeMinor:   dev.ComputeMinor,
-		}
-		if dev.Library == "CUDA" || dev.Library == "ROCm" {
-			info.MinimumMemory = 457 * format.MebiByte
-		}
-		if dev.Library == "ROCm" && rocmDir != "" {
-			info.DependencyPath = append(info.DependencyPath, rocmDir)
-		}
-		// TODO any special processing of Vulkan devices?
-		resp = append(resp, info)
-	}
-	if len(resp) == 0 {
-		mem, err := GetCPUMem()
-		if err != nil {
-			slog.Warn("error looking up system memory", "error", err)
-		}
-
-		resp = append(resp, GpuInfo{
-			memInfo: mem,
-			DeviceID: ml.DeviceID{
-				Library: "cpu",
-				ID:      "0",
-			},
-		})
-	}
-	return resp
-}
-
-// Given the list of GPUs this instantiation is targeted for,
-// figure out the visible devices environment variable
-//
-// If different libraries are detected, the first one is what we use
-func (l GpuInfoList) GetVisibleDevicesEnv() []string {
-	if len(l) == 0 {
-		return nil
-	}
-	res := []string{}
-	envVar := rocmGetVisibleDevicesEnv(l)
-	if envVar != "" {
-		res = append(res, envVar)
-	}
-	envVar = vkGetVisibleDevicesEnv(l)
-	if envVar != "" {
-		res = append(res, envVar)
-	}
-	return res
-}
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "ROCm" {
-			continue
-		}
-		// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
-		if info.filterID != "" {
-			ids = append(ids, info.filterID)
-		} else {
-			ids = append(ids, info.ID)
-		}
-	}
-	if len(ids) == 0 {
-		return ""
-	}
-	envVar := "ROCR_VISIBLE_DEVICES="
-	if runtime.GOOS != "linux" {
-		envVar = "HIP_VISIBLE_DEVICES="
-	}
-	// There are 3 potential env vars to use to select GPUs.
-	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
-	// HIP_VISIBLE_DEVICES supports numeric IDs only
-	// GPU_DEVICE_ORDINAL supports numeric IDs only
-	return envVar + strings.Join(ids, ",")
-}
-
-func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "Vulkan" {
-			continue
-		}
-		if info.filterID != "" {
-			ids = append(ids, info.filterID)
-		} else {
-			ids = append(ids, info.ID)
-		}
-	}
-	if len(ids) == 0 {
-		return ""
-	}
-	envVar := "GGML_VK_VISIBLE_DEVICES="
-	return envVar + strings.Join(ids, ",")
-}
-
-// GetSystemInfo returns the last cached state of the GPUs on the system
-func GetSystemInfo() SystemInfo {
-	deviceMu.Lock()
-	defer deviceMu.Unlock()
-	gpus := devInfoToInfoList(devices)
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		gpus = []GpuInfo{}
+	if threadCount == 0 {
+		// Fall back to Go's num CPU
+		threadCount = runtime.NumCPU()
 	}
 
-	return SystemInfo{
-		System: CPUInfo{
-			CPUs:    GetCPUDetails(),
-			GpuInfo: GetCPUInfo(),
-		},
-		GPUs: gpus,
+	return ml.SystemInfo{
+		ThreadCount: threadCount,
+		TotalMemory: memInfo.TotalMemory,
+		FreeMemory:  memInfo.FreeMemory,
+		FreeSwap:    memInfo.FreeSwap,
 	}
 }
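GetSystemInfo now reports CPU capacity through ml.SystemInfo, counting one inference thread per performance core (CoreCount minus EfficiencyCoreCount) and falling back to runtime.NumCPU() when no detail is available. A standalone sketch of that heuristic, with an illustrative cpu type standing in for the discover package's CPU:

package main

import (
    "fmt"
    "runtime"
)

// cpu is an illustrative stand-in for the discover package's CPU type.
type cpu struct {
    CoreCount           int
    EfficiencyCoreCount int
}

// optimalThreads mirrors the heuristic above: schedule one inference
// thread per performance core, ignoring efficiency cores.
func optimalThreads(cpus []cpu) int {
    threads := 0
    for _, c := range cpus {
        threads += c.CoreCount - c.EfficiencyCoreCount
    }
    if threads == 0 {
        // No core detail detected; fall back to Go's CPU count.
        threads = runtime.NumCPU()
    }
    return threads
}

func main() {
    // A hybrid 14-core package: 6 performance + 8 efficiency cores.
    fmt.Println(optimalThreads([]cpu{{CoreCount: 14, EfficiencyCoreCount: 8}})) // 6
}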
@@ -4,13 +4,8 @@ package discover
 
 import (
 	"context"
-	"encoding/json"
-	"fmt"
 	"io"
 	"log/slog"
-	"math/rand"
-	"net"
-	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -23,6 +18,7 @@ import (
 
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 )
@@ -36,7 +32,7 @@ var (
 	bootstrapped bool
 )
 
-func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
+func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
 	deviceMu.Lock()
 	defer deviceMu.Unlock()
 	startDiscovery := time.Now()
@@ -154,9 +150,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 			slog.Error("Unknown Library:" + devices[i].Library)
 		}
 
-		extraEnvs := []string{
-			"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
-			envVar + "=" + id,  // Filter to just this one GPU
+		extraEnvs := map[string]string{
+			"GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
+			envVar:           id,  // Filter to just this one GPU
 		}
 		if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
 			needsDelete[i] = true
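Switching extraEnvs from []string{"K=V"} pairs to map[string]string keys each override by variable name, which lets the shared spawn helper merge overrides against the inherited environment without the SplitN bookkeeping removed in the next hunk. A sketch of such a merge; mergeEnv is illustrative, not the ollama implementation:

package main

import (
    "fmt"
    "os"
    "strings"
)

// mergeEnv overlays keyed overrides onto an inherited "K=V" environment:
// matching keys are replaced in place, unseen keys are appended.
func mergeEnv(base []string, extra map[string]string) []string {
    out := make([]string, 0, len(base)+len(extra))
    seen := map[string]bool{}
    for _, kv := range base {
        k, _, _ := strings.Cut(kv, "=")
        if v, ok := extra[k]; ok {
            kv = k + "=" + v // the override wins over the inherited value
            seen[k] = true
        }
        out = append(out, kv)
    }
    for k, v := range extra {
        if !seen[k] {
            out = append(out, k+"="+v)
        }
    }
    return out
}

func main() {
    env := mergeEnv(os.Environ(), map[string]string{
        "GGML_CUDA_INIT":       "1", // force deep init during bootstrap
        "CUDA_VISIBLE_DEVICES": "0", // filter to a single GPU (illustrative)
    })
    fmt.Println(len(env))
}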
@@ -449,100 +445,35 @@ func (r *bootstrapRunner) HasExited() bool {
 	return false
 }
 
-func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
-	// TODO DRY out with llm/server.go
-	slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
+	var out io.Writer
+	if envconfig.LogLevel() == logutil.LevelTrace {
+		out = os.Stderr
+	}
 	start := time.Now()
 	defer func() {
 		slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
 	}()
-	port := 0
-	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-		var l *net.TCPListener
-		if l, err = net.ListenTCP("tcp", a); err == nil {
-			port = l.Addr().(*net.TCPAddr).Port
-			l.Close()
-		}
-	}
-	if port == 0 {
-		slog.Debug("ResolveTCPAddr failed, using random port")
-		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-	}
-	params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
-	var pathEnv string
-	switch runtime.GOOS {
-	case "windows":
-		pathEnv = "PATH"
-	case "darwin":
-		pathEnv = "DYLD_LIBRARY_PATH"
-	default:
-		pathEnv = "LD_LIBRARY_PATH"
-	}
-	libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
-	if rocmDir != "" {
-		libraryPaths = append(libraryPaths, rocmDir)
-	}
-	// Note: we always put our dependency paths first
-	// since these are the exact version we compiled/linked against
-	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
-	}
-
-	cmd := exec.Command(exe, params...)
-	cmd.Env = os.Environ()
-	if envconfig.LogLevel() == logutil.LevelTrace {
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-	}
-
-	// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
-	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-	pathNeeded := true
-	ollamaPathNeeded := true
-	extraDone := make([]bool, len(extraEnvs))
-	for i := range cmd.Env {
-		cmp := strings.SplitN(cmd.Env[i], "=", 2)
-		if strings.EqualFold(cmp[0], pathEnv) {
-			cmd.Env[i] = pathEnv + "=" + pathEnvVal
-			pathNeeded = false
-		} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
-			cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator))
-			ollamaPathNeeded = false
-		} else {
-			for j := range extraEnvs {
-				if extraDone[j] {
-					continue
-				}
-				extra := strings.SplitN(extraEnvs[j], "=", 2)
-				if cmp[0] == extra[0] {
-					cmd.Env[i] = extraEnvs[j]
-					extraDone[j] = true
-				}
-			}
-		}
-	}
-	if pathNeeded {
-		cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
-	}
-	if ollamaPathNeeded {
-		cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
-	}
-	for i := range extraDone {
-		if !extraDone[i] {
-			cmd.Env = append(cmd.Env, extraEnvs[i])
-		}
-	}
-	logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
-	if err := cmd.Start(); err != nil {
-		slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
+	logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
+	cmd, port, err := llm.StartRunner(
+		true, // ollama engine
+		"",   // no model
+		ollamaLibDirs,
+		out,
+		extraEnvs,
+	)
+	if err != nil {
+		slog.Debug("failed to start runner to discovery GPUs", "error", err)
 		return nil
 	}
 
 	go func() {
 		cmd.Wait() // exit status ignored
 	}()
 
 	defer cmd.Process.Kill()
-	devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
+	devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
 	if err != nil {
 		if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
 			// Expected during bootstrapping while we filter out unsupported AMD GPUs
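With spawning delegated to llm.StartRunner, bootstrapDevices keeps only the process choreography: reap the child in a goroutine, always kill it on exit, and read a populated ProcessState as the expected signature of an unsupported GPU crashing the probe. A schematic sketch of that choreography, with sleep standing in for the runner binary:

package main

import (
    "log/slog"
    "os/exec"
)

func main() {
    // "sleep" stands in for the runner binary; the real code gets the
    // *exec.Cmd back from llm.StartRunner.
    cmd := exec.Command("sleep", "10")
    if err := cmd.Start(); err != nil {
        slog.Debug("failed to start runner", "error", err)
        return
    }
    go func() { cmd.Wait() }() // reap the child so it never zombies; exit status ignored
    defer cmd.Process.Kill()   // the discovery runner is always torn down

    // Probe the runner here. Afterwards, a populated ProcessState with a
    // real exit code means the probe process died, which bootstrap treats
    // as the expected signature of an unsupported GPU.
    _ = cmd.ProcessState
}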
@@ -555,52 +486,3 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []s
 
 	return devices
 }
-
-func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
-	var moreDevices []ml.DeviceInfo
-	port := runner.GetPort()
-	tick := time.Tick(10 * time.Millisecond)
-	for {
-		select {
-		case <-ctx.Done():
-			return nil, fmt.Errorf("failed to finish discovery before timeout")
-		case <-tick:
-			r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
-			if err != nil {
-				return nil, fmt.Errorf("failed to create request: %w", err)
-			}
-			r.Header.Set("Content-Type", "application/json")
-
-			resp, err := http.DefaultClient.Do(r)
-			if err != nil {
-				// slog.Warn("failed to send request", "error", err)
-				if runner.HasExited() {
-					return nil, fmt.Errorf("runner crashed")
-				}
-				continue
-			}
-			defer resp.Body.Close()
-
-			if resp.StatusCode == http.StatusNotFound {
-				// old runner, fall back to bootstrapping model
-				return nil, fmt.Errorf("llamarunner free vram reporting not supported")
-			}
-
-			body, err := io.ReadAll(resp.Body)
-			if err != nil {
-				slog.Warn("failed to read response", "error", err)
-				continue
-			}
-			if resp.StatusCode != 200 {
-				logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
-				return nil, fmt.Errorf("runner error: %s", string(body))
-			}
-
-			if err := json.Unmarshal(body, &moreDevices); err != nil {
-				slog.Warn("unmarshal encode response", "error", err)
-				continue
-			}
-			return moreDevices, nil
-		}
-	}
-}
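GetDevicesFromRunner moves to the ml package (bootstrapDevices above now calls ml.GetDevicesFromRunner), but the loop's shape is the interesting part: poll the runner's /info endpoint on a 10ms tick, stop at the context deadline, and bail out early when the process has exited. A condensed standalone version under stated assumptions; the runner interface and staticRunner are stand-ins, and plain http.Get replaces the request plumbing:

package main

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "time"
)

// runner is a stand-in for the real interface (GetPort/HasExited).
type runner interface {
    GetPort() int
    HasExited() bool
}

// queryInfo condenses the polling loop: retry every 10ms until the
// runner answers, give up at the context deadline, and abort early if
// the process already died.
func queryInfo(ctx context.Context, r runner) ([]byte, error) {
    tick := time.Tick(10 * time.Millisecond)
    for {
        select {
        case <-ctx.Done():
            return nil, fmt.Errorf("failed to finish discovery before timeout")
        case <-tick:
            resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/info", r.GetPort()))
            if err != nil {
                if r.HasExited() {
                    return nil, fmt.Errorf("runner crashed")
                }
                continue // not listening yet; keep polling
            }
            defer resp.Body.Close()
            return io.ReadAll(resp.Body)
        }
    }
}

type staticRunner struct{ port int }

func (s staticRunner) GetPort() int    { return s.port }
func (s staticRunner) HasExited() bool { return false }

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer cancel()
    if body, err := queryInfo(ctx, staticRunner{port: 11434}); err == nil {
        fmt.Println(string(body))
    }
}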
@@ -1,10 +1,8 @@
 package discover
 
 import (
-	"context"
 	"log/slog"
 	"path/filepath"
-	"runtime"
 	"strings"
 
 	"github.com/ollama/ollama/format"
@@ -17,50 +15,6 @@ type memInfo struct {
 	FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
 }
 
-// Beginning of an `ollama info` command
-type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
-	ml.DeviceID
-	memInfo
-
-	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant"`
-
-	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory uint64 `json:"-"`
-
-	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath []string `json:"lib_path,omitempty"`
-
-	// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
-	// the FreeMemory is best effort, and may over or under report actual memory usage
-	// False indicates FreeMemory can generally be trusted on this GPU
-	UnreliableFreeMemory bool
-
-	// GPU information
-	filterID string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices
-	Name         string `json:"name"`          // user friendly name if available
-	ComputeMajor int    `json:"compute_major"` // Compute Capability or gfx
-	ComputeMinor int    `json:"compute_minor"`
-
-	// Driver Information - TODO no need to put this on each GPU
-	DriverMajor int `json:"driver_major,omitempty"`
-	DriverMinor int `json:"driver_minor,omitempty"`
-
-	// TODO other performance capability info to help in scheduling decisions
-}
-
-func (gpu GpuInfo) RunnerName() string {
-	if gpu.Variant != "" {
-		return gpu.Library + "_" + gpu.Variant
-	}
-	return gpu.Library
-}
-
-type CPUInfo struct {
-	GpuInfo
-	CPUs []CPU
-}
-
 // CPU type represents a CPU Package occupying a socket
 type CPU struct {
 	ID string `cpuinfo:"processor"`
@@ -71,32 +25,6 @@ type CPU struct {
 	ThreadCount int
 }
 
-type GpuInfoList []GpuInfo
-
-func (l GpuInfoList) ByLibrary() []GpuInfoList {
-	resp := []GpuInfoList{}
-	libs := []string{}
-	for _, info := range l {
-		found := false
-		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
-		}
-		for i, lib := range libs {
-			if lib == requested {
-				resp[i] = append(resp[i], info)
-				found = true
-				break
-			}
-		}
-		if !found {
-			libs = append(libs, requested)
-			resp = append(resp, []GpuInfo{info})
-		}
-	}
-	return resp
-}
-
 func LogDetails(devices []ml.DeviceInfo) {
 	for _, dev := range devices {
 		var libs []string
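ByLibrary grouped devices by backend library plus variant so they could be scheduled together. The same grouping in a standalone form, using a map index in place of the removed linear search; the gpu type is an illustrative stand-in for GpuInfo:

package main

import "fmt"

// gpu is an illustrative stand-in for the removed GpuInfo type.
type gpu struct {
    Library string
    Variant string
}

// byLibrary reproduces the removed grouping: GPUs that share a library
// (and variant) land in the same bucket, in first-seen order.
func byLibrary(gpus []gpu) [][]gpu {
    index := map[string]int{}
    var groups [][]gpu
    for _, g := range gpus {
        key := g.Library
        if g.Variant != "" {
            key += "_" + g.Variant
        }
        i, ok := index[key]
        if !ok {
            i = len(groups)
            index[key] = i
            groups = append(groups, nil)
        }
        groups[i] = append(groups[i], g)
    }
    return groups
}

func main() {
    fmt.Println(byLibrary([]gpu{{Library: "CUDA"}, {Library: "ROCm"}, {Library: "CUDA"}}))
    // [[{CUDA } {CUDA }] [{ROCm }]]
}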
@@ -141,74 +69,3 @@ func LogDetails(devices []ml.DeviceInfo) {
 		)
 	}
 }
-
-// Sort by Free Space
-type ByFreeMemory []GpuInfo
-
-func (a ByFreeMemory) Len() int           { return len(a) }
-func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
-func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
-
-type SystemInfo struct {
-	System CPUInfo   `json:"system"`
-	GPUs   []GpuInfo `json:"gpus"`
-}
-
-// Return the optimal number of threads to use for inference
-func (si SystemInfo) GetOptimalThreadCount() int {
-	if len(si.System.CPUs) == 0 {
-		// Fall back to Go's num CPU
-		return runtime.NumCPU()
-	}
-
-	coreCount := 0
-	for _, c := range si.System.CPUs {
-		coreCount += c.CoreCount - c.EfficiencyCoreCount
-	}
-
-	return coreCount
-}
-
-// For each GPU, check if it does NOT support flash attention
-func (l GpuInfoList) FlashAttentionSupported() bool {
-	for _, gpu := range l {
-		supportsFA := gpu.Library == "cpu" ||
-			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
-			gpu.Library == "ROCm" ||
-			gpu.Library == "Vulkan"
-
-		if !supportsFA {
-			return false
-		}
-	}
-	return true
-}
-
-type BaseRunner interface {
-	// GetPort returns the localhost port number the runner is running on
-	GetPort() int
-
-	// HasExited indicates if the runner is no longer running. This can be used during
-	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
-	HasExited() bool
-}
-
-type RunnerDiscovery interface {
-	BaseRunner
-
-	// GetDeviceInfos will perform a query of the underlying device libraries
-	// for device identification and free VRAM information
-	// During bootstrap scenarios, this routine may take seconds to complete
-	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
-}
-
-type FilteredRunnerDiscovery interface {
-	RunnerDiscovery
-
-	// GetActiveDeviceIDs returns the filtered set of devices actively in
-	// use by this runner for running models. If the runner is a bootstrap runner, no devices
-	// will be active yet so no device IDs are returned.
-	// This routine will not query the underlying device and will return immediately
-	GetActiveDeviceIDs() []ml.DeviceID
-}
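The removed FlashAttentionSupported predicate enabled flash attention only when every scheduled device supports it, with a CUDA gate on driver major version 7 and up plus a carve-out for Jetson Xavier (compute 7.2), which has no kernels. A standalone sketch mirroring that logic; the dev type stands in for GpuInfo:

package main

import "fmt"

// dev is an illustrative stand-in for the fields the removed predicate consulted.
type dev struct {
    Library      string
    Name         string
    DriverMajor  int
    ComputeMajor int
    ComputeMinor int
}

// flashAttentionSupported mirrors the removed logic: every scheduled
// device must individually support flash attention.
func flashAttentionSupported(devs []dev) bool {
    for _, d := range devs {
        ok := d.Library == "cpu" ||
            d.Name == "Metal" || d.Library == "Metal" ||
            (d.Library == "CUDA" && d.DriverMajor >= 7 && !(d.ComputeMajor == 7 && d.ComputeMinor == 2)) ||
            d.Library == "ROCm" ||
            d.Library == "Vulkan"
        if !ok {
            return false
        }
    }
    return true
}

func main() {
    xavier := dev{Library: "CUDA", DriverMajor: 10, ComputeMajor: 7, ComputeMinor: 2}
    fmt.Println(flashAttentionSupported([]dev{xavier})) // false: the Jetson Xavier carve-out
}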