Mirror of https://github.com/likelovewant/ollama-for-amd.git
Synced 2025-12-22 23:03:55 +00:00
DRY out the runner lifecycle code (#12540)
* DRY out the runner lifecycle code: now that discovery uses the runners as well, this unifies the runner spawning code into a single place. This also unifies GPU discovery types with the newer ml.DeviceInfo.
* win: make incremental builds better: place build artifacts in discrete directories so incremental builds don't have to start fresh.
* Adjust sort order to consider iGPUs.
* Handle CPU inference OOM scenarios.
* Review comments.
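The heart of the refactor is visible in the scheduler's injectable hooks in server/sched.go below: the separate discover.GpuInfoList / getCpuFn plumbing is replaced by a single ml.SystemInfo (host memory) plus a []ml.DeviceInfo slice (GPUs). As a rough orientation before reading the hunks, here is a minimal, self-contained Go sketch of the new shape using stand-in types (the real types live in github.com/ollama/ollama/ml and carry more fields; defaultModelsPerGPU is a placeholder value, not taken from this diff):

package main

import "fmt"

// Stand-ins for the real ml.SystemInfo and ml.DeviceInfo; the actual structs
// in github.com/ollama/ollama/ml carry more fields than shown here.
type SystemInfo struct {
	TotalMemory uint64 // host RAM, bytes
	FreeMemory  uint64
}

type DeviceInfo struct {
	Library    string // e.g. "CUDA", "Metal"
	Integrated bool   // true for iGPUs
	FreeMemory uint64 // bytes
}

// Placeholder constant, for illustration only.
const defaultModelsPerGPU = 3

// defaultMaxRunners mirrors the auto-concurrency rule the new processPending
// applies when OLLAMA_MAX_LOADED_MODELS is unset: defaultModelsPerGPU times
// the GPU count, with a floor of one so CPU-only hosts still get a default.
func defaultMaxRunners(gpus []DeviceInfo) uint {
	return uint(defaultModelsPerGPU * max(len(gpus), 1))
}

func main() {
	sys := SystemInfo{TotalMemory: 32 << 30, FreeMemory: 26 << 30}
	gpus := []DeviceInfo{{Library: "Metal", FreeMemory: 12 << 30}}
	fmt.Println("free host memory:", sys.FreeMemory, "default max runners:", defaultMaxRunners(gpus))
}

The diff below threads this signature change through sched.go and every test that stubs the scheduler hooks.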
@@ -9,9 +9,9 @@ import (

"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)

func TestGenerateDebugRenderOnly(t *testing.T) {

@@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

@@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{


@@ -12,9 +12,9 @@ import (
"github.com/google/go-cmp/cmp"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)

// TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers

@@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,

@@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,


@@ -17,9 +17,9 @@ import (
"github.com/google/go-cmp/cmp"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)

type mockRunner struct {

@@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return
}

func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil
}
}

@@ -157,9 +157,9 @@ func TestGenerateChat(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

@@ -768,9 +768,9 @@ func TestGenerate(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

@@ -1193,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false


@@ -14,9 +14,9 @@ import (

"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)

func getTestTools() []api.Tool {

@@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}

@@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}

@@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
server/sched.go (123 changed lines)
@@ -5,12 +5,9 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"runtime"
"slices"
"sort"
"strconv"
"strings"
"sync"
"time"

@@ -52,12 +49,10 @@ type Scheduler struct {
activeLoading llm.LlamaServer
loaded map[string]*runnerRef

loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList
getCpuFn func() discover.GpuInfo

// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getSystemInfoFn func() ml.SystemInfo
waitForRecovery time.Duration
}

@@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
unloadedCh: make(chan any, maxQueue),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
getGpuFn: discover.GPUDevices,
getSystemInfoFn: discover.GetSystemInfo,
waitForRecovery: 5 * time.Second,
}
sched.loadFn = sched.load

@@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) {
}

func (s *Scheduler) processPending(ctx context.Context) {
maxRunners := envconfig.MaxRunners()

for {
select {
case <-ctx.Done():

@@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
s.loadedMu.Lock()
runner := s.loaded[pending.model.ModelPath]
loadedCount := len(s.loaded)
runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r)
}

@@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
} else {
// Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list
var gpus discover.GpuInfoList
var gpus []ml.DeviceInfo
if pending.opts.NumGPU == 0 {
gpus = discover.GpuInfoList{s.getCpuFn()}
gpus = []ml.DeviceInfo{}
} else {
gpus = s.getGpuFn(ctx, runnersSnapshot)
}

if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
// if any GPU has unreliable free memory reporting, 1x the number of GPUs
allReliable := true
for _, gpu := range gpus {
if gpu.UnreliableFreeMemory {
allReliable = false
break
}
}
if allReliable {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus))
systemInfo := s.getSystemInfoFn()
if maxRunners <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
if pending.opts.NumGPU == 0 {
// Need to get actual GPU list to set the correct default max models
g := s.getGpuFn(ctx, runnersSnapshot)
maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
} else {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1))
}
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
}

// Load model for fitting

@@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
s.loadFn(pending, ggml, gpus, false)
s.loadFn(pending, ggml, systemInfo, gpus, false)
break
}

// More than one loaded model, so we have to see if the
// new one fits

needEvict := s.loadFn(pending, ggml, gpus, true)
needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
if !needEvict {
slog.Debug("new model fits with existing models, loading")
break

@@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
runner.refMu.Unlock()
} else {
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r)
}

@@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm

// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1)

// Embedding models should always be loaded with parallel=1

@@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis

if llama == nil {
var err error
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to

@@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis

s.loadedMu.Unlock()

gpuIDs, err := llama.Load(req.ctx, gpus, requireFull)
gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) {
if !requireFull {
// No other models loaded, yet we still don't fit, so report an error
slog.Info("model is too large for system memory", "requireFull", requireFull)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
}
return true
}

@@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
return false
}

// Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown
discreteGPUs := false
iGPUScan:
for _, devid := range gpuIDs {
for _, dev := range gpus {
if dev.DeviceID == devid {
if !dev.Integrated {
discreteGPUs = true
break iGPUScan
}
}
}
}

runner := &runnerRef{
model: req.model,
modelPath: req.model.ModelPath,

@@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
Options: &req.opts,
sessionDuration: sessionDuration,
gpus: gpuIDs,
discreteGPUs: discreteGPUs,
vramSize: llama.VRAMSize(),
totalSize: llama.TotalSize(),
loading: true,

@@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
return false
}

func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
if len(allGpus) == 0 {
return
}
predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners
s.loadedMu.Lock()
runners := make([]*runnerRef, 0, len(s.loaded))

@@ -554,12 +566,13 @@ type runnerRef struct {
refMu sync.Mutex
refCount uint // prevent unloading if > 0

llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus []ml.DeviceID // Recorded at time of provisioning
vramSize uint64
totalSize uint64
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus []ml.DeviceID // Recorded at time of provisioning
discreteGPUs bool // True if all devices are discrete GPUs - used to skip VRAM recovery check for iGPUs
vramSize uint64
totalSize uint64

sessionDuration time.Duration
expireTimer *time.Timer

@@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
// a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should
// proceed anyway
func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.FilteredRunnerDiscovery) chan any {
func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any {
finished := make(chan any, 1)

// CPU or Metal don't need checking, so no waiting required
// windows can page VRAM, only cuda currently can report accurate used vram usage
if len(runner.gpus) == 0 ||
(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "Metal")) ||
(runtime.GOOS == "windows" && runner.gpus[0].Library != "CUDA") {
// CPU, Metal and iGPUs don't need checking, so no waiting required
if len(runner.gpus) == 0 || !runner.discreteGPUs ||
(len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") {
finished <- struct{}{}
slog.Debug("no need to wait for VRAM recovery", "runner", runner)
return finished

@@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi
totalMemoryNow += gpu.TotalMemory
freeMemoryNow += gpu.FreeMemory
}
logutil.Trace("gpu VRAM convergence", "percent", int(max(float32(freeMemoryNow-freeMemoryBefore), 0.0)/float32(runner.vramSize)*100))
if freeMemoryNow > freeMemoryBefore {
logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100))
} else {
logutil.Trace("gpu VRAM convergence", "percent", 0)
}
// If we're within ~75% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)
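Between the scheduler changes above and the test updates below, the reworked waitForVRAMRecovery loop is the subtle part: free VRAM is sampled before and after an unload, and the wait ends once roughly 75% of the runner's estimated VRAM has reappeared, with the new branch keeping the trace log from underflowing when the later sample is briefly lower. A self-contained sketch of that check follows, with made-up numbers, and with the underflow guard applied to the whole computation rather than only the log line as the diff itself does:

package main

import "fmt"

// vramRecovered distills the convergence test from waitForVRAMRecovery above.
// freeBefore and freeNow are free-VRAM samples in bytes taken before and after
// unloading a runner; vramSize is the runner's estimated VRAM footprint.
func vramRecovered(freeBefore, freeNow, vramSize uint64) (percent int, done bool) {
	// Guarded delta: the unsigned subtraction must not wrap when the "now"
	// sample is momentarily lower than the "before" sample.
	var recovered uint64
	if freeNow > freeBefore {
		recovered = freeNow - freeBefore
	}
	percent = int(float32(recovered) / float32(vramSize) * 100)
	done = float32(recovered) > float32(vramSize)*0.75 // ~75% back means we can stop waiting
	return percent, done
}

func main() {
	// Made-up numbers: a 10 GiB runner, with 8 GiB of VRAM already returned.
	percent, done := vramRecovered(2<<30, 10<<30, 10<<30)
	fmt.Println(percent, done) // 80 true
}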
@@ -13,7 +13,6 @@ import (

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"

@@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second},
}
// Fail to load model first
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah")
}
gpus := discover.GpuInfoList{}
s.load(req, f, gpus, false)
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
s.load(req, f, systemInfo, gpus, false)
require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1)
s.loadedMu.Lock()

@@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible")

server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, false)
s.load(req, f, systemInfo, gpus, false)
select {
case err := <-req.errCh:
require.NoError(t, err)

@@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) {

req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure")
s.load(req, f, gpus, false)
s.load(req, f, systemInfo, gpus, false)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")

@@ -106,7 +106,7 @@ type reqBundle struct {
f *ggml.GGML
}

func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}

@@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
return b
}

func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
slog.Info("test getGpuFn called", "runners", runners)
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 12 * format.GigaByte
return []discover.GpuInfo{g}
return []ml.DeviceInfo{g}
}

func getCpuFn() discover.GpuInfo {
slog.Info("test getCpuFn called")
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
g.TotalMemory = 32 * format.GigaByte
g.FreeMemory = 26 * format.GigaByte
return g
func getSystemInfoFn() ml.SystemInfo {
slog.Info("test getSystemInfoFn called")
return ml.SystemInfo{
TotalMemory: 32 * format.GigaByte,
FreeMemory: 26 * format.GigaByte,
}
}

func TestSchedRequestsSameModelSameRequest(t *testing.T) {

@@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) {
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
s.getSystemInfoFn = getSystemInfoFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
b.req.model = a.req.model

@@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
s.getSystemInfoFn = getSystemInfoFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
tmpModel := *a.req.model

@@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
a.ctxDone()
// Report recovered VRAM usage
time.Sleep(1 * time.Millisecond)
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
slog.Info("XXX altered getGpuFn called")
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
slog.Info("altered getGpuFn called")
g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 24 * format.GigaByte
return []discover.GpuInfo{g}
return []ml.DeviceInfo{g}
}
select {
case resp := <-b.req.successCh:

@@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
}

func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
slog.Info("TestRequestsMultipleLoadedModels")
ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
defer done()
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn // 1 metal GPU
s.getCpuFn = getCpuFn // 1 CPU
s.getGpuFn = getGpuFn // 1 Metal GPU
s.getSystemInfoFn = getSystemInfoFn

// Multiple loaded models
a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
c.req.opts.NumGPU = 0 // CPU load, will be allowed
b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c
d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded

t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer
slog.Info("a")
slog.Info("Loading A")
s.pendingReqCh <- a.req
s.Run(ctx)
select {

@@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {

t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
s.newServerFn = b.newServer
slog.Info("b")
slog.Info("Loading B")
s.pendingReqCh <- b.req
select {
case resp := <-b.req.successCh:

@@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {

// This is a CPU load with NumGPU = 0 so it should load
s.newServerFn = c.newServer
slog.Info("c")
slog.Info("Loading C")
s.pendingReqCh <- c.req
select {
case resp := <-c.req.successCh:

@@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
case err := <-c.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
t.Fatal("timeout")
}
s.loadedMu.Lock()

@@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
b.ctxDone()
// Report recovered VRAM usage so scheduler will finish waiting and unload
time.Sleep(1 * time.Millisecond)
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 24 * format.GigaByte
return []discover.GpuInfo{g}
return []ml.DeviceInfo{g}
}
select {
case resp := <-d.req.successCh:

@@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) {
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
s.getSystemInfoFn = getSystemInfoFn
s.newServerFn = a.newServer
slog.Info("a")
successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)

@@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) {
}

var f *ggml.GGML
gpus := discover.GpuInfoList{}
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, false)
s.load(req, f, systemInfo, gpus, false)

select {
case err := <-req.errCh:

@@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) {

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestSchedPrematureExpired(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
defer done()

// Same model, same request
scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 12 * format.GigaByte
return []discover.GpuInfo{g}
}
s.getGpuFn = getGpuFn
s.getSystemInfoFn = getSystemInfoFn
s.newServerFn = scenario1a.newServer
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
require.Len(t, s.pendingReqCh, 1)

@@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) {
func TestSchedUpdateFreeSpace(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
gpus := discover.GpuInfoList{
gpus := []ml.DeviceInfo{
{
DeviceID: ml.DeviceID{
ID: "1",

@@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string {
return s.modelPath
}

func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
func (s *mockLlm) Load(ctx context.Context, sytemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
if requireFull {
if len(gpus) == 0 {
slog.Info("mockLlm.Load CPU based load")
return nil, nil
}
for _, g := range gpus {
if g.FreeMemory >= s.vramSize {
return []ml.DeviceID{g.DeviceID}, nil