From 073fa31df501d7cb943d3fa0fb19841da9651479 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Wed, 20 Aug 2025 12:51:45 -0700
Subject: [PATCH] llm: Don't always evict models in CPU-only mode

With old memory estimates, it's currently impossible to load more than
one model at a time when no GPUs are available. This is because the
check for whether we need to evict a model looks to see if all layers
of the new model can be loaded onto GPUs, which is never true if there
are no GPUs. Before the memory management changes, there was a special
code path for CPU-only systems. This problem does not exist with new
memory estimates.

Fixes #11974
---
 llm/memory.go | 12 ++++++++----
 llm/server.go |  5 +----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llm/memory.go b/llm/memory.go
index ee4be741..d8ae5e44 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 		// Try to pack into as few GPUs as possible, starting from 1 GPU
 		for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
 			gpuSubset := sgl[:numGPUs]
-			ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
+			ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
 
 			if ok {
 				slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 		//  - try subsets of GPUs instead of just falling back to 1 or all in a family
 
 		// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
-		if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
+		if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
 			slog.Info("new model will fit in available VRAM, loading",
 				"model", modelPath,
 				"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
+		_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 }
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
+func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@@ -97,6 +97,10 @@ func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
 				return true, estimatedVRAM
 			}
 		}
+
+		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
+			return true, estimatedVRAM
+		}
 	}
 	return false, estimatedVRAM
 }
diff --git a/llm/server.go b/llm/server.go
index ecdaa90e..b05e9b82 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -492,6 +492,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		if !requireFull {
 			g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
 		} else {
+			slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
 			return ErrLoadRequiredFull
 		}
 	}
@@ -524,10 +525,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		}
 	}
 
-	if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
-		return ErrLoadRequiredFull
-	}
-
 	slog.Info("offload", "", s.estimate)
 
 	s.gpus = gpus
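
Note (not part of the patch): the llm/memory.go hunk adds a CPU-only complete-fit check to predictServerFit, replacing the special case that the second llm/server.go hunk removes from Load. The sketch below restates that condition in isolation so it can be read and run on its own; gpuInfo and memoryEstimate are simplified stand-ins for the real discover.GpuInfo and estimate types, and the sizes in main are made up for illustration.

package main

import "fmt"

// gpuInfo is a stand-in for discover.GpuInfo: on a CPU-only system the
// scheduler reports a single device with Library == "cpu" and FreeMemory
// set to the available system RAM.
type gpuInfo struct {
	Library    string
	FreeMemory uint64
}

// memoryEstimate is a stand-in for the memory estimate: TotalSize is the
// full footprint of the model, including the portion that stays on CPU.
type memoryEstimate struct {
	TotalSize uint64
}

// cpuOnlyFits mirrors the condition the patch adds to predictServerFit:
// when the only "GPU" is the CPU device, the model fits completely if its
// total size fits in free system memory, so no other model must be evicted.
func cpuOnlyFits(gpus []gpuInfo, estimate memoryEstimate) bool {
	return len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory
}

func main() {
	cpu := []gpuInfo{{Library: "cpu", FreeMemory: 16 << 30}} // 16 GiB of free RAM

	fmt.Println(cpuOnlyFits(cpu, memoryEstimate{TotalSize: 5 << 30}))  // true: can load alongside existing models
	fmt.Println(cpuOnlyFits(cpu, memoryEstimate{TotalSize: 24 << 30})) // false: eviction (or a partial load) is needed
}

Before this change, predictServerFit could only return true when every layer fit on a GPU, so on CPU-only hosts the scheduler always concluded it had to evict the currently loaded model.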