From 5317202c38437867bc6c9ed21ffc5c949ab6794c Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 25 Nov 2025 14:51:02 -0800
Subject: [PATCH] llm: Don't always evict models on CPU-only systems

Model eviction happens when we have at least one other model loaded
and are unable to load all layers into VRAM. However, on CPU-only
systems we can never load layers into VRAM, so this constantly
triggered eviction.

Fixes #13227
---
 llm/server.go      | 10 +++++-----
 llm/server_test.go |  9 +++++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llm/server.go b/llm/server.go
index 4eaa88df..fa4e438d 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
 		}}
 	}
 	gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-	err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
 	if err != nil {
 		return nil, err
 	}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
 }

 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
 	}

 	if requireFull {
-		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+		if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+			slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
 			return ErrLoadRequiredFull
 		}

@@ -998,7 +998,7 @@ nextLayer:
 		}
 	}

-	if gpuLayers.Sum() == 0 {
+	if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
 	}

diff --git a/llm/server_test.go b/llm/server_test.go
index 1f5d5cda..5dc0aa9b 100644
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -26,10 +26,11 @@ func TestLLMServerFitGPU(t *testing.T) {
 		expectedErr error
 	}{
 		{
-			name:     "No GPU",
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{},
+			name:        "No GPU",
+			layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:      -1,
+			expected:    ml.GPULayersList{},
+			requireFull: true, // Should not try to evict even though we can't load any layers
 		},
 		{
 			name: "Full single GPU",
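
For reviewers, a minimal standalone sketch of the decision the new guard encodes, assuming nothing beyond the condition visible in the diff. shouldEvict is a hypothetical helper, not a function in llm/server.go; its plain int parameters stand in for len(systemGPUs), gpuLayers.Sum(), len(layers), and s.options.NumGPU.

    package main

    import "fmt"

    // shouldEvict distills the guard the patch adds to verifyLayout: eviction
    // is only worthwhile when the system has GPUs, because on a CPU-only host
    // freeing VRAM cannot let us offload any more layers.
    func shouldEvict(numSystemGPUs, offloadedLayers, totalLayers, requestedGPULayers int) bool {
    	if numSystemGPUs == 0 {
    		// CPU-only system: no layers can ever go to VRAM, so evicting
    		// another model would not change the outcome.
    		return false
    	}
    	if requestedGPULayers >= 0 && offloadedLayers >= requestedGPULayers {
    		// The user asked for a specific layer count and we already meet it.
    		return false
    	}
    	// Evict only if we still could not place every layer on the GPUs.
    	return offloadedLayers < totalLayers
    }

    func main() {
    	fmt.Println(shouldEvict(0, 0, 50, -1))  // false: the CPU-only case from #13227
    	fmt.Println(shouldEvict(1, 10, 50, -1)) // true: GPU present, model does not fit
    }

With zero GPUs the helper never reports eviction, which is the behavior the patch restores for CPU-only systems.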