From 5317202c38437867bc6c9ed21ffc5c949ab6794c Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 25 Nov 2025 14:51:02 -0800
Subject: [PATCH] llm: Don't always evict models on CPU-only systems

Model eviction happens when we have at least one other model loaded
and are unable to load all layers into VRAM. However, on CPU-only
systems we can never load layers into VRAM, so this constantly
triggered eviction.

Fixes #13227
---
 llm/server.go      | 10 +++++-----
 llm/server_test.go |  9 +++++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llm/server.go b/llm/server.go
index 4eaa88df..fa4e438d 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
 		}}
 	}
 	gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-	err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
 	if err != nil {
 		return nil, err
 	}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
 }

 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
 	}

 	if requireFull {
-		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+		if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+			slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
 			return ErrLoadRequiredFull
 		}

@@ -998,7 +998,7 @@ nextLayer:
 		}
 	}

-	if gpuLayers.Sum() == 0 {
+	if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
 	}

diff --git a/llm/server_test.go b/llm/server_test.go
index 1f5d5cda..5dc0aa9b 100644
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -26,10 +26,11 @@ func TestLLMServerFitGPU(t *testing.T) {
 		expectedErr error
 	}{
 		{
-			name:     "No GPU",
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{},
+			name:        "No GPU",
+			layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:      -1,
+			expected:    ml.GPULayersList{},
+			requireFull: true, // Should not try to evict even though we can't load any layers
 		},
 		{
 			name: "Full single GPU",
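
For reviewers, a minimal standalone sketch of the decision the new guard encodes, assuming nothing beyond the condition visible in the diff. shouldEvict is a hypothetical helper, not a function in llm/server.go; its plain int parameters stand in for len(systemGPUs), gpuLayers.Sum(), len(layers), and s.options.NumGPU.

    package main

    import "fmt"

    // shouldEvict distills the guard the patch adds to verifyLayout: eviction
    // is only worthwhile when the system has GPUs, because on a CPU-only host
    // freeing VRAM cannot let us offload any more layers.
    func shouldEvict(numSystemGPUs, offloadedLayers, totalLayers, requestedGPULayers int) bool {
    	if numSystemGPUs == 0 {
    		// CPU-only system: no layers can ever go to VRAM, so evicting
    		// another model would not change the outcome.
    		return false
    	}
    	if requestedGPULayers >= 0 && offloadedLayers >= requestedGPULayers {
    		// The user asked for a specific layer count and we already meet it.
    		return false
    	}
    	// Evict only if we still could not place every layer on the GPUs.
    	return offloadedLayers < totalLayers
    }

    func main() {
    	fmt.Println(shouldEvict(0, 0, 50, -1))  // false: the CPU-only case from #13227
    	fmt.Println(shouldEvict(1, 10, 50, -1)) // true: GPU present, model does not fit
    }

With zero GPUs the helper never reports eviction, which is the behavior the patch restores for CPU-only systems.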