llm: New memory management

This changes the memory allocation strategy from upfront estimation to tracking actual allocations done by the engine and reacting to that. The goal is to avoid issues caused by both under-estimation (crashing) and over-estimation (low performance due to under-utilized GPUs). It is currently opt-in and can be enabled for models running on the Ollama engine by setting OLLAMA_NEW_ESTIMATES=1. Behavior in other cases is unchanged and will continue to use the existing estimates.
2025-12-22 14:53:56 +00:00 · 2025-05-29 12:21:48 -07:00
parent ef7d26ba2c
commit d5a0d8d904
26 changed files with 1860 additions and 900 deletions
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -8,9 +8,178 @@ import (
 	"testing"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/ml"
 	"golang.org/x/sync/semaphore"
 )

+// TestLLMServerFitGPU is a table-driven test of ollamaServer.createLayout.
+// Each case describes a set of GPUs and per-layer weight sizes, then checks
+// which layers createLayout assigns to which GPU (or which error it returns).
+func TestLLMServerFitGPU(t *testing.T) {
+	// gpu describes one simulated device: library is copied to
+	// GpuInfo.Library and free (in bytes) to GpuInfo.FreeMemory. Devices are
+	// given the IDs "gpu0", "gpu1", ... in slice order.
+	type gpu struct {
+		library string
+		free    int
+	}
+
+	// Per-case fields:
+	//   gpus        - simulated devices; empty means no GPU is available
+	//   layers      - weight size in bytes of each layer, recorded in the CPU
+	//                 weights of the backend memory passed to createLayout
+	//   numGPU      - value placed in api.Runner.NumGPU
+	//   requireFull - forwarded to createLayout as its requireFull argument
+	//   expected    - layer assignment createLayout should return
+	//   expectedErr - error createLayout should return (nil when unset)
+	tests := []struct {
+		name        string
+		gpus        []gpu
+		layers      []int
+		numGPU      int
+		requireFull bool
+		expected    ml.GPULayersList
+		expectedErr error
+	}{
+		{
+			name:     "No GPU",
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{},
+		},
+		{
+			name:     "Full single GPU",
+			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
+		},
+		{
+			name:     "Partial single GPU",
+			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
+		},
+		{
+			name:     "Single GPU with numGPU 1",
+			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   1,
+			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
+		},
+		{
+			name:     "Single GPU with numGPU 0",
+			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   0,
+			expected: ml.GPULayersList{},
+		},
+		{
+			name:     "Single GPU with numGPU 999",
+			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+			numGPU:   999,
+			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
+		},
+		{
+			name:     "Multi GPU fits on one",
+			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
+		},
+		{
+			name:     "Multi GPU split",
+			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
+		},
+		{
+			name:     "Multi GPU partial",
+			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+		},
+		{
+			name:     "Multi GPU numGPU 1",
+			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   1,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+		},
+		{
+			name:     "Multi GPU numGPU 2",
+			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   2,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
+		},
+		{
+			name:     "Multi GPU numGPU 999",
+			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   999,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
+		},
+		{
+			name:     "Multi GPU different libraries",
+			gpus:     []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
+			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
+		},
+		{
+			name:        "requireFull",
+			gpus:        []gpu{{free: 256 * format.MebiByte}},
+			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+			numGPU:      -1,
+			requireFull: true,
+			expectedErr: ErrLoadRequiredFull,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Every case runs against the same fixed system memory figures.
+			var systemInfo discover.SystemInfo
+			systemInfo.System.TotalMemory = format.GibiByte
+			systemInfo.System.FreeMemory = 512 * format.MebiByte
+			systemInfo.System.FreeSwap = 256 * format.MebiByte
+
+			// Turn the case's GPU descriptions into discovered devices.
+			gpus := make(discover.GpuInfoList, len(tt.gpus))
+			for i := range tt.gpus {
+				gpus[i].ID = fmt.Sprintf("gpu%d", i)
+				gpus[i].Library = tt.gpus[i].library
+				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
+			}
+
+			s := &ollamaServer{
+				llmServer: llmServer{
+					totalLayers: uint64(len(tt.layers)),
+					options: api.Options{
+						Runner: api.Runner{
+							NumGPU: tt.numGPU,
+						},
+					},
+				},
+			}
+
+			// Backend memory starts with one weight/cache slot per layer on
+			// the CPU and on each GPU; only the CPU weight sizes are filled
+			// in, from the case's layer sizes.
+			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
+				Weights: make([]ml.Memory, s.totalLayers),
+				Cache:   make([]ml.Memory, s.totalLayers),
+			}, GPUs: make([]ml.DeviceMemory, len(gpus))}
+
+			for i := range tt.layers {
+				s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
+			}
+
+			// GPU memory entries use the same IDs as the discovered devices.
+			for i := range s.mem.GPUs {
+				s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
+				s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
+				s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
+			}
+
+			gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
+			// NOTE(review): direct comparison only matches an unwrapped
+			// sentinel; switch to errors.Is if createLayout wraps its errors.
+			if err != tt.expectedErr {
+				t.Fatalf("createLayout returned error %v, want %v", err, tt.expectedErr)
+			}
+			// Layouts are compared through their Hash values.
+			if gpuLayers.Hash() != tt.expected.Hash() {
+				t.Errorf("createLayout assigned %v, want %v", gpuLayers, tt.expected)
+			}
+		})
+	}
+}
+
 func TestLLMServerCompletionFormat(t *testing.T) {
 	// This test was written to fix an already deployed issue. It is a bit
 	// of a mess, and but it's good enough, until we can refactoring the