diff --git a/llm/server.go b/llm/server.go
index c4b84950..4eaa88df 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -1007,6 +1007,13 @@ nextLayer:
 
 // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
 func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
+	// If the user is manually overriding parameters, treat all GPUs equally so they split according to VRAM
+	if requestedLayers >= 0 || envconfig.SchedSpread() {
+		for i := range gpus {
+			gpus[i].Integrated = false
+		}
+	}
+
 	// If we can't fit everything then prefer offloading layers other than the output layer
 	for range 2 {
 		// requestedLayers may be -1 if nothing was requested
@@ -1039,33 +1046,38 @@ func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, reque
 
 // findBestFit binary searches to find the smallest capacity factor that can fit
 // the max number of layers. The capacity factor is multiplied by the free space on
-// each GPU and a small one will force even balancing.
+// each GPU and a small one will force even balancing. Higher performance GPUs are
+// used first.
 func findBestFit(layers []uint64, gpus []ml.DeviceInfo, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
-	var high float32 = 1
-	var low float32 = 0
+	for _, gl := range ml.ByPerformance(gpus) {
+		var high float32 = 1
+		var low float32 = 0
 
-	// If we need to fulfill the requested number of layers, pretend we have almost infinite VRAM
-	if requestedLayers >= 0 && forceRequest {
-		high = 1000
-	}
-
-	bestAssignments := greedyFit(layers, gpus, high, requestedLayers)
-	maxNumGPU := bestAssignments.Sum()
-	if maxNumGPU == 0 {
-		return bestAssignments
-	}
-
-	for high-low > 1e-6 {
-		mid := (low + high) / 2
-		assignments := greedyFit(layers, gpus, mid, requestedLayers)
-		if assignments.Sum() == maxNumGPU {
-			high = mid
-			bestAssignments = assignments
-		} else {
-			low = mid
+		// If we need to fulfill the requested number of layers, pretend we have almost infinite VRAM
+		if requestedLayers >= 0 && forceRequest {
+			high = 1000
 		}
+
+		bestAssignments := greedyFit(layers, gl, high, requestedLayers)
+		maxNumGPU := bestAssignments.Sum()
+
+		for high-low > 1e-6 {
+			mid := (low + high) / 2
+			assignments := greedyFit(layers, gl, mid, requestedLayers)
+			if assignments.Sum() == maxNumGPU {
+				high = mid
+				bestAssignments = assignments
+			} else {
+				low = mid
+			}
+		}
+
+		layers = layers[:len(layers)-bestAssignments.Sum()]
+		requestedLayers -= bestAssignments.Sum()
+		gpuLayers = append(bestAssignments, gpuLayers...)
 	}
-	return bestAssignments
+
+	return gpuLayers
 }
 
 // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
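To make the scheduling change above concrete: findBestFit previously ran a single binary search over all GPUs at once; it now runs one search per group returned by ml.ByPerformance, and each group claims layers from the tail of whatever is still unassigned (`layers = layers[:len(layers)-bestAssignments.Sum()]`) before the next group is considered, so earlier groups are saturated first. The following is a minimal, self-contained sketch of the capacity-factor search in isolation; this greedyFit is a simplified stand-in with hypothetical types, not the real function's signature:

```go
package main

import "fmt"

// greedyFit is a simplified stand-in for the real helper: it packs layer
// sizes onto GPUs in order, letting each GPU use only free*factor bytes.
func greedyFit(layers, free []uint64, factor float32) int {
	fitted, g := 0, 0
	var used uint64
	// Walk layers from the end of the model, mirroring the scheduler's
	// preference for offloading the last layers first.
	for i := len(layers) - 1; i >= 0; i-- {
		for g < len(free) && used+layers[i] > uint64(float32(free[g])*factor) {
			g++ // this GPU is full at the current factor; spill to the next
			used = 0
		}
		if g == len(free) {
			break // out of GPUs
		}
		used += layers[i]
		fitted++
	}
	return fitted
}

func main() {
	layers := []uint64{100, 100, 100, 100} // layer sizes
	free := []uint64{300, 300}             // free bytes per GPU

	// First find how many layers fit with every GPU at full capacity...
	maxFit := greedyFit(layers, free, 1)

	// ...then binary search for the smallest capacity factor that still
	// fits that many layers; a smaller factor forces an even balance.
	var low, high float32 = 0, 1
	for high-low > 1e-6 {
		mid := (low + high) / 2
		if greedyFit(layers, free, mid) == maxFit {
			high = mid
		} else {
			low = mid
		}
	}
	fmt.Printf("fit %d layers at capacity factor %.4f\n", maxFit, high)
	// Prints: fit 4 layers at capacity factor 0.6667
	// (two layers per GPU rather than three on one and one on the other)
}
```

In the patch this entire search repeats once per performance group, and assignLayers collapses everything into a single group (by clearing Integrated) when the user requested an explicit layer count or envconfig.SchedSpread() is set.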
diff --git a/llm/server_test.go b/llm/server_test.go
index 2d3bf6be..1f5d5cda 100644
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -14,16 +14,11 @@ import (
 )
 
 func TestLLMServerFitGPU(t *testing.T) {
-	type gpu struct {
-		id   ml.DeviceID
-		free int
-	}
-
 	minMemory := 457 * format.MebiByte
 
 	tests := []struct {
 		name        string
-		gpus        []gpu
+		gpus        []ml.DeviceInfo
 		layers      []int
 		numGPU      int
 		requireFull bool
@@ -38,91 +33,91 @@ func TestLLMServerFitGPU(t *testing.T) {
 		},
 		{
 			name:     "Full single GPU",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Partial single GPU",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Single GPU with numGPU 1",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Single GPU with numGPU 0",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   0,
 			expected: ml.GPULayersList{},
 		},
 		{
 			name:     "Single GPU with numGPU 999",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   999,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
 		},
 		{
 			name:     "Multi GPU fits on one",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Multi GPU split",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Multi GPU partial",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 1",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 2",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   2,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 999",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   999,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
 		},
 		{
 			name:     "Multi GPU different libraries",
-			gpus:     []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
 		},
 		{
 			name:        "requireFull",
-			gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
+			gpus:        []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
 			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:      -1,
 			requireFull: true,
@@ -130,12 +125,54 @@ func TestLLMServerFitGPU(t *testing.T) {
 		},
 		{
 			name:        "requireFull numGPU",
-			gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:        []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256 * format.MebiByte)}},
 			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:      4,
 			requireFull: true,
 			expectedErr: ErrLoadRequiredFull,
 		},
+		{
+			name:     "iGPU",
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
+		},
+		{
+			name:     "iGPU + dGPU",
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
+		},
+		{
+			name:     "iGPU + dGPU fits on one",
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
+			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1}}},
+		},
+		{
+			name:     "iGPU + dGPU partial",
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
+			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+			numGPU:   -1,
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
+		},
+		{
+			name:     "iGPU + dGPU numGPU 1",
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
+			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+			numGPU:   1,
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
+		},
+		{
+			name:     "iGPU + dGPU numGPU 999",
+			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
+			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+			numGPU:   999,
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1, 2, 3}}},
+		},
 	}
 
 	for _, tt := range tests {
@@ -145,12 +182,6 @@ func TestLLMServerFitGPU(t *testing.T) {
 			systemInfo.FreeMemory = 512 * format.MebiByte
 			systemInfo.FreeSwap = 256 * format.MebiByte
 
-			gpus := make([]ml.DeviceInfo, len(tt.gpus))
-			for i := range tt.gpus {
-				gpus[i].DeviceID = tt.gpus[i].id
-				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
-			}
-
 			s := &ollamaServer{
 				llmServer: llmServer{
 					totalLayers: uint64(len(tt.layers)),
@@ -165,19 +196,19 @@ func TestLLMServerFitGPU(t *testing.T) {
 			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
 				Weights: make([]uint64, s.totalLayers),
 				Cache:   make([]uint64, s.totalLayers),
-			}, GPUs: make([]ml.DeviceMemory, len(gpus))}
+			}, GPUs: make([]ml.DeviceMemory, len(tt.gpus))}
 
 			for i := range tt.layers {
 				s.mem.CPU.Weights[i] = uint64(tt.layers[i])
 			}
 
 			for i := range s.mem.GPUs {
-				s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
+				s.mem.GPUs[i].DeviceID = tt.gpus[i].DeviceID
 				s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
 				s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 			}
 
-			gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
+			gpuLayers, err := s.createLayout(systemInfo, tt.gpus, s.mem, tt.requireFull, 0)
 			if err != tt.expectedErr {
 				t.Fatalf("fitGPU returned error: %v", err)
 			}
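The test table now constructs ml.DeviceInfo values inline rather than going through the deleted local gpu struct, which is what lets the new cases set Integrated directly. If the repetition ever becomes a burden, a small constructor along these lines (hypothetical, not part of the patch) would shorten the cases without changing what they exercise:

```go
// devInfo is a hypothetical test helper; it fills in only the fields the
// scheduler reads from a device: its ID, free VRAM, and the integrated flag.
func devInfo(id string, free uint64, integrated bool) ml.DeviceInfo {
	return ml.DeviceInfo{
		DeviceID:   ml.DeviceID{ID: id},
		FreeMemory: free,
		Integrated: integrated,
	}
}
```

With it, a fixture like the "iGPU + dGPU" case reduces to `devInfo("gpu0", uint64(128*format.MebiByte+minMemory), false)` and `devInfo("gpu1", uint64(256*format.MebiByte+minMemory), true)`.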
diff --git a/ml/device.go b/ml/device.go
index 040764fe..f0654127 100644
--- a/ml/device.go
+++ b/ml/device.go
@@ -367,6 +367,28 @@ func (a ByFreeMemory) Less(i, j int) bool {
 	return a[i].FreeMemory < a[j].FreeMemory
 }
 
+// ByPerformance groups devices by similar expected speed; currently this only distinguishes discrete from integrated devices
+func ByPerformance(l []DeviceInfo) [][]DeviceInfo {
+	resp := [][]DeviceInfo{}
+	scores := []bool{}
+	for _, info := range l {
+		found := false
+		requested := info.Integrated
+		for i, score := range scores {
+			if score == requested {
+				resp[i] = append(resp[i], info)
+				found = true
+				break
+			}
+		}
+		if !found {
+			scores = append(scores, requested)
+			resp = append(resp, []DeviceInfo{info})
+		}
+	}
+	return resp
+}
+
 func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
 	resp := [][]DeviceInfo{}
 	libs := []string{}
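Despite the general name, ByPerformance currently keys only on the Integrated flag, so it yields at most two groups, ordered by whichever kind of device appears first in the input; the scores slice leaves room for a finer-grained ranking later. A short illustration of the grouping, with hypothetical device IDs (identifiers are unqualified because this would sit in package ml):

```go
gpus := []DeviceInfo{
	{DeviceID: DeviceID{ID: "dgpu0"}},                   // discrete
	{DeviceID: DeviceID{ID: "igpu0"}, Integrated: true}, // integrated
	{DeviceID: DeviceID{ID: "dgpu1"}},                   // discrete
}
groups := ByPerformance(gpus)
// groups[0] holds dgpu0 and dgpu1 (the tier seen first),
// groups[1] holds igpu0; findBestFit fills the groups in this order.
```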