diff --git a/llm/server.go b/llm/server.go index 1a997e4d..6ba8f8d2 100644 --- a/llm/server.go +++ b/llm/server.go @@ -928,7 +928,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d } } - libraryGpuLayers := assignLayers(layers, gl, s.options.NumGPU, lastUsedGPU) + libraryGpuLayers := assignLayers(layers, gl, requireFull, s.options.NumGPU, lastUsedGPU) if libraryGpuLayers.Sum() > gpuLayers.Sum() { gpuLayers = libraryGpuLayers } @@ -994,7 +994,7 @@ nextLayer: } // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment -func assignLayers(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) { +func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) { // If we can't fit everything then prefer offloading layers other than the output layer for range 2 { // requestedLayers may be -1 if nothing was requested @@ -1003,14 +1003,14 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requestedLayers in if !envconfig.SchedSpread() { for i := lastUsedGPU; i < len(gpus); i++ { // Try to pack things into as few GPUs as possible - forceRequest := i == len(gpus)-1 + forceRequest := i == len(gpus)-1 && !requireFull gpuLayers = findBestFit(layers, gpus[:i+1], requestedLayers, forceRequest) if gpuLayers.Sum() == len(layers) || gpuLayers.Sum() == requestedLayers { break } } } else { - gpuLayers = findBestFit(layers, gpus, requestedLayers, true) + gpuLayers = findBestFit(layers, gpus, requestedLayers, !requireFull) } // We only stop if we've gotten all of the layers - even if we got requestedLayers, we still diff --git a/llm/server_test.go b/llm/server_test.go index f1e67c34..bdedc960 100644 --- a/llm/server_test.go +++ b/llm/server_test.go @@ -127,6 +127,14 @@ func TestLLMServerFitGPU(t *testing.T) { requireFull: true, expectedErr: ErrLoadRequiredFull, }, + { + name: "requireFull numGPU", + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, + numGPU: 4, + requireFull: true, + expectedErr: ErrLoadRequiredFull, + }, } for _, tt := range tests {