llamarunner: Respect device ordering for offloaded layers

We used to control the way that llama.cpp saw devices using CUDA_VISIBLE_DEVICES or similar. This ensured that the layers offloaded to a device were actually the ones intended, which is particularly important because we might reorder devices based on free memory or performance. When we started explicitly scheduling layers, this logic went away, but the llamarunner had no way to set the correct order of devices. As a result, the correct number of layers would be assigned to each device, but not necessarily the layers that were expected. This change sets up the devices in the correct order based on the offload information.
2025-12-25 07:58:01 +00:00 · 2025-11-10 14:49:46 -08:00
parent 31361c4d3c
commit 4372d0bfef
3 changed files with 69 additions and 11 deletions
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -12,6 +12,7 @@ import (
 	"net/http"
 	"os"
 	"regexp"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -900,19 +901,24 @@ func (s *Server) load(w http.ResponseWriter, r *http.Request) {
 		s.seqs = make([]*Sequence, s.parallel)
 		s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-		gpuIDs := llama.EnumerateGPUs()
-		tensorSplit := make([]float32, len(gpuIDs))
 		numGPU := 0
-		for i := range gpuIDs {
-			for _, layers := range req.GPULayers {
-				if gpuIDs[i] == layers.DeviceID {
-					tensorSplit[i] = float32(len(layers.Layers))
+		var tensorSplit []float32
+		var llamaIDs []uint64
+
+		gpuIDs := llama.EnumerateGPUs()
+		sort.Sort(req.GPULayers)
+		for _, layers := range req.GPULayers {
+			for i := range gpuIDs {
+				if gpuIDs[i].DeviceID == layers.DeviceID {
 					numGPU += len(layers.Layers)
+					tensorSplit = append(tensorSplit, float32(len(layers.Layers)))
+					llamaIDs = append(llamaIDs, gpuIDs[i].LlamaID)
 				}
 			}
 		}

 		params := llama.ModelParams{
+			Devices:      llamaIDs,
 			NumGpuLayers: numGPU,
 			MainGpu:      req.MainGPU,
 			UseMmap:      req.UseMmap && len(req.LoraPath) == 0,