llm: Prefer dedicated GPUs over iGPUs when allocating memory

We currently assign model layers to GPUs according to free VRAM,
which assumes that GPU performance is roughly equal. This does not
work well on mixed dGPU and iGPU systems, because iGPUs typically
use system memory, which is large but slow. This change instead
assigns layers to dGPUs first and then to iGPUs.

In the future, this could be generalized into a finer-grained
notion of GPU performance, but the dGPU vs. iGPU gap is the most
extreme case.
This commit is contained in:
Jesse Gross
2025-11-04 11:46:11 -08:00
committed by Jesse Gross
parent b13fbad0fe
commit 8bf38552de
3 changed files with 117 additions and 52 deletions

View File

@@ -367,6 +367,28 @@ func (a ByFreeMemory) Less(i, j int) bool {
return a[i].FreeMemory < a[j].FreeMemory
}
// ByPerformance partitions devices into groups of similar speed.
// Devices are currently keyed on whether they are integrated GPUs,
// so at most two groups are produced. Groups (and the devices within
// them) appear in first-encounter order.
func ByPerformance(l []DeviceInfo) [][]DeviceInfo {
	groups := [][]DeviceInfo{}
	keys := []bool{}
	for _, info := range l {
		key := info.Integrated
		// Look for an existing group with the same performance key.
		idx := -1
		for i, k := range keys {
			if k == key {
				idx = i
				break
			}
		}
		if idx >= 0 {
			groups[idx] = append(groups[idx], info)
		} else {
			// First device of this class: start a new group.
			keys = append(keys, key)
			groups = append(groups, []DeviceInfo{info})
		}
	}
	return groups
}
func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
resp := [][]DeviceInfo{}
libs := []string{}