discover: Disable flash attention for Jetson Xavier (CC 7.2)

GGML picks the wrong kernel and these systems fail with:

Sep 28 22:25:39 xavier ollama[48999]: //ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu:437: ERROR: CUDA kernel flash_attn_ext_f16 has no device code compatible with CUDA arch 720. ggml-cuda.cu was compiled for: __CUDA_ARCH_LIST__

Fixes #12442
2025-12-21 22:33:56 +00:00 · 2025-10-07 11:37:58 -07:00
parent 4e5d862ec4
commit aa45f7ce27
3 changed files with 17 additions and 13 deletions
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -2,7 +2,6 @@ package discover
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -62,17 +61,14 @@ func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
 			DependencyPath: dev.LibraryPath,
 			DriverMajor:    dev.DriverMajor,
 			DriverMinor:    dev.DriverMinor,
 			ComputeMajor:   dev.ComputeMajor,
 			ComputeMinor:   dev.ComputeMinor,
 		}
 		if dev.Library == "CUDA" || dev.Library == "ROCm" {
 			info.MinimumMemory = 457 * format.MebiByte
 		}
-		if dev.Library == "ROCm" {
+		if dev.Library == "ROCm" && rocmDir != "" {
-			info.Compute = fmt.Sprintf("gfx%x%02x", dev.ComputeMajor, dev.ComputeMinor)
+			info.DependencyPath = append(info.DependencyPath, rocmDir)
 			if rocmDir != "" {
 				info.DependencyPath = append(info.DependencyPath, rocmDir)
 			}
 		} else {
 			info.Compute = fmt.Sprintf("%d.%d", dev.ComputeMajor, dev.ComputeMinor)
 		}
 		resp = append(resp, info)
 	}
--- a/discover/types.go
+++ b/discover/types.go
@@ -37,9 +37,10 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	UnreliableFreeMemory bool
 	// GPU information
-	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
+	filterID     string // AMD Workaround: The numeric ID of the device used to filter out other devices
-	Name     string `json:"name"`    // user friendly name if available
+	Name         string `json:"name"`          // user friendly name if available
-	Compute  string `json:"compute"` // Compute Capability or gfx
+	ComputeMajor int    `json:"compute_major"` // Compute Capability or gfx
 	ComputeMinor int    `json:"compute_minor"`
 	// Driver Information - TODO no need to put this on each GPU
 	DriverMajor int `json:"driver_major,omitempty"`
@@ -173,7 +174,7 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
 			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
 			gpu.Library == "ROCm"
 		if !supportsFA {
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -266,11 +266,18 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
 		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			var compute string
 			if gpus[i].Library == "ROCm" {
 				compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
 			} else {
 				compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
 			}
 			slog.Debug("gpu has too little memory to allocate any layers",
 				"id", gpus[i].ID,
 				"library", gpus[i].Library,
 				"variant", gpus[i].Variant,
-				"compute", gpus[i].Compute,
+				"compute", compute,
 				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
 				"name", gpus[i].Name,
 				"total", format.HumanBytes2(gpus[i].TotalMemory),