discover: Disable flash attention for Jetson Xavier (CC 7.2)

GGML picks the wrong kernel and these systems fail with: Sep 28 22:25:39 xavier ollama[48999]: //ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu:437: ERROR: CUDA kernel flash_attn_ext_f16 has no device code compatible with CUDA arch 720. ggml-cuda.cu was compiled for: __CUDA_ARCH_LIST__ Fixes #12442
2025-12-21 22:33:56 +00:00 · 2025-10-07 11:37:58 -07:00
parent 4e5d862ec4
commit aa45f7ce27
3 changed files with 17 additions and 13 deletions
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -266,11 +266,18 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
 		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+			var compute string
+			if gpus[i].Library == "ROCm" {
+				compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
+			} else {
+				compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
+			}
+
 			slog.Debug("gpu has too little memory to allocate any layers",
 				"id", gpus[i].ID,
 				"library", gpus[i].Library,
 				"variant", gpus[i].Variant,
-				"compute", gpus[i].Compute,
+				"compute", compute,
 				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
 				"name", gpus[i].Name,
 				"total", format.HumanBytes2(gpus[i].TotalMemory),