From aa45f7ce27f41ce28e08701cd7b0ef6671646053 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 7 Oct 2025 11:37:58 -0700
Subject: [PATCH] discover: Disable flash attention for Jetson Xavier (CC 7.2)

GGML picks a flash attention kernel that has no device code built for CC 7.2, so these systems fail with:
Sep 28 22:25:39 xavier ollama[48999]: //ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu:437:
ERROR: CUDA kernel flash_attn_ext_f16 has no device code compatible with CUDA arch 720. ggml-cuda.cu
was compiled for: __CUDA_ARCH_LIST__

Fixes #12442
---
 discover/gpu.go   | 12 ++++--------
 discover/types.go |  9 +++++----
 llm/memory.go     |  9 ++++++++-
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/discover/gpu.go b/discover/gpu.go
index f4d002f2..15d1e79f 100644
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -2,7 +2,6 @@ package discover
 
 import (
 	"context"
-	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -62,17 +61,14 @@ func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
 			DependencyPath: dev.LibraryPath,
 			DriverMajor:    dev.DriverMajor,
 			DriverMinor:    dev.DriverMinor,
+			ComputeMajor:   dev.ComputeMajor,
+			ComputeMinor:   dev.ComputeMinor,
 		}
 		if dev.Library == "CUDA" || dev.Library == "ROCm" {
 			info.MinimumMemory = 457 * format.MebiByte
 		}
-		if dev.Library == "ROCm" {
-			info.Compute = fmt.Sprintf("gfx%x%02x", dev.ComputeMajor, dev.ComputeMinor)
-			if rocmDir != "" {
-				info.DependencyPath = append(info.DependencyPath, rocmDir)
-			}
-		} else {
-			info.Compute = fmt.Sprintf("%d.%d", dev.ComputeMajor, dev.ComputeMinor)
+		if dev.Library == "ROCm" && rocmDir != "" {
+			info.DependencyPath = append(info.DependencyPath, rocmDir)
 		}
 		resp = append(resp, info)
 	}
diff --git a/discover/types.go b/discover/types.go
index 718809f4..f0c3989d 100644
--- a/discover/types.go
+++ b/discover/types.go
@@ -37,9 +37,10 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	UnreliableFreeMemory bool
 
 	// GPU information
-	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
-	Name     string `json:"name"`    // user friendly name if available
-	Compute  string `json:"compute"` // Compute Capability or gfx
+	filterID     string // AMD Workaround: The numeric ID of the device used to filter out other devices
+	Name         string `json:"name"`          // user friendly name if available
+	ComputeMajor int    `json:"compute_major"` // Compute Capability or gfx
+	ComputeMinor int    `json:"compute_minor"`
 
 	// Driver Information - TODO no need to put this on each GPU
 	DriverMajor int `json:"driver_major,omitempty"`
@@ -173,7 +174,7 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
 			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
 			gpu.Library == "ROCm"
 
 		if !supportsFA {
diff --git a/llm/memory.go b/llm/memory.go
index 4a54b331..aa4927f1 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -266,11 +266,18 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
 		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+			var compute string
+			if gpus[i].Library == "ROCm" {
+				compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
+			} else {
+				compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
+			}
+
 			slog.Debug("gpu has too little memory to allocate any layers",
 				"id", gpus[i].ID,
 				"library", gpus[i].Library,
 				"variant", gpus[i].Variant,
-				"compute", gpus[i].Compute,
+				"compute", compute,
 				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
 				"name", gpus[i].Name,
 				"total", format.HumanBytes2(gpus[i].TotalMemory),