From aa45f7ce27f41ce28e08701cd7b0ef6671646053 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 7 Oct 2025 11:37:58 -0700 Subject: [PATCH] discover: Disable flash attention for Jetson Xavier (CC 7.2) GGML picks the wrong kernel and these systems fail with: Sep 28 22:25:39 xavier ollama[48999]: //ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu:437: ERROR: CUDA kernel flash_attn_ext_f16 has no device code compatible with CUDA arch 720. ggml-cuda.cu was compiled for: __CUDA_ARCH_LIST__ Fixes #12442 --- discover/gpu.go | 12 ++++-------- discover/types.go | 9 +++++---- llm/memory.go | 9 ++++++++- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/discover/gpu.go b/discover/gpu.go index f4d002f2..15d1e79f 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -2,7 +2,6 @@ package discover import ( "context" - "fmt" "log/slog" "os" "path/filepath" @@ -62,17 +61,14 @@ func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList { DependencyPath: dev.LibraryPath, DriverMajor: dev.DriverMajor, DriverMinor: dev.DriverMinor, + ComputeMajor: dev.ComputeMajor, + ComputeMinor: dev.ComputeMinor, } if dev.Library == "CUDA" || dev.Library == "ROCm" { info.MinimumMemory = 457 * format.MebiByte } - if dev.Library == "ROCm" { - info.Compute = fmt.Sprintf("gfx%x%02x", dev.ComputeMajor, dev.ComputeMinor) - if rocmDir != "" { - info.DependencyPath = append(info.DependencyPath, rocmDir) - } - } else { - info.Compute = fmt.Sprintf("%d.%d", dev.ComputeMajor, dev.ComputeMinor) + if dev.Library == "ROCm" && rocmDir != "" { + info.DependencyPath = append(info.DependencyPath, rocmDir) } resp = append(resp, info) } diff --git a/discover/types.go b/discover/types.go index 718809f4..f0c3989d 100644 --- a/discover/types.go +++ b/discover/types.go @@ -37,9 +37,10 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"? UnreliableFreeMemory bool // GPU information - filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices - Name string `json:"name"` // user friendly name if available - Compute string `json:"compute"` // Compute Capability or gfx + filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices + Name string `json:"name"` // user friendly name if available + ComputeMajor int `json:"compute_major"` // Compute Capability or gfx + ComputeMinor int `json:"compute_minor"` // Driver Information - TODO no need to put this on each GPU DriverMajor int `json:"driver_major,omitempty"` @@ -173,7 +174,7 @@ func (l GpuInfoList) FlashAttentionSupported() bool { for _, gpu := range l { supportsFA := gpu.Library == "cpu" || gpu.Name == "Metal" || gpu.Library == "Metal" || - (gpu.Library == "CUDA" && gpu.DriverMajor >= 7) || + (gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier gpu.Library == "ROCm" if !supportsFA { diff --git a/llm/memory.go b/llm/memory.go index 4a54b331..aa4927f1 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -266,11 +266,18 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize { + var compute string + if gpus[i].Library == "ROCm" { + compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor) + } else { + compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor) + } + slog.Debug("gpu has too little memory to allocate any layers", "id", gpus[i].ID, "library", gpus[i].Library, "variant", gpus[i].Variant, - "compute", gpus[i].Compute, + "compute", compute, "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor), "name", gpus[i].Name, "total", format.HumanBytes2(gpus[i].TotalMemory),