mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-21 22:33:56 +00:00
discover: Disable flash attention for Jetson Xavier (CC 7.2)
GGML picks the wrong kernel, and these systems fail with:

Sep 28 22:25:39 xavier ollama[48999]: //ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu:437: ERROR: CUDA kernel flash_attn_ext_f16 has no device code compatible with CUDA arch 720. ggml-cuda.cu was compiled for: __CUDA_ARCH_LIST__

Fixes #12442
This commit is contained in:
@@ -2,7 +2,6 @@ package discover
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -62,17 +61,14 @@ func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
|
|||||||
DependencyPath: dev.LibraryPath,
|
DependencyPath: dev.LibraryPath,
|
||||||
DriverMajor: dev.DriverMajor,
|
DriverMajor: dev.DriverMajor,
|
||||||
DriverMinor: dev.DriverMinor,
|
DriverMinor: dev.DriverMinor,
|
||||||
|
ComputeMajor: dev.ComputeMajor,
|
||||||
|
ComputeMinor: dev.ComputeMinor,
|
||||||
}
|
}
|
||||||
if dev.Library == "CUDA" || dev.Library == "ROCm" {
|
if dev.Library == "CUDA" || dev.Library == "ROCm" {
|
||||||
info.MinimumMemory = 457 * format.MebiByte
|
info.MinimumMemory = 457 * format.MebiByte
|
||||||
}
|
}
|
||||||
if dev.Library == "ROCm" {
|
if dev.Library == "ROCm" && rocmDir != "" {
|
||||||
info.Compute = fmt.Sprintf("gfx%x%02x", dev.ComputeMajor, dev.ComputeMinor)
|
info.DependencyPath = append(info.DependencyPath, rocmDir)
|
||||||
if rocmDir != "" {
|
|
||||||
info.DependencyPath = append(info.DependencyPath, rocmDir)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
info.Compute = fmt.Sprintf("%d.%d", dev.ComputeMajor, dev.ComputeMinor)
|
|
||||||
}
|
}
|
||||||
resp = append(resp, info)
|
resp = append(resp, info)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,9 +37,10 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
|
|||||||
UnreliableFreeMemory bool
|
UnreliableFreeMemory bool
|
||||||
|
|
||||||
// GPU information
|
// GPU information
|
||||||
filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
|
filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
|
||||||
Name string `json:"name"` // user friendly name if available
|
Name string `json:"name"` // user friendly name if available
|
||||||
Compute string `json:"compute"` // Compute Capability or gfx
|
ComputeMajor int `json:"compute_major"` // Compute Capability or gfx
|
||||||
|
ComputeMinor int `json:"compute_minor"`
|
||||||
|
|
||||||
// Driver Information - TODO no need to put this on each GPU
|
// Driver Information - TODO no need to put this on each GPU
|
||||||
DriverMajor int `json:"driver_major,omitempty"`
|
DriverMajor int `json:"driver_major,omitempty"`
|
||||||
@@ -173,7 +174,7 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
|
|||||||
for _, gpu := range l {
|
for _, gpu := range l {
|
||||||
supportsFA := gpu.Library == "cpu" ||
|
supportsFA := gpu.Library == "cpu" ||
|
||||||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
|
gpu.Name == "Metal" || gpu.Library == "Metal" ||
|
||||||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
|
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
|
||||||
gpu.Library == "ROCm"
|
gpu.Library == "ROCm"
|
||||||
|
|
||||||
if !supportsFA {
|
if !supportsFA {
|
||||||
|
|||||||
@@ -266,11 +266,18 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||||||
}
|
}
|
||||||
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
||||||
if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||||
|
var compute string
|
||||||
|
if gpus[i].Library == "ROCm" {
|
||||||
|
compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
|
||||||
|
} else {
|
||||||
|
compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
|
||||||
|
}
|
||||||
|
|
||||||
slog.Debug("gpu has too little memory to allocate any layers",
|
slog.Debug("gpu has too little memory to allocate any layers",
|
||||||
"id", gpus[i].ID,
|
"id", gpus[i].ID,
|
||||||
"library", gpus[i].Library,
|
"library", gpus[i].Library,
|
||||||
"variant", gpus[i].Variant,
|
"variant", gpus[i].Variant,
|
||||||
"compute", gpus[i].Compute,
|
"compute", compute,
|
||||||
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
|
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
|
||||||
"name", gpus[i].Name,
|
"name", gpus[i].Name,
|
||||||
"total", format.HumanBytes2(gpus[i].TotalMemory),
|
"total", format.HumanBytes2(gpus[i].TotalMemory),
|
||||||
|
|||||||
Reference in New Issue
Block a user