llm: Allow overriding flash attention setting

As we automatically enable flash attention for more models, there are
likely some cases where we get it wrong. This change allows setting
OLLAMA_FLASH_ATTENTION=0 to disable flash attention even for models
where it would otherwise be enabled by default.
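
For context, a minimal sketch of how this override could behave: the model's own preference acts as the default, and OLLAMA_FLASH_ATTENTION, when set, takes precedence. The helper below is illustrative only; the real parsing lives in Ollama's envconfig package and may differ in detail.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// flashAttention mirrors the intent of envconfig.FlashAttention(default):
// return the model's default unless OLLAMA_FLASH_ATTENTION is set to a
// parseable boolean, in which case the environment wins.
func flashAttention(modelDefault bool) bool {
	if v := os.Getenv("OLLAMA_FLASH_ATTENTION"); v != "" {
		if b, err := strconv.ParseBool(v); err == nil {
			return b
		}
	}
	return modelDefault
}

func main() {
	// With OLLAMA_FLASH_ATTENTION=0 exported, this prints "false" even
	// though the model-level default passed in is true.
	fmt.Println(flashAttention(true))
}

Under that assumption, running something like OLLAMA_FLASH_ATTENTION=0 ollama serve would force flash attention off even for models that enable it by default, while leaving the variable unset keeps the automatic behavior.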
Jesse Gross authored and committed on 2025-10-01 14:38:09 -07:00
parent 05a43e078a
commit fdb109469f

3 changed files with 15 additions and 12 deletions


@@ -196,14 +196,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		loadRequest.ProjectorPath = projectors[0]
 	}
-	// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
-	// that can handle it.
-	fa := envconfig.FlashAttention()
-	if f.FlashAttention() {
-		slog.Info("model wants flash attention")
-		fa = true
-	}
+	fa := envconfig.FlashAttention(f.FlashAttention())
 	if fa && !gpus.FlashAttentionSupported() {
 		slog.Warn("flash attention enabled but not supported by gpu")
 		fa = false