Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 23:18:26 +00:00)
llm: Allow overriding flash attention setting
As we automatically enable flash attention for more models, there are likely some cases where we get it wrong. This allows setting OLLAMA_FLASH_ATTENTION=0 to disable it, even for models that usually have flash attention.
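The mechanism is that the envconfig lookup now takes the model's own preference as its default. Below is a minimal sketch of that behavior, assuming OLLAMA_FLASH_ATTENTION is parsed as a boolean and matching the call shape envconfig.FlashAttention(f.FlashAttention()) seen in the diff; it is illustrative only, not the real envconfig code.

package envconfig

import (
	"os"
	"strconv"
)

// FlashAttention reports whether flash attention should be enabled.
// An explicit OLLAMA_FLASH_ATTENTION value ("0", "1", "false", "true", ...)
// always wins; when the variable is unset or unparseable, the caller-supplied
// default (typically the model's own preference) is used instead.
// Sketch only: the real envconfig implementation may differ.
func FlashAttention(defaultValue bool) bool {
	if v, err := strconv.ParseBool(os.Getenv("OLLAMA_FLASH_ATTENTION")); err == nil {
		return v
	}
	return defaultValue
}

With this shape, leaving the variable unset keeps the automatic per-model behavior, while OLLAMA_FLASH_ATTENTION=0 forces flash attention off even for a model that would normally enable it.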
@@ -195,7 +195,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
+	useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
 		(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
 		f.SupportsFlashAttention()
 
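The semantic shift is in the first clause: the old expression could only OR the environment variable on top of the model default, so OLLAMA_FLASH_ATTENTION=0 had no effect for a model that defaults to flash attention, while the new expression lets an explicit environment value replace the model default. The other two clauses are unchanged gates, restated here as a standalone sketch (parameter names are mine, not ollama's).

// useFlashAttention restates the condition from estimateGPULayers:
// all three gates must pass before flash attention is assumed for the estimate.
func useFlashAttention(envOrModelDefault, allGPUsSupportFA, modelSupportsFA bool) bool {
	return envOrModelDefault && // envconfig.FlashAttention(f.FlashAttention()): user override, else model default
		allGPUsSupportFA && // (discover.GpuInfoList)(gpus).FlashAttentionSupported(): every GPU must support it
		modelSupportsFA // f.SupportsFlashAttention(): the model architecture supports it at all
}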
@@ -196,14 +196,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		loadRequest.ProjectorPath = projectors[0]
 	}
 
+	fa := envconfig.FlashAttention(f.FlashAttention())
+
 	// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
 	// that can handle it.
-	fa := envconfig.FlashAttention()
-	if f.FlashAttention() {
-		slog.Info("model wants flash attention")
-		fa = true
-	}
-
 	if fa && !gpus.FlashAttentionSupported() {
 		slog.Warn("flash attention enabled but not supported by gpu")
 		fa = false
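The net effect in NewLlamaServer is a two-step resolution: pick the preference (environment override or model default), then drop it with a warning if the GPUs cannot honor it. A standalone sketch of that flow, with a plain bool standing in for gpus.FlashAttentionSupported() (names here are illustrative):

package main

import (
	"fmt"
	"log/slog"
)

// resolveFlashAttention mirrors the post-change flow in NewLlamaServer:
// userOrModelFA is the already-resolved preference (env override or model
// default) and gpusSupportFA stands in for gpus.FlashAttentionSupported().
func resolveFlashAttention(userOrModelFA, gpusSupportFA bool) bool {
	fa := userOrModelFA
	if fa && !gpusSupportFA {
		slog.Warn("flash attention enabled but not supported by gpu")
		fa = false
	}
	return fa
}

func main() {
	fmt.Println(resolveFlashAttention(true, true))  // true: preference honored
	fmt.Println(resolveFlashAttention(true, false)) // false, with a warning logged
	fmt.Println(resolveFlashAttention(false, true)) // false: nothing requested it
}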