gptoss: enable flash attention by default (#11996)

2025-12-21 22:33:56 +00:00 · 2025-08-26 13:34:45 -07:00
parent 30fb7e19f8
commit 85ccf7354d
3 changed files with 25 additions and 5 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -195,6 +195,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 	// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
 	// that can handle it.
 	fa := envconfig.FlashAttention()
+	if f.FlashAttention() {
+		slog.Info("model wants flash attention")
+		fa = true
+	}
+
 	if fa && !gpus.FlashAttentionSupported() {
 		slog.Warn("flash attention enabled but not supported by gpu")
 		fa = false