flash attn: add auto mode for llama engine (#13052)

* flash attn: add auto mode for llama engine If the user does not specify fa in the environment, use auto-mode. * review comments * ensure kv cache quantized types have FA explicitly enabled additional review comments
2025-12-21 14:26:30 +00:00 · 2025-12-12 13:27:19 -08:00
parent 3af5d3b738
commit bd6c1d6b49
7 changed files with 101 additions and 25 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -188,6 +188,11 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	if len(projectors) > 0 && llamaModel != nil {
 		loadRequest.ProjectorPath = projectors[0]
 	}
+	// Determine if the user has forced FA on or off
+	faUserSet := false
+	if envconfig.FlashAttention(true) == envconfig.FlashAttention(false) {
+		faUserSet = true
+	}

 	fa := envconfig.FlashAttention(f.FlashAttention())

@@ -205,19 +210,51 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st

 	kvct := strings.ToLower(envconfig.KvCacheType())

-	if fa {
-		slog.Info("enabling flash attention")
-		loadRequest.FlashAttention = true
-
-		// Flash Attention also supports kv cache quantization
-		// Enable if the requested and kv cache type is supported by the model
-		if f.SupportsKVCacheType(kvct) {
-			loadRequest.KvCacheType = kvct
-		} else {
-			slog.Warn("kv cache type not supported by model", "type", kvct)
+	if textProcessor == nil {
+		flashAttention := ml.FlashAttentionAuto
+		if faUserSet {
+			if fa {
+				flashAttention = ml.FlashAttentionEnabled
+			} else {
+				flashAttention = ml.FlashAttentionDisabled
+			}
+		}
+
+		if kvct != "" {
+			if f.KVCacheTypeIsQuantized(kvct) {
+				if flashAttention != ml.FlashAttentionEnabled {
+					slog.Warn("OLLAMA_FLASH_ATTENTION must be enabled to use a quantized OLLAMA_KV_CACHE_TYPE", "type", kvct)
+					loadRequest.KvCacheType = ""
+				} else if f.SupportsKVCacheType(kvct) {
+					loadRequest.KvCacheType = kvct
+				} else {
+					slog.Warn("unsupported OLLAMA_KV_CACHE_TYPE", "type", kvct)
+				}
+			} else {
+				if f.SupportsKVCacheType(kvct) {
+					loadRequest.KvCacheType = kvct
+				} else {
+					slog.Warn("unsupported OLLAMA_KV_CACHE_TYPE", "type", kvct)
+				}
+			}
+		}
+		loadRequest.FlashAttention = flashAttention
+	} else {
+		// For Ollama engine, use our SupportsFlashAttention logic
+		if fa {
+			slog.Info("enabling flash attention")
+			loadRequest.FlashAttention = ml.FlashAttentionEnabled
+
+			// Flash Attention also supports kv cache quantization
+			// Enable if the requested and kv cache type is supported by the model
+			if f.SupportsKVCacheType(kvct) {
+				loadRequest.KvCacheType = kvct
+			} else {
+				slog.Warn("kv cache type not supported by model", "type", kvct)
+			}
+		} else if kvct != "" && kvct != "f16" {
+			slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
 		}
-	} else if kvct != "" && kvct != "f16" {
-		slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
 	}

 	gpuLibs := ml.LibraryPaths(gpus)
@@ -435,7 +472,7 @@ type LoadRequest struct {
 	LoraPath       []string
 	Parallel       int
 	BatchSize      int
-	FlashAttention bool
+	FlashAttention ml.FlashAttentionType
 	KvSize         int
 	KvCacheType    string
 	NumThreads     int