From 71cb86af3e8b8006540550a8eeb9fed106b77eee Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 9 Sep 2025 10:37:28 -0700
Subject: [PATCH] llm: Remove unneeded warning with flash attention enabled

If flash attention is enabled without KV cache quantization, we will
currently always get this warning:

level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
---
 fs/ggml/ggml.go | 10 +++++++---
 llm/memory.go   |  2 +-
 llm/server.go   |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 56ad420e..57476a9a 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -864,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 
 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+	if cacheType == "" || cacheType == "f16" {
+		return true
+	}
+
 	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
 		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types ", "mode", arch)
-		return cacheType == "f16"
+		slog.Warn("model only supports non-quantized cache types", "model", arch)
+		return false
 	}
-	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
+	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
diff --git a/llm/memory.go b/llm/memory.go
index ce128eb5..7a87b28f 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var kvct string
 	if useFlashAttention {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && f.SupportsKVCacheType(requested) {
+		if f.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}
diff --git a/llm/server.go b/llm/server.go
index 4740a1fd..a22ae972 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 
 	// Flash Attention also supports kv cache quantization
 	// Enable if the requested and kv cache type is supported by the model
-	if kvct != "" && f.SupportsKVCacheType(kvct) {
+	if f.SupportsKVCacheType(kvct) {
 		loadRequest.KvCacheType = kvct
 	} else {
 		slog.Warn("kv cache type not supported by model", "type", kvct)
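
To illustrate the behavioral change, here is a minimal standalone sketch of the
patched SupportsKVCacheType decision logic. This is not the actual ggml.GGML
method: the real one reads the architecture from the model's KV metadata, while
this sketch takes it as a parameter purely for illustration.

package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType mirrors the patched GGML.SupportsKVCacheType logic as a
// standalone function (the arch parameter is a stand-in for f.KV().Architecture()).
func supportsKVCacheType(arch, cacheType string) bool {
	// An empty type means "use the default (f16)", so both are always
	// supported and callers no longer need their own `requested != ""` guard.
	if cacheType == "" || cacheType == "f16" {
		return true
	}
	// gpt-oss uses attention with sinks, which rules out quantized cache types.
	if slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		return false
	}
	// Everything else accepts the known quantized types.
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	for _, tc := range []struct{ arch, cacheType string }{
		{"llama", ""},       // default: supported, no warning
		{"llama", "f16"},    // explicit f16: supported
		{"llama", "q8_0"},   // quantized on a normal model: supported
		{"gpt-oss", "q8_0"}, // quantized on gpt-oss: rejected
		{"llama", "q5_1"},   // unknown type: rejected
	} {
		fmt.Printf("arch=%-8s type=%-6q supported=%v\n",
			tc.arch, tc.cacheType, supportsKVCacheType(tc.arch, tc.cacheType))
	}
}

Before the patch, an empty cache type returned false (it is not in the
f16/q8_0/q4_0 list), so the `!= ""` guards at both call sites were load-bearing
and server.go still warned on type="". Moving the empty/f16 check into the
function lets both call sites drop the guard while keeping the warning for
genuinely unsupported types.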