llm: Remove unneeded warning with flash attention enabled

If flash attention is enabled without KV cache quantization, we currently always get this warning: level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
2025-12-21 22:33:56 +00:00 · 2025-09-09 10:37:28 -07:00
parent 5198956372
commit 71cb86af3e
3 changed files with 9 additions and 5 deletions
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var kvct string
 	if useFlashAttention {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && f.SupportsKVCacheType(requested) {
+		if f.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}