llm: Remove unneeded warning with flash attention enabled

If flash attention is enabled without KV cache quantization, we
currently always get this warning:
level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
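For context, a minimal standalone sketch of the pre-fix control flow (supportsKVCacheType here is a stub, not the real ggml method): with flash attention on and no cache type requested, kvct is the empty string, the condition is false, and the else branch emits the warning above.

package main

import "log/slog"

// supportsKVCacheType is a stub standing in for (*ggml.GGML).SupportsKVCacheType;
// for this sketch it only recognizes the quantized cache types.
func supportsKVCacheType(t string) bool {
	return t == "q8_0" || t == "q4_0"
}

func main() {
	// Flash attention enabled, but no KV cache quantization requested.
	kvct := ""

	// Pre-fix shape of the check in server.go: the empty string fails the
	// condition, so the else branch logs the spurious warning with type="".
	if kvct != "" && supportsKVCacheType(kvct) {
		slog.Info("using quantized kv cache", "type", kvct)
	} else {
		slog.Warn("kv cache type not supported by model", "type", kvct)
	}
}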
Jesse Gross authored and committed on 2025-09-09 10:37:28 -07:00
commit 71cb86af3e, parent 5198956372
3 changed files with 9 additions and 5 deletions


@@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		// Flash Attention also supports kv cache quantization
 		// Enable if the requested and kv cache type is supported by the model
-		if kvct != "" && f.SupportsKVCacheType(kvct) {
+		if f.SupportsKVCacheType(kvct) {
 			loadRequest.KvCacheType = kvct
 		} else {
 			slog.Warn("kv cache type not supported by model", "type", kvct)