llm: Remove unneeded warning with flash attention enabled

If flash attention is enabled without KV cache quantization, we
currently always get this warning:
level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
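For context, a minimal standalone sketch of the pre-fix control flow (supportsKVCacheType here is a stub, not the real ggml method): with flash attention on and no cache type requested, kvct is the empty string, the condition is false, and the else branch emits the warning above.

package main

import "log/slog"

// supportsKVCacheType is a stub standing in for (*ggml.GGML).SupportsKVCacheType;
// for this sketch it only recognizes the quantized cache types.
func supportsKVCacheType(t string) bool {
	return t == "q8_0" || t == "q4_0"
}

func main() {
	// Flash attention enabled, but no KV cache quantization requested.
	kvct := ""

	// Pre-fix shape of the check in server.go: the empty string fails the
	// condition, so the else branch logs the spurious warning with type="".
	if kvct != "" && supportsKVCacheType(kvct) {
		slog.Info("using quantized kv cache", "type", kvct)
	} else {
		slog.Warn("kv cache type not supported by model", "type", kvct)
	}
}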
Jesse Gross authored and committed on 2025-09-09 10:37:28 -07:00
commit 71cb86af3e, parent 5198956372
3 changed files with 9 additions and 5 deletions


@@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		// Flash Attention also supports kv cache quantization
 		// Enable if the requested and kv cache type is supported by the model
-		if kvct != "" && f.SupportsKVCacheType(kvct) {
+		if f.SupportsKVCacheType(kvct) {
 			loadRequest.KvCacheType = kvct
 		} else {
 			slog.Warn("kv cache type not supported by model", "type", kvct)