From 71cb86af3e8b8006540550a8eeb9fed106b77eee Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 9 Sep 2025 10:37:28 -0700
Subject: [PATCH] llm: Remove unneeded warning with flash attention enabled

If flash attention is enabled without KV cache quantization, we will
currently always get this warning:

level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
---
 fs/ggml/ggml.go | 10 +++++++---
 llm/memory.go   |  2 +-
 llm/server.go   |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 56ad420e..57476a9a 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -864,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 
 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+	if cacheType == "" || cacheType == "f16" {
+		return true
+	}
+
 	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
 		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types ", "mode", arch)
-		return cacheType == "f16"
+		slog.Warn("model only supports non-quantized cache types", "model", arch)
+		return false
 	}
-	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
+	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
diff --git a/llm/memory.go b/llm/memory.go
index ce128eb5..7a87b28f 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var kvct string
 	if useFlashAttention {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && f.SupportsKVCacheType(requested) {
+		if f.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}
diff --git a/llm/server.go b/llm/server.go
index 4740a1fd..a22ae972 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 
 	// Flash Attention also supports kv cache quantization
 	// Enable if the requested and kv cache type is supported by the model
-	if kvct != "" && f.SupportsKVCacheType(kvct) {
+	if f.SupportsKVCacheType(kvct) {
 		loadRequest.KvCacheType = kvct
 	} else {
 		slog.Warn("kv cache type not supported by model", "type", kvct)
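
To illustrate the behavioral change, here is a minimal standalone sketch of the
patched SupportsKVCacheType decision logic. This is not the actual ggml.GGML
method: the real one reads the architecture from the model's KV metadata, while
this sketch takes it as a parameter purely for illustration.

package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType mirrors the patched GGML.SupportsKVCacheType logic as a
// standalone function (the arch parameter is a stand-in for f.KV().Architecture()).
func supportsKVCacheType(arch, cacheType string) bool {
	// An empty type means "use the default (f16)", so both are always
	// supported and callers no longer need their own `requested != ""` guard.
	if cacheType == "" || cacheType == "f16" {
		return true
	}
	// gpt-oss uses attention with sinks, which rules out quantized cache types.
	if slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
		return false
	}
	// Everything else accepts the known quantized types.
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	for _, tc := range []struct{ arch, cacheType string }{
		{"llama", ""},       // default: supported, no warning
		{"llama", "f16"},    // explicit f16: supported
		{"llama", "q8_0"},   // quantized on a normal model: supported
		{"gpt-oss", "q8_0"}, // quantized on gpt-oss: rejected
		{"llama", "q5_1"},   // unknown type: rejected
	} {
		fmt.Printf("arch=%-8s type=%-6q supported=%v\n",
			tc.arch, tc.cacheType, supportsKVCacheType(tc.arch, tc.cacheType))
	}
}

Before the patch, an empty cache type returned false (it is not in the
f16/q8_0/q4_0 list), so the `!= ""` guards at both call sites were load-bearing
and server.go still warned on type="". Moving the empty/f16 check into the
function lets both call sites drop the guard while keeping the warning for
genuinely unsupported types.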