Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-21 14:26:30 +00:00)
llm: Remove unneeded warning with flash attention enabled
If flash attention is enabled without KV cache quantization, we currently always get this warning:

level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
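To make the failure mode concrete, here is a minimal, self-contained Go sketch of the pre-fix control flow. It is not the actual ollama code: the model stub and its list of supported cache types are hypothetical, and the envconfig.KvCacheType() lookup is replaced by a hard-coded empty string; only the shape of the checks follows the diff below.

package main

import (
	"log/slog"
	"strings"
)

// Hypothetical stand-in for the model metadata; in ollama the check lives on ggml.GGML.
type model struct{}

// Assumption for this sketch: before the fix, an empty cache type was not treated as supported.
func (model) SupportsKVCacheType(t string) bool {
	switch t {
	case "f16", "q8_0", "q4_0":
		return true
	}
	return false
}

func main() {
	f := model{}

	// Flash attention is on, but no KV cache quantization was requested,
	// so the stand-in for envconfig.KvCacheType() yields "".
	requested := strings.ToLower("")

	var kvct string
	if requested != "" && f.SupportsKVCacheType(requested) {
		kvct = requested
	}

	// Pre-fix check: kvct is "", so the else branch always runs and emits
	// the spurious warning quoted in the commit message above.
	if kvct != "" && f.SupportsKVCacheType(kvct) {
		// loadRequest.KvCacheType = kvct would be set here
	} else {
		slog.Warn("kv cache type not supported by model", "type", kvct)
	}
}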
@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var kvct string
 	if useFlashAttention {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && f.SupportsKVCacheType(requested) {
+		if f.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 
 	// Flash Attention also supports kv cache quantization
 	// Enable if the requested and kv cache type is supported by the model
-	if kvct != "" && f.SupportsKVCacheType(kvct) {
+	if f.SupportsKVCacheType(kvct) {
 		loadRequest.KvCacheType = kvct
 	} else {
 		slog.Warn("kv cache type not supported by model", "type", kvct)
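For the simplified check to actually silence the warning, SupportsKVCacheType presumably has to report an empty type as supported, with empty meaning "keep the default f16 cache". That behavior is not shown in this diff; the sketch below is only an assumption about it, using a hypothetical model stub and supported-type list rather than ollama's implementation.

package main

import "log/slog"

// Hypothetical stand-in for ggml.GGML.
type model struct{}

// Assumption: "" (and the default f16) is reported as supported, so the
// post-fix check below never reaches the warning when no quantization is set.
func (model) SupportsKVCacheType(t string) bool {
	switch t {
	case "", "f16", "q8_0", "q4_0":
		return true
	}
	return false
}

func main() {
	f := model{}
	kvct := "" // flash attention enabled, no KV cache quantization requested

	// Post-fix check from the second hunk above.
	if f.SupportsKVCacheType(kvct) {
		// Setting loadRequest.KvCacheType = "" would be a no-op: the default cache type is kept.
	} else {
		slog.Warn("kv cache type not supported by model", "type", kvct)
	}
}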