Mirror of https://github.com/likelovewant/ollama-for-amd.git, synced 2025-12-21 14:26:30 +00:00
llm: Remove unneeded warning with flash attention enabled
If flash attention is enabled without KV cache quantization, we currently always get this warning: level=WARN source=server.go:226 msg="kv cache type not supported by model" type=""
@@ -864,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 
 // SupportsKVCacheType checks if the requested cache type is supported
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+    if cacheType == "" || cacheType == "f16" {
+        return true
+    }
+
     if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
         // gpt-oss uses attention with sinks which does not support quantized cache types
-        slog.Warn("model only supports non-quantized cache types ", "mode", arch)
-        return cacheType == "f16"
+        slog.Warn("model only supports non-quantized cache types", "model", arch)
+        return false
     }
-    return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
+    return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
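For reference, a minimal standalone sketch of the cache-type check as it reads after this change. It is assembled from the hunk above; supportsKVCacheType is a hypothetical free function that takes the architecture as a plain string instead of reading it from the model's KV metadata, so the example runs on its own.

package main

import (
    "fmt"
    "slices"
)

// supportsKVCacheType mirrors the new GGML.SupportsKVCacheType logic:
// unset ("") and "f16" are always accepted, gpt-oss rejects quantized
// caches, and everything else must be q8_0 or q4_0.
func supportsKVCacheType(arch, cacheType string) bool {
    if cacheType == "" || cacheType == "f16" {
        return true
    }
    // gpt-oss uses attention with sinks, which does not support quantized caches.
    if slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
        return false
    }
    return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
    fmt.Println(supportsKVCacheType("llama", ""))      // true: the default, no warning anymore
    fmt.Println(supportsKVCacheType("llama", "q8_0"))  // true
    fmt.Println(supportsKVCacheType("gptoss", "q8_0")) // false: sinks, f16 only
    fmt.Println(supportsKVCacheType("llama", "q5_1"))  // false: unsupported type
}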
@@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     var kvct string
     if useFlashAttention {
         requested := strings.ToLower(envconfig.KvCacheType())
-        if requested != "" && f.SupportsKVCacheType(requested) {
+        if f.SupportsKVCacheType(requested) {
             kvct = requested
         }
     }
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 
     // Flash Attention also supports kv cache quantization
     // Enable if the requested and kv cache type is supported by the model
-    if kvct != "" && f.SupportsKVCacheType(kvct) {
+    if f.SupportsKVCacheType(kvct) {
         loadRequest.KvCacheType = kvct
     } else {
         slog.Warn("kv cache type not supported by model", "type", kvct)
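With the empty string and "f16" accepted up front by SupportsKVCacheType, the requested != "" and kvct != "" guards in estimateGPULayers and NewLlamaServer become redundant and are dropped, and the default configuration (flash attention enabled, no cache type requested via envconfig.KvCacheType()) takes the supported branch in NewLlamaServer instead of falling through to the warning.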