mirror of https://github.com/likelovewant/ollama-for-amd.git, synced 2025-12-23 23:18:26 +00:00
flash attn: add auto mode for llama engine (#13052)
* flash attn: add auto mode for llama engine

  If the user does not specify fa in the environment, use auto mode.

* review comments

* ensure kv cache quantized types have FA explicitly enabled

  additional review comments
@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
+	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/runner/common"
 )

@@ -832,7 +833,7 @@ func (s *Server) loadModel(
 	ppath string,
 	kvSize int,
 	kvCacheType string,
-	flashAttention bool,
+	flashAttention ml.FlashAttentionType,
 	threads int,
 	multiUserCache bool,
 ) {
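The hunks above change loadModel's flashAttention parameter from a bool to the tri-state ml.FlashAttentionType, so the runner can distinguish "auto" from an explicit on/off choice. Below is a minimal, self-contained sketch of the decision the commit message describes; the type and function names (FlashAttentionAuto/Enabled/Disabled, resolveFlashAttention) are illustrative assumptions, not the actual ml package API, and the OLLAMA_FLASH_ATTENTION environment variable is used only as an example input.

package main

import (
	"fmt"
	"os"
	"strings"
)

// FlashAttentionType is a stand-in for the tri-state that replaces the old
// bool parameter (names are assumptions for illustration).
type FlashAttentionType int

const (
	FlashAttentionAuto FlashAttentionType = iota // let the engine decide per model/backend
	FlashAttentionEnabled
	FlashAttentionDisabled
)

func (t FlashAttentionType) String() string {
	switch t {
	case FlashAttentionEnabled:
		return "enabled"
	case FlashAttentionDisabled:
		return "disabled"
	default:
		return "auto"
	}
}

// resolveFlashAttention sketches the behavior in the commit message:
// quantized KV cache types get flash attention explicitly enabled, and an
// unset environment value falls back to auto mode.
func resolveFlashAttention(envValue, kvCacheType string) FlashAttentionType {
	// Treat anything other than f16/f32 as a quantized KV cache type here,
	// which needs flash attention pinned on rather than left in auto mode.
	if kvCacheType != "" && kvCacheType != "f16" && kvCacheType != "f32" {
		return FlashAttentionEnabled
	}

	switch strings.ToLower(envValue) {
	case "1", "true", "on":
		return FlashAttentionEnabled
	case "0", "false", "off":
		return FlashAttentionDisabled
	default:
		// Unset or unrecognized value: defer the decision to the engine.
		return FlashAttentionAuto
	}
}

func main() {
	env := os.Getenv("OLLAMA_FLASH_ATTENTION")
	fmt.Println(resolveFlashAttention(env, "f16"))  // auto when the env var is unset
	fmt.Println(resolveFlashAttention(env, "q8_0")) // enabled: quantized KV cache
}

In this sketch the quantized KV cache check takes precedence over auto mode, matching the commit's note that such cache types need FA explicitly enabled; the actual resolution logic in the llama engine may differ in detail.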