mirror of https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-21 14:26:30 +00:00
flash attn: add auto mode for llama engine (#13052)
* flash attn: add auto mode for llama engine

  If the user does not specify fa in the environment, use auto mode.

* review comments

* ensure kv cache quantized types have FA explicitly enabled

  additional review comments
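The hunks below switch LoadRequest.FlashAttention from a plain bool to the tri-state ml.FlashAttentionType and reference ml.FlashAttentionAuto, ml.FlashAttentionEnabled, and ml.FlashAttentionDisabled. The type's declaration is not part of this excerpt; a minimal sketch of what such a tri-state likely looks like, assuming iota-style constants in the ml package:

package ml

// FlashAttentionType: hypothetical sketch only; the real declaration lives
// in the ml package and is not shown in this commit excerpt.
type FlashAttentionType int

const (
	FlashAttentionAuto     FlashAttentionType = iota // no user preference: let the engine decide
	FlashAttentionEnabled                            // user explicitly forced flash attention on
	FlashAttentionDisabled                           // user explicitly forced flash attention off
)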
@@ -188,6 +188,11 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	if len(projectors) > 0 && llamaModel != nil {
 		loadRequest.ProjectorPath = projectors[0]
 	}
+	// Determine if the user has forced FA on or off
+	faUserSet := false
+	if envconfig.FlashAttention(true) == envconfig.FlashAttention(false) {
+		faUserSet = true
+	}
 
 	fa := envconfig.FlashAttention(f.FlashAttention())
 
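The faUserSet probe above leans on envconfig.FlashAttention(defaultValue) returning its argument when OLLAMA_FLASH_ATTENTION is unset: if calling it with both true and false yields the same answer, the environment variable must be overriding the default. A self-contained sketch of the same idiom; lookupBool here is a hypothetical stand-in, not the actual envconfig helper:

package main

import (
	"fmt"
	"os"
	"strconv"
)

// lookupBool mimics the assumed envconfig behavior: return the parsed
// environment variable if it is set and valid, otherwise the caller's default.
func lookupBool(key string, defaultValue bool) bool {
	if raw, ok := os.LookupEnv(key); ok {
		if v, err := strconv.ParseBool(raw); err == nil {
			return v
		}
	}
	return defaultValue
}

func main() {
	// Identical results for opposite defaults can only mean the env var is set.
	userSet := lookupBool("OLLAMA_FLASH_ATTENTION", true) == lookupBool("OLLAMA_FLASH_ATTENTION", false)
	fmt.Println("OLLAMA_FLASH_ATTENTION explicitly set:", userSet)
}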
@@ -205,19 +210,51 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	kvct := strings.ToLower(envconfig.KvCacheType())
 
-	if fa {
-		slog.Info("enabling flash attention")
-		loadRequest.FlashAttention = true
-
-		// Flash Attention also supports kv cache quantization
-		// Enable if the requested and kv cache type is supported by the model
-		if f.SupportsKVCacheType(kvct) {
-			loadRequest.KvCacheType = kvct
-		} else {
-			slog.Warn("kv cache type not supported by model", "type", kvct)
-		}
-	} else if kvct != "" && kvct != "f16" {
-		slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
-	}
+	if textProcessor == nil {
+		flashAttention := ml.FlashAttentionAuto
+		if faUserSet {
+			if fa {
+				flashAttention = ml.FlashAttentionEnabled
+			} else {
+				flashAttention = ml.FlashAttentionDisabled
+			}
+		}
+
+		if kvct != "" {
+			if f.KVCacheTypeIsQuantized(kvct) {
+				if flashAttention != ml.FlashAttentionEnabled {
+					slog.Warn("OLLAMA_FLASH_ATTENTION must be enabled to use a quantized OLLAMA_KV_CACHE_TYPE", "type", kvct)
+					loadRequest.KvCacheType = ""
+				} else if f.SupportsKVCacheType(kvct) {
+					loadRequest.KvCacheType = kvct
+				} else {
+					slog.Warn("unsupported OLLAMA_KV_CACHE_TYPE", "type", kvct)
+				}
+			} else {
+				if f.SupportsKVCacheType(kvct) {
+					loadRequest.KvCacheType = kvct
+				} else {
+					slog.Warn("unsupported OLLAMA_KV_CACHE_TYPE", "type", kvct)
+				}
+			}
+		}
+		loadRequest.FlashAttention = flashAttention
+	} else {
+		// For Ollama engine, use our SupportsFlashAttention logic
+		if fa {
+			slog.Info("enabling flash attention")
+			loadRequest.FlashAttention = ml.FlashAttentionEnabled
+
+			// Flash Attention also supports kv cache quantization
+			// Enable if the requested and kv cache type is supported by the model
+			if f.SupportsKVCacheType(kvct) {
+				loadRequest.KvCacheType = kvct
+			} else {
+				slog.Warn("kv cache type not supported by model", "type", kvct)
+			}
+		} else if kvct != "" && kvct != "f16" {
+			slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
+		}
+	}
 
 	gpuLibs := ml.LibraryPaths(gpus)
 
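Net behavior of the llama-engine branch (textProcessor == nil) above, summarized from the hunk; q8_0 is only an example of a quantized cache type:

  OLLAMA_FLASH_ATTENTION unset               -> FlashAttention sent as auto; a quantized OLLAMA_KV_CACHE_TYPE is dropped with a warning
  OLLAMA_FLASH_ATTENTION=1                   -> forced on; a supported quantized type (e.g. q8_0) is honored
  OLLAMA_FLASH_ATTENTION=0                   -> forced off; quantized types are likewise rejected
  non-quantized OLLAMA_KV_CACHE_TYPE (f16)   -> passed through whenever the model supports it, regardless of FA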
@@ -435,7 +472,7 @@ type LoadRequest struct {
 	LoraPath       []string
 	Parallel       int
 	BatchSize      int
-	FlashAttention bool
+	FlashAttention ml.FlashAttentionType
 	KvSize         int
 	KvCacheType    string
 	NumThreads     int
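Because LoadRequest.FlashAttention is now an ml.FlashAttentionType rather than a bool, any caller that built the request by hand needs a matching one-line change; illustrative only, not part of the commit:

// before: loadRequest.FlashAttention = true
// after:
loadRequest.FlashAttention = ml.FlashAttentionEnabled // or FlashAttentionDisabled / FlashAttentionAuto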