Mirror of https://github.com/likelovewant/ollama-for-amd.git
flash attn: add auto mode for llama engine (#13052)
* flash attn: add auto mode for llama engine

  If the user does not specify fa (flash attention) in the environment, use auto mode.

* review comments

* ensure KV cache quantized types have FA explicitly enabled

* additional review comments
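The sketch below is not the code from this commit; it only illustrates the commit-message behaviour ("if the user does not specify fa in the environment, use auto mode") using the constant values added in the diff further down. The helper name flashAttentionFromEnv and the parsing of OLLAMA_FLASH_ATTENTION are illustrative assumptions, not the actual Ollama option handling.

// flash_attention_env_sketch.go — minimal, self-contained sketch.
package main

import (
	"fmt"
	"os"
	"strconv"
)

type FlashAttentionType int32

const (
	FlashAttentionAuto     FlashAttentionType = -1
	FlashAttentionDisabled FlashAttentionType = 0
	FlashAttentionEnabled  FlashAttentionType = 1
)

// flashAttentionFromEnv is a hypothetical helper: an unset or empty
// variable falls back to auto mode, letting the engine decide per model;
// otherwise the value is parsed as a boolean enable/disable switch.
func flashAttentionFromEnv() FlashAttentionType {
	v, ok := os.LookupEnv("OLLAMA_FLASH_ATTENTION")
	if !ok || v == "" {
		return FlashAttentionAuto
	}
	if enabled, err := strconv.ParseBool(v); err == nil && enabled {
		return FlashAttentionEnabled
	}
	return FlashAttentionDisabled
}

func main() {
	// Prints the raw constant value, e.g. -1 when the variable is unset.
	fmt.Println("flash attention mode:", flashAttentionFromEnv())
}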
ml/device.go (26 lines added)
@@ -492,6 +492,32 @@ func FlashAttentionSupported(l []DeviceInfo) bool {
 	return true
 }
 
+type FlashAttentionType int32
+
+const (
+	// Aligned with llama_flash_attn_type
+	FlashAttentionAuto     FlashAttentionType = -1
+	FlashAttentionDisabled FlashAttentionType = 0
+	FlashAttentionEnabled  FlashAttentionType = 1
+)
+
+func (f FlashAttentionType) LogValue() slog.Value {
+	return slog.AnyValue(f.String())
+}
+
+func (f FlashAttentionType) String() string {
+	switch f {
+	case FlashAttentionAuto:
+		return "Auto"
+	case FlashAttentionDisabled:
+		return "Disabled"
+	case FlashAttentionEnabled:
+		return "Enabled"
+	default:
+		return "unknown"
+	}
+}
+
 // Given the list of GPUs this instantiation is targeted for,
 // figure out the visible devices environment variables
 // Set mustFilter true to enable filtering of CUDA devices
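For illustration, a short standalone usage sketch of the type added above: because LogValue returns the String form, structured logs show a readable mode name ("Auto", "Disabled", "Enabled") instead of the raw int32. The declarations are repeated here so the snippet runs on its own; in the repository they live in ml/device.go (package path assumed from the file name).

package main

import "log/slog"

type FlashAttentionType int32

const (
	FlashAttentionAuto     FlashAttentionType = -1
	FlashAttentionDisabled FlashAttentionType = 0
	FlashAttentionEnabled  FlashAttentionType = 1
)

func (f FlashAttentionType) String() string {
	switch f {
	case FlashAttentionAuto:
		return "Auto"
	case FlashAttentionDisabled:
		return "Disabled"
	case FlashAttentionEnabled:
		return "Enabled"
	default:
		return "unknown"
	}
}

// LogValue makes the type a slog.LogValuer, so slog resolves it to the
// human-readable name rather than the underlying integer.
func (f FlashAttentionType) LogValue() slog.Value {
	return slog.AnyValue(f.String())
}

func main() {
	fa := FlashAttentionAuto
	// Logs something like: msg="flash attention" mode=Auto
	slog.Info("flash attention", "mode", fa)
}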