Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-21 22:33:56 +00:00)
kvcache: Use Cast instead of Copy for flash attention masks
Flash attention kernels require that the KV cache mask be F16 rather than F32. We can use the GGML operation ggml_cast to do this conversion rather than doing it ourselves, which allows a preallocated buffer in the graph to be reused instead of allocating a new one for each batch. This improves token generation performance with flash attention by 10-30% (measured with gpt-oss). It also makes performance with flash attention better than without it, as expected.
@@ -378,9 +378,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
-		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
-		ctx.Forward(maskTensor.Copy(ctx, out))
-		maskTensor = out
+		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
 	}

 	return maskTensor
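For context, the change collapses the explicit allocate-and-copy into a single graph-level cast. The sketch below illustrates the before/after pattern only; the package name, import path, and helper function are hypothetical, while the ml.Context / ml.Tensor calls mirror those visible in the diff above (assuming the fork's ml API matches upstream ollama).

```go
// Minimal sketch of the mask-dtype conversion pattern. The package name,
// import path, and helper are hypothetical illustrations, not the actual
// implementation in this repository.
package kvcache

import "github.com/ollama/ollama/ml"

// castMaskSketch converts an F32 attention mask to the dtype expected by the
// flash attention kernels (typically F16).
func castMaskSketch(ctx ml.Context, mask ml.Tensor, dtype ml.DType) ml.Tensor {
	if dtype == ml.DTypeF32 {
		// Already the required dtype; nothing to do.
		return mask
	}

	// Old approach (removed by this commit): allocate a fresh destination
	// tensor for every batch and schedule an explicit copy into it.
	//
	//	out := ctx.Input().Empty(dtype, mask.Shape()...)
	//	ctx.Forward(mask.Copy(ctx, out))
	//	return out

	// New approach: express the conversion as a graph operation (ggml_cast),
	// letting GGML reuse a preallocated buffer in the compute graph instead
	// of allocating a new one per batch.
	return mask.Cast(ctx, dtype)
}
```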