From 7837a5bc7e8651f41dc6cddde8fd13887e0ff62d Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Thu, 4 Dec 2025 11:42:30 -0800
Subject: [PATCH] ggml: Always set cache padding to 256

We currently use cache padding of 32 when not using flash attention
and 256 with flash attention, which is based on the historic alignment
requirements of these kernels. The restrictions have since been
loosened, but there are still performance benefits to padding, such as
better CUDA graph reuse.

Since the requirement is no longer kernel-specific, set the padding
uniformly to 256, as llama.cpp has.
---
 ml/backend/ggml/ggml.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 39457939..f1a19e0b 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -687,7 +687,7 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
 	if b.flashAttention {
 		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
 	} else {
-		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
+		return ml.CacheConfig{CachePadding: 256, PermutedV: true}
 	}
 }
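
Note (not part of the patch): a minimal sketch of how a CachePadding value
such as 256 is typically applied by a KV-cache consumer: the number of cells
considered is rounded up to a multiple of the padding so that tensor shapes
stay stable across decode steps, which is what makes benefits like CUDA graph
reuse possible. The roundUp helper and the sample sizes below are illustrative
assumptions, not code from this repository.

package main

import "fmt"

// roundUp pads n up to the next multiple of pad.
// Illustrative helper; hypothetical, not taken from this repository.
func roundUp(n, pad int) int {
	return ((n + pad - 1) / pad) * pad
}

func main() {
	const cachePadding = 256
	// With uniform padding, all of these sizes map onto a small set of
	// padded lengths, so the same compiled graph can be reused.
	for _, cells := range []int{1, 100, 256, 300} {
		fmt.Printf("cells in use: %4d -> padded cache length: %4d\n",
			cells, roundUp(cells, cachePadding))
	}
}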