diff --git a/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch b/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch
new file mode 100644
index 00000000..651c97ad
--- /dev/null
+++ b/llama/patches/0026-ggml-Backport-scale-kernel-fixes.patch
@@ -0,0 +1,57 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross
+Date: Tue, 23 Sep 2025 15:41:58 -0700
+Subject: [PATCH] ggml: Backport scale kernel fixes
+
+The GGML scale kernel uses signed 32-bit ints to represent
+the number of elements in the tensor. For large images,
+mistral-small3.2 overflows this, triggering CUDA errors due
+to negative arguments.
+
+Currently, this can happen when the user passes a large image
+to mistral-small3.2. However, with upcoming changes to reserve
+CUDA memory, it happens every time mistral-small is loaded as
+we reserve using a worst case batch.
+
+This patch is part of an upstream GGML commit and should be removed
+after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
+(cuda && cpu) (#15669)".
+
+Fixes #10388
+---
+ ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
+index 2ee9e5889..0ddeff6a1 100644
+--- a/ggml/src/ggml-cuda/scale.cu
++++ b/ggml/src/ggml-cuda/scale.cu
+@@ -1,18 +1,19 @@
+ #include "scale.cuh"
+ 
+-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
+-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
++#define MAX_GRIDDIM_X 0x7FFFFFFF
+ 
+-    if (i >= k) {
+-        return;
+-    }
++static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
++    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
++    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
+ 
+-    dst[i] = scale * x[i] + bias;
++    for (int64_t i = tid; i < nelements; i += stride) {
++        dst[i] = scale * x[i] + bias;
++    }
+ }
+ 
+-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
+-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
++static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
++    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
++    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
+ }
+ 
+ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
index 2ee9e588..0ddeff6a 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
 #include "scale.cuh"
 
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF
 
-    if (i >= k) {
-        return;
-    }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
 
-    dst[i] = scale * x[i] + bias;
+    for (int64_t i = tid; i < nelements; i += stride) {
+        dst[i] = scale * x[i] + bias;
+    }
 }
 
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
 }
 
 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {