From 0796d79d19345bc3724bd08108a96b669b0f1841 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 18 Nov 2025 11:13:37 -0800 Subject: [PATCH] cuda: skip large batches cuda panics on batches larger than 1024, so skip those and fall back to cpu --- ...-Add-memory-detection-using-DXGI-PDH.patch | 2 +- ..._vk_buffer_write_2d-from-ggml_vk_buf.patch | 2 +- ...er-Dot-Refactor-and-K-Quant-support-.patch | 2 +- ...pk_moe-fusion-to-handle-gpt-s-late-s.patch | 2 +- ...0032-vulkan-Fuse-rope-set_rows-16769.patch | 2 +- ...gsort-with-a-large-number-of-rows-16.patch | 2 +- ...-when-FP16-mul_mat-accumulation-is-n.patch | 2 +- .../0036-ggml-cuda-skip-large-batches.patch | 25 +++++++++++++++++++ .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++ 9 files changed, 35 insertions(+), 7 deletions(-) create mode 100644 llama/patches/0036-ggml-cuda-skip-large-batches.patch diff --git a/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch b/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch index 2c211095..1d7018b8 100644 --- a/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch +++ b/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch @@ -38,7 +38,7 @@ index 44ae76d66..639d551a2 100644 #ifdef __cplusplus } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index d2c278a35..221e29509 100644 +index ca02ea079..c12b069e5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); diff --git a/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch b/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch index e9737aa4..a99b6120 100644 --- a/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch +++ b/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch @@ -11,7 +11,7 @@ vidmem optimization.
1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 221e29509..18b7cbccf 100644 +index c12b069e5..76c78c2ea 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr diff --git a/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch b/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch index 1b1f65e4..24286766 100644 --- a/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch +++ b/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch @@ -50,7 +50,7 @@ Subject: [PATCH] Vulkan MMQ Integer Dot Refactor and K-Quant support (#16536) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 18b7cbccf..53b57c179 100644 +index 76c78c2ea..7669ed206 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -488,6 +488,7 @@ struct vk_device_struct { diff --git a/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch b/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch index 41cd9cd5..f48e25bb 100644 --- a/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch +++ b/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch @@ -58,7 +58,7 @@ index 639d551a2..e5c446d1d 100644 GGML_API size_t gguf_type_size(enum gguf_type type); GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 53b57c179..b2855b078 100644 +index 7669ed206..63a762ec2 100644 --- 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11; diff --git a/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch b/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch index 64c7ffa4..27e342dc 100644 --- a/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch +++ b/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch @@ -31,7 +31,7 @@ Add new backend tests. 6 files changed, 371 insertions(+), 117 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index b2855b078..aaf4334b5 100644 +index 63a762ec2..db92a7901 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -458,6 +458,11 @@ static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) { diff --git a/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch b/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch index 27a50a5f..a7048e7f 100644 --- a/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch +++ b/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch @@ -9,7 +9,7 @@ Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851) 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index aaf4334b5..3604ceb04 100644 +index db92a7901..e959674d1 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants { diff --git a/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch b/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch index dfa46916..d7c4def1 100644 --- a/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch +++ 
b/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch @@ -20,7 +20,7 @@ Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 3604ceb04..80185d9f0 100644 +index e959674d1..903050b0b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); diff --git a/llama/patches/0036-ggml-cuda-skip-large-batches.patch b/llama/patches/0036-ggml-cuda-skip-large-batches.patch new file mode 100644 index 00000000..1c9ee45f --- /dev/null +++ b/llama/patches/0036-ggml-cuda-skip-large-batches.patch @@ -0,0 +1,25 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Tue, 18 Nov 2025 11:13:04 -0800 +Subject: [PATCH] ggml-cuda: skip large batches + +cuda panics on batches larger than 1024 so mark it as unsupported to +fallback to cpu +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index f1a20e7fe..1a71e07c9 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g + if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { + return false; + } ++ if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) { ++ return false; ++ } + #ifdef GGML_USE_MUSA + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index f1a20e7f..1a71e07c 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { return false; } + if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) { + return false; + } #ifdef GGML_USE_MUSA const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {