From 0796d79d19345bc3724bd08108a96b669b0f1841 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Tue, 18 Nov 2025 11:13:37 -0800
Subject: [PATCH] cuda: skip large batches

cuda panics on batches larger than 1024, so skip those and fall back to
cpu
---
 ...-Add-memory-detection-using-DXGI-PDH.patch |  2 +-
 ..._vk_buffer_write_2d-from-ggml_vk_buf.patch |  2 +-
 ...er-Dot-Refactor-and-K-Quant-support-.patch |  2 +-
 ...pk_moe-fusion-to-handle-gpt-s-late-s.patch |  2 +-
 ...0032-vulkan-Fuse-rope-set_rows-16769.patch |  2 +-
 ...gsort-with-a-large-number-of-rows-16.patch |  2 +-
 ...-when-FP16-mul_mat-accumulation-is-n.patch |  2 +-
 .../0036-ggml-cuda-skip-large-batches.patch   | 25 +++++++++++++++++++
 .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu      |  3 +++
 9 files changed, 35 insertions(+), 7 deletions(-)
 create mode 100644 llama/patches/0036-ggml-cuda-skip-large-batches.patch

diff --git a/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch b/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch
index 2c211095..1d7018b8 100644
--- a/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch
+++ b/llama/patches/0028-Add-memory-detection-using-DXGI-PDH.patch
@@ -38,7 +38,7 @@ index 44ae76d66..639d551a2 100644
  #ifdef __cplusplus
  }
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index d2c278a35..221e29509 100644
+index ca02ea079..c12b069e5 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
diff --git a/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch b/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch
index e9737aa4..a99b6120 100644
--- a/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch
+++ b/llama/patches/0029-vulkan-Call-ggml_vk_buffer_write_2d-from-ggml_vk_buf.patch
@@ -11,7 +11,7 @@ vidmem optimization.
  1 file changed, 1 insertion(+), 4 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 221e29509..18b7cbccf 100644
+index c12b069e5..76c78c2ea 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
diff --git a/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch b/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch
index 1b1f65e4..24286766 100644
--- a/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch
+++ b/llama/patches/0030-Vulkan-MMQ-Integer-Dot-Refactor-and-K-Quant-support-.patch
@@ -50,7 +50,7 @@ Subject: [PATCH] Vulkan MMQ Integer Dot Refactor and K-Quant support (#16536)
  create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 18b7cbccf..53b57c179 100644
+index 76c78c2ea..7669ed206 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -488,6 +488,7 @@ struct vk_device_struct {
diff --git a/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch b/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch
index 41cd9cd5..f48e25bb 100644
--- a/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch
+++ b/llama/patches/0031-vulkan-Update-topk_moe-fusion-to-handle-gpt-s-late-s.patch
@@ -58,7 +58,7 @@ index 639d551a2..e5c446d1d 100644
  GGML_API size_t gguf_type_size(enum gguf_type type);
  GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 53b57c179..b2855b078 100644
+index 7669ed206..63a762ec2 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;
diff --git a/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch b/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch
index 64c7ffa4..27e342dc 100644
--- a/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch
+++ b/llama/patches/0032-vulkan-Fuse-rope-set_rows-16769.patch
@@ -31,7 +31,7 @@ Add new backend tests.
  6 files changed, 371 insertions(+), 117 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index b2855b078..aaf4334b5 100644
+index 63a762ec2..db92a7901 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -458,6 +458,11 @@ static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
diff --git a/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch b/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch
index 27a50a5f..a7048e7f 100644
--- a/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch
+++ b/llama/patches/0033-vulkan-Handle-argsort-with-a-large-number-of-rows-16.patch
@@ -9,7 +9,7 @@ Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
  2 files changed, 16 insertions(+), 4 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index aaf4334b5..3604ceb04 100644
+index db92a7901..e959674d1 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {
diff --git a/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch b/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch
index dfa46916..d7c4def1 100644
--- a/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch
+++ b/llama/patches/0035-vulkan-Fix-crash-when-FP16-mul_mat-accumulation-is-n.patch
@@ -20,7 +20,7 @@ Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
  1 file changed, 13 insertions(+), 7 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 3604ceb04..80185d9f0 100644
+index e959674d1..903050b0b 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
diff --git a/llama/patches/0036-ggml-cuda-skip-large-batches.patch b/llama/patches/0036-ggml-cuda-skip-large-batches.patch
new file mode 100644
index 00000000..1c9ee45f
--- /dev/null
+++ b/llama/patches/0036-ggml-cuda-skip-large-batches.patch
@@ -0,0 +1,25 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <git@mxy.ng>
+Date: Tue, 18 Nov 2025 11:13:04 -0800
+Subject: [PATCH] ggml-cuda: skip large batches
+
+cuda panics on batches larger than 1024 so mark it as unsupported to
+fallback to cpu
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index f1a20e7fe..1a71e07c9 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+                 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                     return false;
+                 }
++                if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
++                    return false;
++                }
+ #ifdef GGML_USE_MUSA
+                 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+                 if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index f1a20e7f..1a71e07c 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                     return false;
                 }
+                if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
+                    return false;
+                }
 #ifdef GGML_USE_MUSA
                 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
                 if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {