ggml update to b7108 (#12992)

* Revert "vulkan: temporary cary of vulkan fixes (#12971)"

This reverts commit 3a9e8e9fd4.

* ggml update to b7087

* fix argsort on metal

* update to b7108

* fix bakllava regression

This model lacks the metadata for the projector type.

* update to b7209

* fix TopK perf

* only build arm code on arm
Daniel Hiltgen authored 2025-12-03 19:43:29 -08:00, committed by GitHub
parent 854d40edc5
commit 0cf7794b16
303 changed files with 32711 additions and 23435 deletions


@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ff9135fe2..8ba86f824 100644
index 4cf377e7f..4882541c8 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -42,7 +42,7 @@ index ff9135fe2..8ba86f824 100644
}
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -2079,6 +2079,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size);
@@ -54,7 +54,7 @@ index ff9135fe2..8ba86f824 100644
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -2131,7 +2136,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,10 +64,10 @@ index ff9135fe2..8ba86f824 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 8bd5449f1..01e2df61a 100644
index df28d67fb..1f6a56ba2 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
delete ctx;
@@ -75,7 +75,7 @@ index 8bd5449f1..01e2df61a 100644
}
/**
@@ -1560,6 +1561,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1570,6 +1571,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,10 +84,10 @@ index 8bd5449f1..01e2df61a 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index bc396b521..aefc6935e 100644
index fa7e1e13a..8f3b1c173 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -576,6 +576,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -579,6 +579,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -95,7 +95,7 @@ index bc396b521..aefc6935e 100644
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -831,6 +832,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -834,6 +835,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
@@ -103,7 +103,7 @@ index bc396b521..aefc6935e 100644
}
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1112,6 +1114,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
@@ -1115,6 +1117,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -112,7 +112,7 @@ index bc396b521..aefc6935e 100644
static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 7afc881fa..bf0962274 100644
index 70bf6f3d9..f2b7fe692 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 7afc881fa..bf0962274 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index db33a4ab6..c42ee26e1 100644
index e5302f455..43fa83e8f 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3266,6 +3266,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3412,6 +3412,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -144,10 +144,10 @@ index db33a4ab6..c42ee26e1 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index a38df5a97..fd07e4a21 100644
index 48fd99a76..da2aab3df 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -555,6 +555,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
@@ -156,10 +156,10 @@ index a38df5a97..fd07e4a21 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index b695ba051..37e853120 100644
index 3f1bdfb9f..a95c2f305 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -352,6 +352,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device);
delete ctx;
@@ -167,7 +167,7 @@ index b695ba051..37e853120 100644
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -813,6 +814,7 @@ struct ggml_backend_sycl_split_buffer_context {
@@ -816,6 +817,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
@@ -175,7 +175,7 @@ index b695ba051..37e853120 100644
}
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1155,6 +1157,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
@@ -1158,6 +1160,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
@@ -184,10 +184,10 @@ index b695ba051..37e853120 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b783f7805..216dc167c 100644
index 66dd0bfab..83cdec29e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11828,6 +11828,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -12368,6 +12368,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -195,7 +195,7 @@ index b783f7805..216dc167c 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -11971,6 +11972,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -12511,6 +12512,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);


@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 639fecbd3..a7ce6f8e1 100644
index a73c4c448..b9f0631f4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1812,16 +1812,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@@ -31,8 +31,8 @@ index 639fecbd3..a7ce6f8e1 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1993,7 +1984,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
@@ -2014,7 +2005,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
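The effect of the patch, as a minimal self-contained sketch (resolve_pre_type and the enum are illustrative stand-ins, not the real llama.cpp identifiers): an unknown pre-tokenizer name is logged and mapped to the default instead of aborting the model load.

#include <cstdio>
#include <string>

enum llm_pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3 /* ... */ };

static llm_pre_type resolve_pre_type(const std::string & name) {
    if (name == "llama3") return PRE_TYPE_LLAMA3;
    // ... other known pre-tokenizer names ...
    // previously: throw std::runtime_error("unknown pre-tokenizer type: " + name);
    fprintf(stderr, "warning: unknown pre-tokenizer type '%s', using default\n", name.c_str());
    return PRE_TYPE_DEFAULT;
}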


@@ -10,11 +10,11 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f2abf8852..c984e6282 100644
index 05777d2d9..f4c4d2c48 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@
#include <numeric>
@@ -24,6 +24,19 @@
#include <array>
#include <functional>
+#if defined(_WIN32)
@@ -30,10 +30,10 @@ index f2abf8852..c984e6282 100644
+#endif
+#endif
+
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
enum ffn_op_type {
@@ -2774,7 +2787,29 @@ struct clip_model_loader {
@@ -3255,7 +3268,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index f2abf8852..c984e6282 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -2801,7 +2836,11 @@ struct clip_model_loader {
@@ -3282,7 +3317,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
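The gist of the _WIN32 branch added above, as a self-contained sketch (open_utf8_path is an illustrative helper name, not the patch's): widen the UTF-8 path with MultiByteToWideChar so std::ifstream can open files under directories with non-ASCII names.

#if defined(_WIN32)
#include <windows.h>
#include <fstream>
#include <string>

static std::ifstream open_utf8_path(const std::string & fname) {
    // returned size includes the terminating null when the input length is -1
    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
    if (wlen <= 0) {
        return std::ifstream();
    }
    std::wstring wfname(wlen, 0);
    MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, &wfname[0], wlen);
    if (!wfname.empty() && wfname.back() == L'\0') wfname.pop_back();
    return std::ifstream(wfname, std::ios::binary); // MSVC accepts wide paths
}
#endif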


@@ -5,20 +5,36 @@ Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture
---
src/llama-arch.cpp | 21 ++++
src/CMakeLists.txt | 1 +
src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 2 +-
src/llama-model.cpp | 207 +++++++++++++++++++++++++++++++++++++
src/llama-model.cpp | 48 +++++++++++
src/llama-model.h | 3 +
7 files changed, 248 insertions(+), 1 deletion(-)
src/models/models.h | 5 ++
src/models/solar.cpp | 158 +++++++++++++++++++++++++++++++++++++
10 files changed, 253 insertions(+), 1 deletion(-)
create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 67c7807e0..fda881640 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -125,6 +125,7 @@ add_library(llama
models/seed-oss.cpp
models/smallthinker.cpp
models/smollm3.cpp
+ models/solar.cpp
models/stablelm.cpp
models/starcoder.cpp
models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8ca769c5f..ab262ec0c 100644
index 8571a2e02..b6bde25d5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,7 +42,7 @@ index 8ca769c5f..ab262ec0c 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -183,6 +184,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -34,7 +50,7 @@ index 8ca769c5f..ab262ec0c 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1901,6 +1903,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -2023,6 +2025,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +75,7 @@ index 8ca769c5f..ab262ec0c 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -2469,6 +2489,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -2681,6 +2701,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +84,10 @@ index 8ca769c5f..ab262ec0c 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index dea725c1a..ea2b4ffb9 100644
index 150646478..3936a4687 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -86,6 +86,7 @@ enum llm_arch {
@@ -89,6 +89,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
@@ -79,7 +95,7 @@ index dea725c1a..ea2b4ffb9 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -187,6 +188,7 @@ enum llm_kv {
@@ -208,6 +209,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -87,7 +103,7 @@ index dea725c1a..ea2b4ffb9 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -436,6 +438,7 @@ enum llm_tensor {
@@ -459,6 +461,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -96,11 +112,11 @@ index dea725c1a..ea2b4ffb9 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index db65d69ea..b6bf6bbf2 100644
index 8cdbaf69f..41127bf91 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
@@ -161,6 +161,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
@@ -115,7 +131,7 @@ index db65d69ea..b6bf6bbf2 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6fcf91b7d..24569a258 100644
index c3a53be79..2ffe7dd30 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -64,6 +64,8 @@ struct llama_hparams {
@@ -127,7 +143,7 @@ index 6fcf91b7d..24569a258 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -250,6 +252,9 @@ struct llama_hparams {
@@ -256,6 +258,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
@@ -151,10 +167,10 @@ index aa3a65f87..ee303bd58 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2a83d6627..54621ea39 100644
index c2a545531..4468de2f9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1890,6 +1890,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1961,6 +1961,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -176,7 +192,7 @@ index 2a83d6627..54621ea39 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5224,6 +5239,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -5350,6 +5365,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -211,12 +227,71 @@ index 2a83d6627..54621ea39 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16515,6 +16558,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}
@@ -7425,6 +7468,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ llm = std::make_unique<llm_build_solar>(*this, params);
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -7684,6 +7731,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index f8342cf2c..cbf4e1bfa 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_26B,
LLM_TYPE_27B,
LLM_TYPE_30B,
@@ -404,6 +405,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h
index 7ba225b47..71fea796d 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -510,6 +510,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_solar : public llm_graph_context {
+    llm_build_solar(const llama_model & model, const llm_graph_params & params);
+};
+
struct llm_build_stablelm : public llm_graph_context {
llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
};
diff --git a/src/models/solar.cpp b/src/models/solar.cpp
new file mode 100644
index 000000000..97383928c
--- /dev/null
+++ b/src/models/solar.cpp
@@ -0,0 +1,158 @@
+#include "models.h"
+
+llm_build_solar::llm_build_solar(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -285,7 +360,7 @@ index 2a83d6627..54621ea39 100644
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
@@ -371,49 +446,4 @@ index 2a83d6627..54621ea39 100644
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+};
+
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
@@ -20096,6 +20298,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ llm = std::make_unique<llm_build_solar>(*this, params);
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -20331,6 +20537,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index 248f85410..4a7924aaa 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
@@ -390,6 +391,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
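For context, the bskcn_tv tensor registered above drives Solar Pro's "backbone skip connection": a learned two-weight blend of a saved early-layer activation back into the current hidden state. A hedged sketch of that mix with standard ggml ops (helper and argument names are illustrative; the real graph code lives in src/models/solar.cpp):

#include "ggml.h"

// bskcn_tv is assumed to hold two scalar weights: tv[0] for the saved
// activation, tv[1] for the current one.
static ggml_tensor * blend_bskcn(ggml_context * ctx0,
                                 ggml_tensor * bskcn_tv, // [2]
                                 ggml_tensor * skip,     // saved activation
                                 ggml_tensor * cur) {    // current activation
    ggml_tensor * tv_a = ggml_view_1d(ctx0, bskcn_tv, 1, 0);
    ggml_tensor * tv_b = ggml_view_1d(ctx0, bskcn_tv, 1, ggml_element_size(bskcn_tv));
    return ggml_add(ctx0,
                    ggml_mul(ctx0, skip, tv_a), // ggml_mul broadcasts the [1] view
                    ggml_mul(ctx0, cur,  tv_b));
}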


@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a7ce6f8e1..8064dc197 100644
index b9f0631f4..1525283d7 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index a7ce6f8e1..8064dc197 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 65f366517..ce336a228 100644
index 77ba4fc46..040518e1e 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@


@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dd9b51a9e..d88f43209 100644
index c8421e1e8..cb659915d 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -308,7 +308,7 @@ private:
@@ -310,7 +310,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
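The point of the change is that grammar rules must be emitted in insertion order, not sorted by name. One way to keep insertion order while still deduplicating by name, as a sketch (the container shown is an assumption, not necessarily the type the patch switches to):

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct ordered_rules {
    std::vector<std::pair<std::string, std::string>> rules; // insertion order
    std::unordered_map<std::string, size_t> index;          // name -> slot

    void set(const std::string & name, const std::string & body) {
        auto it = index.find(name);
        if (it == index.end()) {
            index.emplace(name, rules.size());
            rules.emplace_back(name, body);
        } else {
            rules[it->second].second = body; // redefinition keeps original position
        }
    }
};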


@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ba281b8e6..ead235878 100644
index d93664b8b..800f98b65 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -314,6 +314,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -349,6 +349,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endif()
ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,7 +19,7 @@ index ba281b8e6..ead235878 100644
endfunction()
ggml_add_backend(CPU)
@@ -324,6 +325,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -359,6 +360,7 @@ if (GGML_CPU_ALL_VARIANTS)
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()


@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ead235878..f9a6587f1 100644
index 800f98b65..6d493a4ff 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -334,10 +334,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -369,10 +369,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)


@@ -53,10 +53,10 @@ index 8cc4ef1cf..d950dbdf5 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8064dc197..31f49801c 100644
index 1525283d7..ea450c361 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);


@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 9ec485cfa..4b2f8b7bd 100644
index 3247af8bb..5be08d6f4 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index 9ec485cfa..4b2f8b7bd 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2891,6 +2893,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2922,6 +2924,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);


@@ -6,14 +6,14 @@ Subject: [PATCH] add ollama vocab for grammar support
---
src/llama-grammar.cpp | 49 ++++++++++++++++++++++++++++++++++++------
src/llama-grammar.h | 14 ++++++++++++
src/llama-sampling.cpp | 4 ++--
3 files changed, 58 insertions(+), 9 deletions(-)
src/llama-sampling.cpp | 6 +++---
3 files changed, 59 insertions(+), 10 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bed706bb2..b51cee090 100644
index b3c5eb571..a7307c47f 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -915,6 +915,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -21,7 +21,7 @@ index bed706bb2..b51cee090 100644
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index) {
@@ -962,6 +963,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -970,6 +971,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
@@ -29,7 +29,7 @@ index bed706bb2..b51cee090 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -975,6 +977,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -983,6 +985,7 @@ struct llama_grammar * llama_grammar_init_impl(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -37,7 +37,7 @@ index bed706bb2..b51cee090 100644
const char * grammar_str,
const char * grammar_root,
bool lazy,
@@ -1067,6 +1070,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1075,6 +1078,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
@@ -45,7 +45,7 @@ index bed706bb2..b51cee090 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -1089,6 +1093,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1097,6 +1101,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
auto * result = new llama_grammar {
grammar.vocab,
@@ -53,7 +53,7 @@ index bed706bb2..b51cee090 100644
grammar.rules,
grammar.stacks,
grammar.partial_utf8,
@@ -1116,7 +1121,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
@@ -1124,7 +1129,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
}
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
@@ -61,7 +61,7 @@ index bed706bb2..b51cee090 100644
if (grammar.awaiting_trigger) {
return;
@@ -1138,9 +1142,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1146,9 +1150,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
@@ -77,7 +77,7 @@ index bed706bb2..b51cee090 100644
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
@@ -1159,9 +1167,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1167,9 +1175,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
}
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
@@ -90,7 +90,7 @@ index bed706bb2..b51cee090 100644
if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1201,13 +1210,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
@@ -1209,13 +1218,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
}
}
@@ -107,7 +107,7 @@ index bed706bb2..b51cee090 100644
}
llama_grammar_accept_str(grammar, piece);
@@ -1227,3 +1237,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
@@ -1235,3 +1245,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
}
}
@@ -184,10 +184,10 @@ index f8c291de9..2a3a62db3 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 55d2e355f..da34526b1 100644
index 3f4a729bc..38a30ea05 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
@@ -196,12 +196,15 @@ index 55d2e355f..da34526b1 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += ")[\\s\\S]*";
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
- grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
+ grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
} else {
- grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
+ grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
}
*ctx = {
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
- /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
+ /* .grammar = */ llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
};
if (!ctx->grammar) {
delete ctx;
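Conceptually, the extra argument threaded through llama_grammar_init_impl above (nullptr at these call sites) is a second vocabulary source: when the llama_vocab pointer is null, an ollama-supplied table answers the two questions grammar evaluation needs, the text piece for a token and whether it ends generation. A hedged sketch of such a shim (the interface is illustrative, not the patch's exact type):

#include <cstdint>
#include <map>
#include <set>
#include <string>

typedef int32_t llama_token;

struct ollama_vocab {
    std::map<llama_token, std::string> pieces; // token id -> text piece
    std::set<llama_token> eog_tokens;          // end-of-generation ids

    const std::string & token_to_piece(llama_token id) const {
        static const std::string empty;
        auto it = pieces.find(id);
        return it == pieces.end() ? empty : it->second;
    }
    bool is_eog(llama_token id) const { return eog_tokens.count(id) > 0; }
};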


@@ -8,14 +8,14 @@ Subject: [PATCH] add argsort and cuda copy for i32
ggml/src/ggml-cuda/argsort.cu | 122 ++++++++++++++++++++++++---
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
ggml/src/ggml-cuda/cpy.cu | 40 +++++++++
ggml/src/ggml-metal/ggml-metal.metal | 64 ++++++++++++++
5 files changed, 263 insertions(+), 12 deletions(-)
ggml/src/ggml-metal/ggml-metal.metal | 69 +++++++++++++++
5 files changed, 268 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index b52f0f847..902fdad69 100644
index 2745fc54e..40666bab6 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7889,6 +7889,45 @@ static void ggml_compute_forward_argsort_f32(
@@ -7846,6 +7846,45 @@ static void ggml_compute_forward_argsort_f32(
}
}
@@ -61,7 +61,7 @@ index b52f0f847..902fdad69 100644
void ggml_compute_forward_argsort(
const ggml_compute_params * params,
ggml_tensor * dst) {
@@ -7900,6 +7939,10 @@ void ggml_compute_forward_argsort(
@@ -7857,6 +7896,10 @@ void ggml_compute_forward_argsort(
{
ggml_compute_forward_argsort_f32(params, dst);
} break;
@@ -73,7 +73,7 @@ index b52f0f847..902fdad69 100644
{
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 6e7b90d42..08dd30525 100644
index da9652c3b..b82be371c 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
@@ -220,11 +220,11 @@ index 6e7b90d42..08dd30525 100644
+ }
}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index e621cb981..597c0c8b3 100644
index 7697c292d..00d773dd3 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}
+
@@ -234,10 +234,10 @@ index e621cb981..597c0c8b3 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 12d5bf776..a0e34030e 100644
index c4ceb4fc5..0e53ecc39 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -251,6 +251,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
@@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
@@ -281,73 +281,76 @@ index 12d5bf776..a0e34030e 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -332,6 +369,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
@@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+ // TODO consider converting to template
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
if (can_be_transposed) {
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 2c2f01415..50b8071de 100644
index 73b45c762..aed013a9d 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4467,8 +4467,72 @@ kernel void kernel_argsort_f32_i32(
@@ -4721,8 +4721,77 @@ kernel void kernel_argsort_f32_i32(
}
}
+typedef void (i32_argsort_t)(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device const int32_t * src0,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]);
+ threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ ushort3 tpitg[[thread_position_in_threadgroup]],
+ ushort3 ntg[[threads_per_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_i32_i32(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device const int32_t * src0,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
+ threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ ushort3 tpitg[[thread_position_in_threadgroup]],
+ ushort3 ntg[[threads_per_threadgroup]]) {
+ // bitonic sort
+ int col = tpitg[0];
+ int row = tgpig[1];
+ const int col = tpitg[0];
+
+ if (col >= args.ncols_pad) return;
+ const int i00 = (tgpig[0]/args.ne01)*ntg.x;
+ const int i01 = tgpig[0]%args.ne01;
+ const int i02 = tgpig[1];
+ const int i03 = tgpig[2];
+
+ device const int32_t * x_row = x + row * args.ncols;
+ threadgroup int32_t * dst_row = shared_values;
+ device const int32_t * src0_row = (device const int32_t *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03);
+
+ // initialize indices
+ dst_row[col] = col;
+ shmem_i32[col] = i00 + col;
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
+ for (int k = 2; k <= ntg.x; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (dst_row[col] >= args.ncols ||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ if (shmem_i32[col] >= args.ne00 ||
+ (shmem_i32[ixj] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
+ src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]] :
+ src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ SWAP(shmem_i32[col], shmem_i32[ixj]);
+ }
+ } else {
+ if (dst_row[ixj] >= args.ncols ||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ if (shmem_i32[ixj] >= args.ne00 ||
+ (shmem_i32[col] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
+ src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]] :
+ src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ SWAP(shmem_i32[col], shmem_i32[ixj]);
+ }
+ }
+ }
@@ -356,8 +359,10 @@ index 2c2f01415..50b8071de 100644
+ }
+
+ // copy the result to dst without the padding
+ if (col < args.ncols) {
+ dst[row * args.ncols + col] = dst_row[col];
+ if (i00 + col < args.ne00) {
+ dst += i00 + args.ne00*i01 + args.ne00*args.ne01*i02 + args.ne00*args.ne01*args.ne02*i03;
+
+ dst[col] = shmem_i32[col];
+ }
+}
+
@@ -366,5 +371,5 @@ index 2c2f01415..50b8071de 100644
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
kernel void kernel_leaky_relu_f32(
constant ggml_metal_kargs_leaky_relu & args,
typedef void (argsort_merge_t)(
constant ggml_metal_kargs_argsort_merge & args,
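All three backends implement the same padded bitonic argsort; the following plain C++ reference mirrors the kernels' compare-exchange network sequentially (ascending order; padding slots with out-of-range indices always lose, so they sink past the end and are dropped):

#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<int32_t> argsort_i32_asc(const std::vector<int32_t> & x) {
    const int n = (int) x.size();
    int n_pad = 1;
    while (n_pad < n) n_pad *= 2; // pad to the next power of two

    std::vector<int32_t> idx(n_pad);
    for (int i = 0; i < n_pad; ++i) idx[i] = i;

    for (int k = 2; k <= n_pad; k *= 2) {        // bitonic sequence size
        for (int j = k / 2; j > 0; j /= 2) {     // compare distance
            for (int col = 0; col < n_pad; ++col) {
                const int ixj = col ^ j;         // compare-exchange partner
                if (ixj <= col) continue;        // each pair handled once
                const bool asc = (col & k) == 0; // direction within this block
                const bool a_bad = idx[col] >= n; // padding slot?
                const bool b_bad = idx[ixj] >= n;
                const bool swap = asc
                    ? (a_bad || (!b_bad && x[idx[col]] > x[idx[ixj]]))
                    : (b_bad || (!a_bad && x[idx[col]] < x[idx[ixj]]));
                if (swap) std::swap(idx[col], idx[ixj]);
            }
        }
    }
    idx.resize(n); // drop the padding
    return idx;
}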


@@ -35,10 +35,10 @@ index f1b740785..c54ff98bf 100644
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index c830c0965..363853873 100644
index 218222ece..06ee502ab 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -486,6 +486,7 @@ struct node_alloc {
@@ -493,6 +493,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
@@ -46,7 +46,7 @@ index c830c0965..363853873 100644
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -509,6 +510,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
@@ -516,6 +517,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
@@ -56,7 +56,7 @@ index c830c0965..363853873 100644
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -576,6 +580,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
@@ -583,6 +587,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
@@ -64,7 +64,7 @@ index c830c0965..363853873 100644
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -891,6 +896,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
@@ -898,6 +903,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
@@ -73,8 +73,8 @@ index c830c0965..363853873 100644
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -920,14 +927,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
@@ -932,14 +939,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
#endif
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
- if (galloc->buffers[i] == NULL) {
@@ -96,7 +96,7 @@ index c830c0965..363853873 100644
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -1082,6 +1094,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
@@ -1094,6 +1106,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
@@ -120,10 +120,10 @@ index c830c0965..363853873 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8ba86f824..cb2b99562 100644
index 4882541c8..ff41c7712 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
@@ -1813,6 +1813,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}


@@ -1,10 +1,8 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 24 Apr 2025 14:48:51 -0700
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 30 Nov 2025 11:05:56 -0800
Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
@@ -24,10 +22,10 @@ index c54ff98bf..229bf387b 100644
size_t memory_total;
// device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aefc6935e..cc201afff 100644
index 8f3b1c173..e803f4af6 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -185,6 +185,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP)
@@ -79,7 +77,7 @@ index aefc6935e..cc201afff 100644
static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {};
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -251,22 +296,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
@@ -110,7 +108,7 @@ index aefc6935e..cc201afff 100644
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -3268,6 +3315,7 @@ struct ggml_backend_cuda_device_context {
@@ -4048,6 +4095,7 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
@@ -118,9 +116,9 @@ index aefc6935e..cc201afff 100644
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3280,6 +3328,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
@@ -4136,6 +4184,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
}
#endif // defined(__linux__)
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
@@ -130,7 +128,7 @@ index aefc6935e..cc201afff 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -3296,6 +3349,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -4176,6 +4229,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -138,7 +136,7 @@ index aefc6935e..cc201afff 100644
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3869,6 +3923,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4767,6 +4821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
@@ -147,10 +145,10 @@ index aefc6935e..cc201afff 100644
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index bf0962274..f2ff9f322 100644
index f2b7fe692..8fc1c2fb5 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
@@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
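For reference, one plausible way to build the UUID string on the CUDA side (a sketch assuming CUDA >= 10, which exposes cudaDeviceProp::uuid; not necessarily the patch's exact formatting): print the 16 raw bytes in the GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx form that nvidia-smi reports.

#include <cstdio>
#include <string>
#include <cuda_runtime.h>

static std::string cuda_device_uuid(int device) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
        return "";
    }
    const unsigned char * b = (const unsigned char *) prop.uuid.bytes;
    char buf[48];
    snprintf(buf, sizeof(buf),
             "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
             b[0], b[1], b[2],  b[3],  b[4],  b[5],  b[6],  b[7],
             b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]);
    return buf;
}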


@@ -10,10 +10,10 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 4d487581a..35a0d25ed 100644
index dfad9cd79..9858de630 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
MTMD_SLICE_TMPL_IDEFICS3,
};
@@ -31,7 +31,7 @@ index 4d487581a..35a0d25ed 100644
return "<__media__>";
}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3a..cf287224b 100644
index 015119be8..8d3fa5d34 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;


@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 4b2f8b7bd..046646282 100644
index 5be08d6f4..7a0df30c3 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2441,7 +2441,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
@@ -2463,7 +2463,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4
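The surrounding code is Windows' EcoQoS ("power throttling") opt-out, and the one-line change widens the compiler guard so GCC/MinGW builds take the same path as MSVC. A minimal sketch of the opt-out itself (assuming a Windows SDK recent enough to define THREAD_POWER_THROTTLING_STATE):

#if defined(_WIN32)
#include <windows.h>

static bool disable_thread_power_throttling(void) {
    THREAD_POWER_THROTTLING_STATE state = {};
    state.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
    state.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
    state.StateMask   = 0; // 0 = explicitly opt out of throttling
    return SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling,
                                &state, sizeof(state)) != 0;
}
#endif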


@@ -58,7 +58,7 @@ index 6792ba986..0f5b03cef 100644
// (optional) event synchronization
// record an event on this stream
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index cb2b99562..41eef3b5f 100644
index ff41c7712..f511e8d76 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
@@ -97,7 +97,7 @@ index cb2b99562..41eef3b5f 100644
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
@@ -1556,7 +1558,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
if (!sched->callback_eval) {
@@ -106,7 +106,7 @@ index cb2b99562..41eef3b5f 100644
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
@@ -1578,7 +1580,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
@@ -115,7 +115,7 @@ index cb2b99562..41eef3b5f 100644
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1657,6 +1659,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
@@ -123,7 +123,7 @@ index cb2b99562..41eef3b5f 100644
ggml_backend_sched_reset(sched);
@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1688,6 +1691,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched);
}
@@ -178,10 +178,10 @@ index 3191faaa4..32f14c811 100644
static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cc201afff..02d413467 100644
index e803f4af6..78fb2d8b3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2693,7 +2693,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -2885,7 +2885,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
@@ -190,7 +190,7 @@ index cc201afff..02d413467 100644
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
@@ -2726,24 +2726,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
@@ -2918,24 +2918,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
#endif
}
@@ -241,7 +241,7 @@ index cc201afff..02d413467 100644
}
if (!use_cuda_graph) {
@@ -3128,7 +3138,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3679,7 +3689,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
@@ -250,7 +250,7 @@ index cc201afff..02d413467 100644
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
@@ -3166,7 +3176,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3717,7 +3727,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
@@ -260,10 +260,10 @@ index cc201afff..02d413467 100644
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index f2ff9f322..05ff6a5a6 100644
index 8fc1c2fb5..ba95b4acc 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
@@ -419,10 +419,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
GGML_UNUSED(dst);
}
@@ -278,10 +278,10 @@ index f2ff9f322..05ff6a5a6 100644
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 216dc167c..3a6bbe564 100644
index 83cdec29e..a36c6560c 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12357,7 +12357,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
@@ -13103,7 +13103,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
@@ -290,7 +290,7 @@ index 216dc167c..3a6bbe564 100644
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -12561,6 +12561,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
@@ -13320,6 +13320,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
return GGML_STATUS_SUCCESS;
UNUSED(backend);


@@ -12,8 +12,8 @@ must be recreated with no-alloc set to false before loading data.
ggml/src/ggml-backend-impl.h | 16 +++
ggml/src/ggml-backend.cpp | 72 ++++++++++-
ggml/src/ggml-cuda/common.cuh | 58 ++++++++-
ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------
5 files changed, 320 insertions(+), 44 deletions(-)
ggml/src/ggml-cuda/ggml-cuda.cu | 218 ++++++++++++++++++++++++++------
5 files changed, 321 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2763f2bd6..b3b5b356a 100644
@@ -75,7 +75,7 @@ index 0f5b03cef..7bdf9d81f 100644
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 41eef3b5f..c81a2e48a 100644
index f511e8d76..74b7f070c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@@ -134,7 +134,7 @@ index 41eef3b5f..c81a2e48a 100644
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1614,6 +1640,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@@ -152,7 +152,7 @@ index 41eef3b5f..c81a2e48a 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1655,11 +1692,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@@ -167,7 +167,7 @@ index 41eef3b5f..c81a2e48a 100644
ggml_backend_sched_reset(sched);
@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1674,6 +1714,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@@ -178,7 +178,7 @@ index 41eef3b5f..c81a2e48a 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
@@ -1719,6 +1763,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
@@ -203,7 +203,7 @@ index 41eef3b5f..c81a2e48a 100644
ggml_backend_sched_reset(sched);
return true;
@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
@@ -1824,7 +1886,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -219,10 +219,10 @@ index 41eef3b5f..c81a2e48a 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 41ff89c4d..2931c15ca 100644
index 611341deb..c3f8ca914 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,41 @@
@@ -37,6 +37,41 @@
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)
@@ -264,7 +264,7 @@ index 41ff89c4d..2931c15ca 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -856,6 +891,9 @@ struct ggml_cuda_pool {
@@ -891,6 +926,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
@@ -274,46 +274,48 @@ index 41ff89c4d..2931c15ca 100644
};
template<typename T>
@@ -992,11 +1030,11 @@ struct ggml_backend_cuda_context {
@@ -1179,11 +1217,11 @@ struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, bool alloc);
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no, bool alloc);
ggml_cuda_pool & pool(int device) {
if (pools[device] == nullptr) {
- pools[device] = new_pool_for_device(device);
+ pools[device] = new_pool_for_device(device, true);
if (pools[device][curr_stream_no] == nullptr) {
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
}
return *pools[device];
return *pools[device][curr_stream_no];
}
@@ -1004,4 +1042,20 @@ struct ggml_backend_cuda_context {
@@ -1191,6 +1229,22 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
+
+ void pool_set_alloc(bool alloc) {
+ GGML_ASSERT(pools[device] == nullptr || pools[device]->alloc_memory() == alloc);
+ GGML_ASSERT(pools[device][curr_stream_no] == nullptr || pools[device][curr_stream_no]->alloc_memory() == alloc);
+
+ if (pools[device] == nullptr) {
+ pools[device] = new_pool_for_device(device, alloc);
+ if (pools[device][curr_stream_no] == nullptr) {
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
+ }
+ }
+
+ size_t pool_get_alloc_size() {
+ if (pools[device] == nullptr) {
+ if (pools[device][curr_stream_no] == nullptr) {
+ return 0;
+ }
+
+ return pools[device]->alloc_size();
+ return pools[device][curr_stream_no]->alloc_size();
+ }
};
struct ggml_cuda_mm_fusion_args_host {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 02d413467..f79e5d65c 100644
index 78fb2d8b3..fe0da71ca 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -359,6 +359,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
// #define DEBUG_CUDA_MALLOC
@@ -322,7 +324,7 @@ index 02d413467..f79e5d65c 100644
// buffer pool for cuda (legacy)
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
static const int MAX_BUFFERS = 256;
@@ -371,9 +373,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -373,9 +375,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
size_t pool_size = 0;
@@ -337,7 +339,7 @@ index 02d413467..f79e5d65c 100644
}
~ggml_cuda_pool_leg() {
@@ -381,7 +386,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -383,7 +388,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cuda_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
@@ -348,7 +350,7 @@ index 02d413467..f79e5d65c 100644
pool_size -= b.size;
}
}
@@ -429,8 +436,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -431,8 +438,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
@@ -366,7 +368,7 @@ index 02d413467..f79e5d65c 100644
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
@@ -450,10 +464,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
@@ -452,10 +466,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}
}
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
@@ -389,7 +391,7 @@ index 02d413467..f79e5d65c 100644
};
// pool with virtual memory
@@ -465,18 +489,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
@@ -467,18 +491,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
CUdeviceptr pool_addr = 0;
size_t pool_used = 0;
size_t pool_size = 0;
@@ -417,7 +419,7 @@ index 02d413467..f79e5d65c 100644
#if defined(GGML_USE_HIP)
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
@@ -503,35 +533,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
@@ -505,35 +535,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
@@ -493,7 +495,7 @@ index 02d413467..f79e5d65c 100644
// add to the pool
pool_size += reserve_size;
@@ -564,16 +608,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
@@ -566,17 +610,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
// all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
}
@@ -505,11 +507,14 @@ index 02d413467..f79e5d65c 100644
+ size_t alloc_size() override {
+ return pool_size + last_alloc;
+ }
+
};
#endif // defined(GGML_USE_VMM)
-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device, bool alloc) {
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device,
- [[maybe_unused]] int stream_no) {
+ [[maybe_unused]] int stream_no,
+ bool alloc) {
#if defined(GGML_USE_VMM)
if (ggml_cuda_info().devices[device].vmm) {
- return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
@@ -521,7 +526,7 @@ index 02d413467..f79e5d65c 100644
}
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
@@ -757,11 +809,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
@@ -760,11 +814,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
}
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -543,7 +548,7 @@ index 02d413467..f79e5d65c 100644
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0];
@@ -785,6 +846,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
@@ -788,6 +851,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
@@ -551,7 +556,7 @@ index 02d413467..f79e5d65c 100644
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2986,6 +3048,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -3258,6 +3322,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@@ -559,7 +564,7 @@ index 02d413467..f79e5d65c 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -3001,6 +3064,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3347,6 +3412,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
@@ -567,11 +572,10 @@ index 02d413467..f79e5d65c 100644
+ if (reserving_graph && node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+ continue;
+ }
+
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
@@ -3140,6 +3208,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
// start of fusion operations
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
@@ -3691,6 +3760,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -579,7 +583,7 @@ index 02d413467..f79e5d65c 100644
ggml_cuda_set_device(cuda_ctx->device);
@@ -3215,6 +3284,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3766,6 +3836,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}
@@ -645,16 +649,16 @@ index 02d413467..f79e5d65c 100644
+
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+ ctx->pools[ctx->device] = NULL;
+ ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
+}
+
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -3255,6 +3389,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
@@ -4035,6 +4170,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset,
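
Together these three hooks implement a dry-run protocol: reset the backend, reserve the graph with pools in no-alloc mode, read the would-be size back through buffer_size, then reset again and allocate for real. A compressed sketch of the pool half, with hypothetical names (the real pools are ggml_cuda_pool_leg and ggml_cuda_pool_vmm):

#include <cstddef>

// Measuring pool: with alloc_memory == false it only does bookkeeping, so
// alloc_size() reports what a real pass would need without touching VRAM.
struct sketch_pool {
    bool   alloc_memory;   // mirrors the new ggml_cuda_pool alloc flag
    size_t pool_size = 0;

    explicit sketch_pool(bool alloc) : alloc_memory(alloc) {}

    void * alloc(size_t size, size_t * actual_size) {
        *actual_size = size;
        pool_size   += size;
        if (!alloc_memory) {
            return nullptr;              // reserve pass: no cudaMalloc
        }
        return ::operator new(size);     // stand-in for a device allocation
    }

    size_t alloc_size() const { return pool_size; }
};

ggml_backend_cuda_graph_reserve evaluates the graph against such a pool, ggml_backend_cuda_buffer_size reads pool_get_alloc_size(), and ggml_backend_cuda_reset drops the pool so the next pass can allocate normally.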

View File

@@ -8,12 +8,12 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index bd348bcad..8b4a89d38 100644
index e04f0fc4f..1359c614b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -999,8 +999,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd_inp();
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;

View File

@@ -43,7 +43,7 @@ index 7bdf9d81f..21b35ac5c 100644
struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c81a2e48a..9b0a9b91f 100644
index 74b7f070c..8d2cc167f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -62,10 +62,10 @@ index c81a2e48a..9b0a9b91f 100644
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f79e5d65c..c9333689f 100644
index fe0da71ca..0787e443c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
@@ -109,6 +109,11 @@ int ggml_cuda_get_device() {
return id;
}
@@ -77,7 +77,7 @@ index f79e5d65c..c9333689f 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3499,7 +3504,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -4380,7 +4385,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
@@ -89,7 +89,7 @@ index f79e5d65c..c9333689f 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -3936,6 +3944,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -4835,6 +4843,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
@@ -101,7 +101,7 @@ index f79e5d65c..c9333689f 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3952,6 +3965,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -4851,6 +4864,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -110,7 +110,7 @@ index f79e5d65c..c9333689f 100644
// backend reg
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 890c10364..1f06be80e 100644
index b7d6edf7f..b987d7aeb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -45,6 +45,7 @@

View File

@@ -20,10 +20,10 @@ fix vulkan PCI ID and ID handling
ggml/src/ggml-cuda/vendors/hip.h | 3 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.cpp | 2 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 209 +++++++++--
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 169 ++++++++-
ggml/src/mem_hip.cpp | 529 +++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 209 +++++++++++
9 files changed, 1003 insertions(+), 30 deletions(-)
9 files changed, 976 insertions(+), 17 deletions(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp
@@ -45,7 +45,7 @@ index 69223c488..6510e0cba 100644
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f9a6587f1..03f359ae9 100644
index 6d493a4ff..ac8f38464 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -209,6 +209,8 @@ add_library(ggml-base
@@ -56,12 +56,12 @@ index f9a6587f1..03f359ae9 100644
+ mem_nvml.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)
set_target_properties(ggml-base PROPERTIES
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c9333689f..f1a20e7fe 100644
index 0787e443c..736d47c07 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
@@ -78,7 +78,7 @@ index c9333689f..f1a20e7fe 100644
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -316,6 +326,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -90,7 +90,7 @@ index c9333689f..f1a20e7fe 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
@@ -3468,6 +3483,11 @@ struct ggml_backend_cuda_device_context {
@@ -4249,6 +4264,11 @@ struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
std::string id;
@@ -102,7 +102,7 @@ index c9333689f..f1a20e7fe 100644
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3488,6 +3508,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -4345,6 +4365,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -129,9 +129,9 @@ index c9333689f..f1a20e7fe 100644
+ }
+#endif
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3496,6 +3538,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
// ref: https://github.com/ggml-org/llama.cpp/pull/17368
@@ -4377,6 +4419,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
@@ -139,7 +139,7 @@ index c9333689f..f1a20e7fe 100644
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
@@ -3509,6 +3552,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -4390,6 +4433,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
@@ -159,7 +159,7 @@ index c9333689f..f1a20e7fe 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -4075,6 +4131,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4974,6 +5030,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -167,7 +167,7 @@ index c9333689f..f1a20e7fe 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4090,6 +4147,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4989,6 +5046,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;
@@ -183,7 +183,7 @@ index c9333689f..f1a20e7fe 100644
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 1f06be80e..2f9ef2dc0 100644
index b987d7aeb..5ad5623ae 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -5,6 +5,8 @@
@@ -204,7 +204,7 @@ index 1f06be80e..2f9ef2dc0 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index e9201cdc6..44ae76d66 100644
index fe57d4c58..1c07e767a 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -677,6 +677,14 @@ static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
@@ -223,10 +223,10 @@ index e9201cdc6..44ae76d66 100644
}
#endif
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 05ff6a5a6..032dee76d 100644
index ba95b4acc..f6f8f7a10 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -537,6 +537,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
@@ -546,6 +546,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev);
}
@@ -234,7 +234,7 @@ index 05ff6a5a6..032dee76d 100644
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
@@ -545,6 +546,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
@@ -554,6 +555,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -243,18 +243,18 @@ index 05ff6a5a6..032dee76d 100644
/* .async = */ true,
/* .host_buffer = */ false,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3a6bbe564..ca02ea079 100644
index a36c6560c..a234eda2e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -229,6 +229,7 @@ class vk_memory_logger;
#endif
@@ -236,6 +236,7 @@ class vk_memory_logger;
class vk_perf_logger;
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
+static std::string ggml_vk_get_device_id(int device);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11813,6 +11814,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
@@ -12353,6 +12354,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
snprintf(description, description_size, "%s", props.deviceName.data());
}
@@ -284,7 +284,7 @@ index 3a6bbe564..ca02ea079 100644
// backend interface
#define UNUSED GGML_UNUSED
@@ -12761,31 +12785,102 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
@@ -13614,15 +13638,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
ggml_vk_get_device_description(dev_idx, description, description_size);
}
@@ -312,24 +312,23 @@ index 3a6bbe564..ca02ea079 100644
+ int driver_major;
+ int driver_minor;
+};
+
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
+ GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
+
+ vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
- vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
- vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
- vk::PhysicalDeviceMemoryProperties2 memprops = {};
- bool membudget_supported = vk_instance.device_supports_membudget[device];
+ vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
vk::PhysicalDeviceMemoryProperties2 memprops = {};
- const bool membudget_supported = vk_instance.device_supports_membudget[device];
+ const bool membudget_supported = vk_instance.device_supports_membudget[ctx->device];
const bool is_integrated_gpu = vkdev.getProperties().deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
+
+ vk::PhysicalDeviceProperties2 props2;
+ vkdev.getProperties2(&props2);
- if (membudget_supported) {
- memprops.pNext = &budgetprops;
+ if (!ctx->is_integrated_gpu)
+
+ if (!is_integrated_gpu)
+ {
+ // Use vendor specific management libraries for best VRAM reporting if available
+ switch (props2.properties.vendorID) {
@@ -356,55 +355,13 @@ index 3a6bbe564..ca02ea079 100644
+ }
+ break;
+ }
}
- vkdev.getMemoryProperties2(&memprops);
+ }
+ // else fallback to memory budget if supported
+
- for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
- const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
+ *total = 0;
+ *free = 0;
+ vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
+ vk::PhysicalDeviceMemoryProperties2 memprops2;
+ memprops2.pNext = &mem_budget_props;
+ vkdev.getMemoryProperties2(&memprops2);
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
+ } else if (ctx->is_integrated_gpu) {
+ // Include shared memory on iGPUs
+ *total += memprops2.memoryProperties.memoryHeaps[i].size;
+ }
+ }
+ for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
+ if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+ *free += mem_budget_props.heapBudget[i];
+ } else if (ctx->is_integrated_gpu) {
+ *free += mem_budget_props.heapBudget[i];
+ }
+ }
+ if (*total > 0 && *free > 0) {
+ return;
+ } else if (*total > 0) {
+ *free = *total;
+ return;
+ }
+ // else just report the physical memory
+ for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
-
- if (membudget_supported && i < budgetprops.heapUsage.size()) {
- *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
- } else {
- *free = heap.size;
- }
+ *free = heap.size;
break;
}
}
@@ -12818,8 +12913,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
if (membudget_supported) {
memprops.pNext = &budgetprops;
@@ -13674,8 +13755,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
}
}
@@ -419,7 +376,7 @@ index 3a6bbe564..ca02ea079 100644
}
vk::PhysicalDeviceProperties2 props = {};
@@ -12836,19 +12936,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
@@ -13692,19 +13778,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
@@ -453,7 +410,7 @@ index 3a6bbe564..ca02ea079 100644
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@@ -12860,9 +12965,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
@@ -13716,9 +13807,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
return ctx->description.c_str();
}
@@ -469,7 +426,7 @@ index 3a6bbe564..ca02ea079 100644
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -12886,8 +12996,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
@@ -13742,8 +13838,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
@@ -480,7 +437,7 @@ index 3a6bbe564..ca02ea079 100644
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
@@ -12895,6 +13006,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
@@ -13751,6 +13848,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
@@ -494,7 +451,7 @@ index 3a6bbe564..ca02ea079 100644
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13365,6 +13483,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -14319,6 +14423,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
@@ -503,7 +460,7 @@ index 3a6bbe564..ca02ea079 100644
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
@@ -13373,12 +13493,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -14327,12 +14433,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
ctx->name = GGML_VK_NAME + std::to_string(i);
ctx->description = desc;
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
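
The rewritten ggml_backend_vk_get_device_memory settles on a three-step order: vendor management library first (exact figures), VK_EXT_memory_budget second, raw device-local heap sizes last with free assumed equal to total. A condensed sketch of that chain, with the vendor hook stubbed out (names here are illustrative; the real lookups live in mem_nvml.cpp and mem_hip.cpp):

#include <cstddef>
#include <cstdint>

// Stand-in for the vendor libraries: returns true only when one of them
// produced exact free/total figures for this GPU.
static bool vendor_memory(uint32_t vendor_id, size_t * free_b, size_t * total_b) {
    (void) vendor_id; (void) free_b; (void) total_b;
    return false;
}

static void device_memory(uint32_t vendor_id, size_t heap_total,
                          size_t heap_budget, size_t * free_b, size_t * total_b) {
    if (vendor_memory(vendor_id, free_b, total_b)) {
        return;                            // 1) vendor library: exact
    }
    *total_b = heap_total;                 // sum of device-local heaps
    *free_b  = heap_budget != 0
                   ? heap_budget           // 2) VK_EXT_memory_budget
                   : heap_total;           // 3) fallback: report all of it free
}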

View File

@@ -6,108 +6,101 @@ Subject: [PATCH] interleave multi rope
Since ollama doesn't use mrope for anything else, change it to mean the
interleaved version used for qwen3vl
---
ggml/src/ggml-cpu/ops.cpp | 7 ++-----
ggml/src/ggml-cuda/rope.cu | 12 +++---------
ggml/src/ggml-metal/ggml-metal.metal | 10 +++-------
ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp | 12 +++---------
4 files changed, 11 insertions(+), 30 deletions(-)
ggml/src/ggml-cpu/ops.cpp | 8 ++++----
ggml/src/ggml-cuda/rope.cu | 8 ++++----
ggml/src/ggml-metal/ggml-metal.metal | 8 ++++----
ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl | 8 ++++----
4 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 902fdad69..70955347d 100644
index 40666bab6..3155cb4bb 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init(
}
@@ -5599,14 +5599,14 @@ static void ggml_mrope_cache_init(
float theta = theta_t;
- if (sector >= sections[0] && sector < sec_w) {
+ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) {
theta = theta_h;
}
- else if (sector >= sec_w && sector < sec_w + sections[2]) {
+ else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) {
theta = theta_w;
}
- else if (sector >= sec_w + sections[2]) {
- theta = theta_e;
- }
rope_yarn(
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
if (is_imrope) { // qwen3vl apply interleaved mrope
- if (sector % 3 == 1 && sector < 3 * sections[1]) {
+ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) {
theta = theta_h;
- } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
+ } else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) {
theta = theta_w;
} else if (sector % 3 == 0 && sector < 3 * sections[0]) {
theta = theta_t;
- } else {
- theta = theta_e;
+ // } else {
+ // theta = theta_e;
}
} else {
if (sector >= sections[0] && sector < sec_w) {
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index d058504cd..287fe9d2c 100644
index 88ed79111..71ca60214 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -151,19 +151,13 @@ static __global__ void rope_multi(
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
- float theta_base = 0.0;
- if (sector < sections.v[0]) {
- theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
- }
- else if (sector >= sections.v[0] && sector < sec_w) {
+ float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
+ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) {
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
- else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+ else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
- else if (sector >= sec_w + sections.v[2]) {
- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
- }
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -200,14 +200,14 @@ static __global__ void rope_multi(
float theta_base = 0.0;
if (is_imrope) {
- if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
+ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { // h
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
- } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
+ } else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { // w
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
- } else {
- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
+ // } else {
+ // theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
} else {
if (sector < sections.v[0]) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 50b8071de..65a3183c8 100644
index aed013a9d..a489de435 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3888,15 +3888,11 @@ kernel void kernel_rope_multi(
const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
const int sector = ic % sect_dims;
@@ -4009,14 +4009,14 @@ kernel void kernel_rope_multi(
- float theta_base;
- if (sector < args.sect_0) {
- theta_base = (float) pos[i2];
- } else if (sector < sec_w01) {
+ float theta_base = (float) pos[i2];
+ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) {
theta_base = (float) pos[i2 + args.ne02];
- } else if (sector < sec_w012) {
+ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) {
theta_base = (float) pos[i2 + args.ne02 * 2];
- } else {
- theta_base = (float) pos[i2 + args.ne02 * 3];
}
// end of mrope
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
index 111286b49..633dc20ff 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
@@ -31,19 +31,13 @@ void main() {
const int sec_w = p.sections[1] + p.sections[0];
const uint sector = (i0 / 2) % sect_dims;
- float theta_base = 0.0;
- if (sector < p.sections[0]) {
- theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
- }
- else if (sector >= p.sections[0] && sector < sec_w) {
+ float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
+ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
}
- else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
+ else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
}
- else if (sector >= sec_w + p.sections[2]) {
- theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
- }
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
float theta_base;
if (FC_rope_is_imrope) {
- if (sector % 3 == 1 && sector < 3 * args.sect_1) { // h
+ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { // h
theta_base = (float) pos[i2 + args.ne02 * 1];
- } else if (sector % 3 == 2 && sector < 3 * args.sect_2) { // w
+ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { // w
theta_base = (float) pos[i2 + args.ne02 * 2];
} else if (sector % 3 == 0 && sector < 3 * args.sect_0) { // t
theta_base = (float) pos[i2 + args.ne02 * 0];
- } else { // e
- theta_base = (float) pos[i2 + args.ne02 * 3];
+ // } else { // e
+ // theta_base = (float) pos[i2 + args.ne02 * 3];
}
} else {
if (sector < args.sect_0) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
index 9726b722d..1c8c69422 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
@@ -148,14 +148,14 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
float theta_base = 0.0;
if (p.is_imrope != 0) {
- if (sector % 3 == 1 && sector < 3 * p.sections[1]) {
+ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
- } else if (sector % 3 == 2 && sector < 3 * p.sections[2]) {
+ } else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
- } else {
- theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
+ //} else {
+ // theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
}
} else {
if (sector < p.sections[0]) {
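
The new predicates read oddly until the pattern is visible: pairs of rotary dimensions now cycle through the t, h, w position components instead of occupying three contiguous ranges, and any sector not claimed by h or w keeps the temporal theta (hence the commented-out else branches). A worked example for sections = {8, 4, 4} in {t, h, w} order:

#include <cstdio>

// Sector -> axis mapping of the interleaved mrope above; the default is the
// temporal axis, matching the kernels' initialized theta_t fallthrough.
static char imrope_axis(int sector, const int sections[3]) {
    if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) return 'h';
    if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) return 'w';
    return 't';
}

int main(void) {
    const int sections[3] = {8, 4, 4};
    for (int s = 0; s < 16; ++s) {
        putchar(imrope_axis(s, sections));   // prints: thwthwthwthwtttt
    }
    putchar('\n');
    return 0;
}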

View File

@@ -6,13 +6,13 @@ Subject: [PATCH] Add memory detection using DXGI + PDH
---
ggml/src/CMakeLists.txt | 1 +
ggml/src/ggml-impl.h | 3 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 29 ++-
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 26 ++-
ggml/src/mem_dxgi_pdh.cpp | 297 +++++++++++++++++++++++++++
4 files changed, 327 insertions(+), 3 deletions(-)
4 files changed, 325 insertions(+), 2 deletions(-)
create mode 100644 ggml/src/mem_dxgi_pdh.cpp
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 03f359ae9..4b3e5efb5 100644
index ac8f38464..faa1beed2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -211,6 +211,7 @@ add_library(ggml-base
@@ -22,9 +22,9 @@ index 03f359ae9..4b3e5efb5 100644
+ mem_dxgi_pdh.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)
set_target_properties(ggml-base PROPERTIES
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 44ae76d66..639d551a2 100644
index 1c07e767a..0da3e065b 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -684,6 +684,9 @@ GGML_API void ggml_nvml_release();
@@ -38,10 +38,10 @@ index 44ae76d66..639d551a2 100644
#ifdef __cplusplus
}
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ca02ea079..c12b069e5 100644
index a234eda2e..c98f98c73 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
@@ -74,6 +74,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16"
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000)
@@ -49,7 +49,7 @@ index ca02ea079..c12b069e5 100644
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
VkStructureType sType;
@@ -12802,6 +12803,7 @@ struct ggml_backend_vk_device_context {
@@ -13655,6 +13656,7 @@ struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
@@ -57,8 +57,8 @@ index ca02ea079..c12b069e5 100644
int major;
int minor;
int driver_major;
@@ -12817,8 +12819,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
@@ -13673,6 +13675,20 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
vk::PhysicalDeviceProperties2 props2;
vkdev.getProperties2(&props2);
+ GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str());
@@ -76,22 +76,17 @@ index ca02ea079..c12b069e5 100644
+ ggml_dxgi_pdh_release();
+ }
- if (!ctx->is_integrated_gpu)
+ if (!ctx->is_integrated_gpu)
if (!is_integrated_gpu)
{
// Use vendor specific management libraries for best VRAM reporting if available
switch (props2.properties.vendorID) {
@@ -12846,8 +12862,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
break;
}
@@ -13704,7 +13720,6 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
}
- // else fallback to memory budget if supported
// else fallback to memory budget if supported
+ // else fallback to memory budget if supported
*total = 0;
*free = 0;
vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
@@ -13500,7 +13516,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-
if (membudget_supported) {
memprops.pNext = &budgetprops;
}
@@ -14440,7 +14455,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
/* .reg = */ reg,
/* .context = */ ctx,
});
@@ -99,7 +94,7 @@ index ca02ea079..c12b069e5 100644
// Gather additional information about the device
int dev_idx = vk_instance.device_indices[i];
vk::PhysicalDeviceProperties props1;
@@ -13523,6 +13538,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
@@ -14463,6 +14477,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();

View File

@@ -10,10 +10,10 @@ fallback to cpu
1 file changed, 3 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f1a20e7fe..1a71e07c9 100644
index 736d47c07..7350f6758 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -4564,6 +4564,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}

View File

@@ -1,32 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 29 Oct 2025 03:53:04 -0500
Subject: [PATCH] vulkan: Call ggml_vk_buffer_write_2d from ggml_vk_buffer_copy
(#16793)
This lets the copy to the destination device use the host-visible
vidmem optimization.
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c12b069e5..76c78c2ea 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
ggml_vk_ensure_sync_staging_buffer(src->device, size);
- ggml_vk_ensure_sync_staging_buffer(dst->device, size);
// Copy to src staging buffer
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
- // memcpy to dst staging buffer
- memcpy(dst->device->sync_staging->ptr, src->device->sync_staging->ptr, size);
// Copy to dst buffer
- ggml_vk_buffer_copy(dst, dst_offset, dst->device->sync_staging, 0, size);
+ ggml_vk_buffer_write_2d(dst, dst_offset, src->device->sync_staging->ptr, 0, size, 1);
}
}

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] win: exit instead of abort
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be35c1be..923c33d05 100644
index b99345a2e..1c9e0bc05 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
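
The hunk body is clipped above; going only by the subject line and the diffstat (6 insertions, 1 deletion), one plausible shape of the change is sketched below. This is an assumption about the intent, not the actual patch:

#include <stdio.h>
#include <stdlib.h>

/* Assumed sketch: on Windows, leave via exit() so a failed assertion ends
 * the process without the abort()/crash-dialog machinery; elsewhere keep
 * abort() and its core-dump semantics. Helper name is hypothetical. */
static void ggml_abort_exit(void) {
    fflush(stderr);
#if defined(_WIN32)
    exit(1);
#else
    abort();
#endif
}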

View File

@@ -0,0 +1,25 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Tue, 11 Nov 2025 11:39:43 -0800
Subject: [PATCH] fix bakllava regression
Revert to the prior logic of assuming an empty projector type is mlp
---
tools/mtmd/clip.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f4c4d2c48..3334ff25b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2648,6 +2648,10 @@ struct clip_model_loader {
if (proj_type.empty()) {
if (modality == CLIP_MODALITY_VISION) {
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
+ if (proj_type.empty()) {
+ // Assume MLP if no projector type listed
+ proj_type = "mlp";
+ }
} else if (modality == CLIP_MODALITY_AUDIO) {
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
} else {

View File

@@ -1,657 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 29 Oct 2025 08:44:29 -0500
Subject: [PATCH] vulkan: Update topk_moe fusion to handle gpt's late softmax
(#16656)
* vulkan: Update topk_moe fusion to handle gpt's late softmax
Based on #16649.
* Add ggml_check_edges
* Add sync logging to show fusion effects
* handle clamp added in #16655
* Update ggml/src/ggml-impl.h
Co-authored-by: Diego Devesa <slarengh@gmail.com>
---
ggml/src/ggml-impl.h | 16 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 304 +++++++++++-------
.../ggml-vulkan/vulkan-shaders/topk_moe.comp | 90 ++++--
3 files changed, 272 insertions(+), 138 deletions(-)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 639d551a2..e5c446d1d 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -693,6 +693,7 @@ GGML_API void ggml_dxgi_pdh_release();
#endif
#ifdef __cplusplus
+#include <array>
#include <initializer_list>
#include <vector>
@@ -708,6 +709,21 @@ inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
}
+// Return true if the edges in the graph match expectations.
+inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
+ int start_idx,
+ std::initializer_list<std::array<int, 3>> edges) {
+ for (const auto & edge : edges) {
+ int dst_node = edge[0];
+ int src_idx = edge[1];
+ int src_node = edge[2];
+ if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
+ return false;
+ }
+ }
+ return true;
+}
+
// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
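
For reference, each edge triple decodes as {dst_node, src_slot, src_node}, all relative to start_idx. A minimal usage sketch of the helper (function and pattern names hypothetical):

#include <array>
#include <initializer_list>

// Requires ggml_check_edges from the hunk above. The single edge encodes
// "nodes[start_idx + 1]->src[0] must be nodes[start_idx + 0]", i.e. node 1
// consumes node 0 through its first source slot.
static bool pattern_matches(const struct ggml_cgraph * cgraph, int start_idx) {
    return ggml_check_edges(cgraph, start_idx, {
        { 1, 0, 0 },
    });
}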
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 7669ed206..63a762ec2 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;
static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
static constexpr uint32_t num_topk_moe_pipelines = 10;
-static constexpr std::array topk_moe_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
- GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
- GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };
-static constexpr std::array topk_moe { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
- GGML_OP_VIEW, GGML_OP_GET_ROWS };
+static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
+ GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
+ GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
+ GGML_OP_RESHAPE };
+static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
+ GGML_OP_VIEW, GGML_OP_GET_ROWS };
+static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW,
+ GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
+ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
+
+//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ]
+//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
+//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
+//node #981 ( VIEW): ffn_moe_topk-15 ( 0K) [Vulka ] use=4: ffn_moe_argsort-15 ( 0K) [Vulka ]
+//node #982 ( GET_ROWS): ffn_moe_weights-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 (re ( 0K) [Vulka ] ffn_moe_topk-15 ( 0K) [Vulka ]
+//node #983 ( RESHAPE): ffn_moe_weights-15 ( ( 0K) [Vulka ] use=2: ffn_moe_weights-15 ( 0K) [Vulka ]
+//node #984 ( SUM_ROWS): ffn_moe_weights_sum- ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ]
+//node #985 ( CLAMP): ffn_moe_weights_sum_ ( 0K) [Vulka ] use=1: ffn_moe_weights_sum- ( 0K) [Vulka ]
+//node #986 ( DIV): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] ffn_moe_weights_sum_ ( 0K) [Vulka ]
+//node #987 ( RESHAPE): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights_norm ( 0K) [Vulka ]
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_norm_edges {
+ { 1, 0, 0 }, // reshape->src[0] == softmax
+ { 2, 0, 0 }, // argsort->src[0] == softmax
+ { 3, 0, 2 }, // view->src[0] == argsort
+ { 4, 0, 1 }, // get_rows->src[0] == reshape
+ { 4, 1, 3 }, // get_rows->src[1] == view
+ { 5, 0, 4 }, // reshape->src[0] == get_rows
+ { 6, 0, 5 }, // sum_rows->src[0] == reshape
+ { 7, 0, 6 }, // clamp->src[0] == sum_rows
+ { 8, 0, 5 }, // div->src[0] == reshape
+ { 8, 1, 7 }, // div->src[1] == clamp
+ { 9, 0, 8 }, // reshape->src[0] == div
+};
+
+// same as early_softmax_norm but ending after the get_rows
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_early_softmax_edges {
+ { 1, 0, 0 }, // reshape->src[0] == softmax
+ { 2, 0, 0 }, // argsort->src[0] == softmax
+ { 3, 0, 2 }, // view->src[0] == argsort
+ { 4, 0, 1 }, // get_rows->src[0] == reshape
+ { 4, 1, 3 }, // get_rows->src[1] == view
+};
+//node #652 ( ARGSORT): ffn_moe_argsort-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 ( 0K) [Vulka ]
+//node #653 ( VIEW): ffn_moe_topk-11 ( 0K) [Vulka ] use=7: ffn_moe_argsort-11 ( 0K) [Vulka ]
+//node #654 ( GET_ROWS): ffn_moe_weights-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 (re ( 0K) [Vulka ] ffn_moe_topk-11 ( 0K) [Vulka ]
+//node #655 ( RESHAPE): ffn_moe_weights-11 ( ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( 0K) [Vulka ]
+//node #656 ( SOFT_MAX): node_656 ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( ( 0K) [Vulka ]
+//node #657 ( RESHAPE): ffn_moe_weights_soft ( 0K) [Vulka ] use=1: node_656 ( 0K) [Vulka ]
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_late_softmax_edges {
+ { 1, 0, 0 }, // view->src[0] == argsort
+ { 2, 1, 1 }, // get_rows->src[1] == view
+ { 3, 0, 2 }, // reshape->src[0] == get_rows
+ { 4, 0, 3 }, // soft_max->src[0] == reshape
+ { 5, 0, 4 }, // reshape->src[0] == soft_max
+};
+
+enum topk_moe_mode {
+ TOPK_MOE_EARLY_SOFTMAX,
+ TOPK_MOE_EARLY_SOFTMAX_NORM,
+ TOPK_MOE_LATE_SOFTMAX,
+ TOPK_MOE_COUNT,
+};
+
+static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
+ topk_moe_mode mode = num == topk_moe_early_softmax_norm.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX_NORM :
+ num == topk_moe_early_softmax.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX :
+ TOPK_MOE_LATE_SOFTMAX;
+ return mode;
+}
struct vk_device_struct {
std::recursive_mutex mutex;
@@ -607,8 +671,7 @@ struct vk_device_struct {
vk_pipeline pipeline_flash_attn_split_k_reduce;
- // [2] is {!norm, norm}
- vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
+ vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
std::vector<vk_pipeline_ref> all_pipelines;
@@ -956,6 +1019,8 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
struct vk_op_topk_moe_push_constants {
uint32_t n_rows;
uint32_t n_expert_used;
+ float clamp_min;
+ float clamp_max;
};
struct vk_op_add_id_push_constants {
@@ -3806,8 +3871,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][0], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0}, 1, true, true);
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][1], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1}, 1, true, true);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true);
}
for (auto &c : compiles) {
@@ -8085,8 +8151,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
if (ctx->num_additional_fused_ops) {
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
GGML_ASSERT(idx < num_topk_moe_pipelines);
- bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
- return ctx->device->pipeline_topk_moe[idx][with_norm];
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
+ return ctx->device->pipeline_topk_moe[idx][mode];
}
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -8141,6 +8207,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return nullptr;
}
case GGML_OP_ARGSORT:
+ if (ctx->num_additional_fused_ops) {
+ uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
+ GGML_ASSERT(idx < num_topk_moe_pipelines);
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
+ return ctx->device->pipeline_topk_moe[idx][mode];
+ }
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
return ctx->device->pipeline_argsort_f32[idx];
@@ -9676,10 +9749,12 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
- bool with_norm = ctx->num_additional_fused_ops == topk_moe_norm.size() - 1;
+ topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
- ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
- ggml_tensor * ids = cgraph->nodes[node_idx + 3];
+ ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
+ (mode == TOPK_MOE_EARLY_SOFTMAX) ? cgraph->nodes[node_idx + 4] :
+ cgraph->nodes[node_idx + 5];
+ ggml_tensor * ids = (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3];
GGML_ASSERT(logits->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
@@ -9738,9 +9813,14 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
GGML_ASSERT(d_ids != nullptr);
}
- vk_op_topk_moe_push_constants pc;
+ vk_op_topk_moe_push_constants pc {};
pc.n_rows = n_rows;
pc.n_expert_used = n_expert_used;
+ if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
+ ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
+ pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
+ pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
+ }
GGML_ASSERT(n_expert_used <= n_experts);
@@ -11335,7 +11415,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}
}
+
+#define ENABLE_SYNC_LOGGING 0
+
if (need_sync) {
+#if ENABLE_SYNC_LOGGING
+ std::cerr << "sync" << std::endl;
+#endif
ctx->unsynced_nodes_written.clear();
ctx->unsynced_nodes_read.clear();
ggml_vk_sync_buffers(ctx, compute_ctx);
@@ -11353,6 +11439,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
}
}
}
+#if ENABLE_SYNC_LOGGING
+ if (!dryrun) {
+ for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+ auto *n = cgraph->nodes[node_idx + i];
+ std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
+ if (n->op == GGML_OP_GLU) {
+ std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
+ }
+ std::cerr << std::endl;
+ }
+ }
+#endif
switch (node->op) {
case GGML_OP_REPEAT:
@@ -11531,7 +11629,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_ARGSORT:
- ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+ if (ctx->num_additional_fused_ops) {
+ ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun);
+ } else {
+ ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+ }
break;
case GGML_OP_SUM:
@@ -12329,30 +12431,27 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st
}
static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
- int node_idx, bool with_norm) {
+ int node_idx, topk_moe_mode mode) {
- if (with_norm) {
- if (node_idx + (int)topk_moe_norm.size() > cgraph->n_nodes) {
- return false;
- }
- for (size_t i = 0; i < topk_moe_norm.size(); ++i) {
- if (cgraph->nodes[node_idx + i]->op != topk_moe_norm[i]) {
- return false;
- }
- }
- } else {
- if (node_idx + (int)topk_moe.size() > cgraph->n_nodes) {
- return false;
- }
- for (size_t i = 0; i < topk_moe.size(); ++i) {
- if (cgraph->nodes[node_idx + i]->op != topk_moe[i]) {
- return false;
- }
- }
- }
+ const ggml_tensor * softmax;
+ const ggml_tensor * weights;
- const ggml_tensor * softmax = cgraph->nodes[node_idx + 0];
- const ggml_tensor * weights = with_norm ? cgraph->nodes[node_idx + 8] : cgraph->nodes[node_idx + 4];
+ switch (mode) {
+ case TOPK_MOE_EARLY_SOFTMAX_NORM:
+ softmax = cgraph->nodes[node_idx + 0];
+ weights = cgraph->nodes[node_idx + 9];
+ break;
+ case TOPK_MOE_EARLY_SOFTMAX:
+ softmax = cgraph->nodes[node_idx + 0];
+ weights = cgraph->nodes[node_idx + 4];
+ break;
+ case TOPK_MOE_LATE_SOFTMAX:
+ softmax = cgraph->nodes[node_idx + 4];
+ weights = cgraph->nodes[node_idx + 5];
+ break;
+ default:
+ return false;
+ }
const float * op_params = (const float *)softmax->op_params;
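The offsets above encode three fixed node layouts; spelled out (a sketch consistent with the node offsets in these hunks; the authoritative sequences are the topk_moe_* pattern lists this patch adds):

// topk_moe_early_softmax_norm: 0 SOFT_MAX, 1 RESHAPE, 2 ARGSORT, 3 VIEW (ids),
//                              4 GET_ROWS, 5 RESHAPE, 6 SUM_ROWS, 7 CLAMP,
//                              8 DIV, 9 RESHAPE (weights)
// topk_moe_early_softmax:      0 SOFT_MAX, 1 RESHAPE, 2 ARGSORT, 3 VIEW (ids),
//                              4 GET_ROWS (weights)
// topk_moe_late_softmax:       0 ARGSORT, 1 VIEW (ids), 2 GET_ROWS,
//                              3 RESHAPE, 4 SOFT_MAX, 5 RESHAPE (weights)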
@@ -12378,60 +12477,6 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
return false;
}
- // Check that the nodes don't have any unexpected uses
- const ggml_tensor * reshape1 = cgraph->nodes[node_idx + 1];
- const ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
- const ggml_tensor * view = cgraph->nodes[node_idx + 3];
- const ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
- const ggml_tensor * reshape5 = with_norm ? cgraph->nodes[node_idx + 5] : nullptr;
- const ggml_tensor * sum_rows = with_norm ? cgraph->nodes[node_idx + 6] : nullptr;
- const ggml_tensor * div = with_norm ? cgraph->nodes[node_idx + 7] : nullptr;
- const ggml_tensor * reshape8 = with_norm ? cgraph->nodes[node_idx + 8] : nullptr;
-
- // softmax is used by reshape and argsort
- if (ggml_node_get_use_count(cgraph, node_idx) != 2 ||
- reshape1->src[0] != softmax ||
- argsort->src[0] != softmax) {
- return false;
- }
- // reshape is used by get_rows
- if (ggml_node_get_use_count(cgraph, node_idx + 1) != 1 ||
- get_rows->src[0] != reshape1) {
- return false;
- }
- // argsort is used by view
- if (ggml_node_get_use_count(cgraph, node_idx + 2) != 1 ||
- view->src[0] != argsort) {
- return false;
- }
- // view is written (via argsort), we can skip checking it
-
- if (with_norm) {
- // get_rows is used by reshape
- if (ggml_node_get_use_count(cgraph, node_idx + 4) != 1 ||
- reshape5->src[0] != get_rows) {
- return false;
- }
-
- // reshape is used by sum_rows and div
- if (ggml_node_get_use_count(cgraph, node_idx + 5) != 2 ||
- sum_rows->src[0] != reshape5 ||
- div->src[0] != reshape5) {
- return false;
- }
-
- // sum_rows is used by div
- if (ggml_node_get_use_count(cgraph, node_idx + 6) != 1 ||
- div->src[1] != sum_rows) {
- return false;
- }
-
- // div/reshape are written
- if (reshape8->src[0] != div) {
- return false;
- }
- }
-
if (!ctx->device->subgroup_arithmetic ||
!ctx->device->subgroup_shuffle ||
!ctx->device->subgroup_require_full_support ||
@@ -12517,10 +12562,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->num_additional_fused_ops = num_adds - 1;
} else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
- ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
- ctx->num_additional_fused_ops = topk_moe.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
}
}
ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
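A hedged reading of the new guards, assuming the upstream helpers' semantics: ggml_can_fuse_subgraph() requires the ops starting at node i to match the given pattern and allows only the listed node indices to be consumed outside the subgraph, while ggml_check_edges() verifies the internal src wiring that the removed hand-rolled use-count checks enforced. For the norm pattern:

ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 })
// only the ids VIEW (i + 3) and the final weights RESHAPE (i + 9) may have
// readers outside the fused subgraph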
@@ -12618,10 +12671,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->num_additional_fused_ops = num_adds - 1;
} else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, true)) {
- ctx->num_additional_fused_ops = topk_moe_norm.size() - 1;
- } else if (ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, false)) {
- ctx->num_additional_fused_ops = topk_moe.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
+ ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
}
}
@@ -12754,25 +12815,44 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
while (first_unused < graph->n_nodes) {
std::vector<int> current_set;
- // Avoid reordering topk_moe_norm
- if (first_unused + (int)topk_moe_norm.size() <= graph->n_nodes) {
- bool is_topk_moe_norm = true;
- for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
- if (graph->nodes[first_unused + j]->op != topk_moe_norm[j] || used[first_unused + j]) {
- is_topk_moe_norm = false;
+ // Check for fusion patterns and avoid reordering them
+ auto const &match_pattern = [&](const std::initializer_list<ggml_op> &pattern, int start) -> bool {
+ if (start + (int)pattern.size() <= graph->n_nodes) {
+ bool is_pattern = true;
+ for (size_t j = 0; j < pattern.size(); ++j) {
+ if (graph->nodes[start + j]->op != pattern.begin()[j] || used[start + j]) {
+ is_pattern = false;
+ }
}
+ return is_pattern;
}
- if (is_topk_moe_norm) {
- for (size_t j = 0; j < topk_moe_norm.size(); ++j) {
+ return false;
+ };
+
+ auto const &keep_pattern = [&](const std::initializer_list<ggml_op> &pattern) -> bool {
+ if (match_pattern(pattern, first_unused)) {
+ for (size_t j = 0; j < pattern.size(); ++j) {
new_order.push_back(graph->nodes[first_unused + j]);
used[first_unused + j] = true;
}
while (first_unused < graph->n_nodes && used[first_unused]) {
first_unused++;
}
- continue;
+ return true;
}
+ return false;
+ };
+
+ if (keep_pattern(topk_moe_early_softmax_norm)) {
+ continue;
+ }
+ if (keep_pattern(topk_moe_early_softmax)) {
+ continue;
}
+ if (keep_pattern(topk_moe_late_softmax)) {
+ continue;
+ }
+
// First, grab the next unused node.
current_set.push_back(first_unused);
@@ -12790,6 +12870,12 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
if (is_empty(graph->nodes[j])) {
continue;
}
+ // Don't pull forward nodes from fusion patterns
+ if (match_pattern(topk_moe_early_softmax_norm, j) ||
+ match_pattern(topk_moe_early_softmax, j) ||
+ match_pattern(topk_moe_late_softmax, j)) {
+ continue;
+ }
bool ok = true;
for (int c = first_unused; c < j; ++c) {
if (!used[c] &&
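The graph_optimize changes keep fusible runs contiguous; as a worked example, given nodes [X, SOFT_MAX, RESHAPE, ARGSORT, VIEW, GET_ROWS, Y], keep_pattern(topk_moe_early_softmax) emits the five pattern nodes into new_order as one block once first_unused reaches the SOFT_MAX, and the match_pattern() check added to the inner loop above prevents Y or a later pattern head from being hoisted across them, so graph_compute still sees the pattern intact when it probes for fusion.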
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
index 9e56d5f8a..bc1c278bf 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
@@ -11,6 +11,8 @@ layout (push_constant) uniform parameter
{
uint n_rows;
uint n_expert_used;
+ float clamp_min;
+ float clamp_max;
};
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
@@ -18,6 +20,7 @@ layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
layout(constant_id = 0) const uint WARP_SIZE = 32;
layout(constant_id = 1) const uint n_experts = 512;
layout(constant_id = 2) const bool with_norm = true;
+layout(constant_id = 3) const bool late_softmax = false;
const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
@@ -25,53 +28,72 @@ layout (binding = 0, std430) readonly buffer Logits {float logits[];};
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};
-void main() {
- const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
- if (row >= n_rows) {
- return;
- }
+const float INFINITY = 1.0 / 0.0;
- const uint logits_offset = n_experts * row;
- const uint weights_offset = n_expert_used * row;
- const uint ids_offset = n_experts * row;
-
- float logits_r[experts_per_thread];
-
- const float INFINITY = 1.0 / 0.0;
+// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
+void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit, const uint lane, const bool use_limit) {
+ float max_val = -INFINITY;
[[unroll]]
- for (uint i = 0; i < n_experts; i += WARP_SIZE) {
- const uint expert = i + gl_LocalInvocationID.x;
- logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[logits_offset + expert] : -INFINITY;
+ for (int i = 0; i < experts_per_thread; i++) {
+ const uint idx = lane + i * WARP_SIZE;
+ const bool is_active = !use_limit || (idx < limit);
+ if (is_active) {
+ max_val = max(max_val, vals[i]);
+ }
}
- float max_val = logits_r[0];
+ max_val = subgroupMax(max_val);
+
+ float sum = 0.f;
[[unroll]]
- for (int i = 1; i < experts_per_thread; i++) {
- const float val = logits_r[i];
- max_val = max(val, max_val);
+ for (int i = 0; i < experts_per_thread; i++) {
+ const uint idx = lane + i * WARP_SIZE;
+ const bool is_active = !use_limit || (idx < limit);
+ if (is_active) {
+ const float val = exp(vals[i] - max_val);
+ vals[i] = val;
+ sum += val;
+ } else {
+ vals[i] = 0.f;
+ }
}
- max_val = subgroupMax(max_val);
+ sum = subgroupAdd(sum);
- float wt[experts_per_thread];
- float tmp = 0.f;
+ const float inv_sum = 1.0f / sum;
[[unroll]]
for (int i = 0; i < experts_per_thread; i++) {
- const float val = logits_r[i];
- wt[i] = exp(val - max_val);
- tmp += wt[i];
+ const uint idx = lane + i * WARP_SIZE;
+ const bool is_active = !use_limit || (idx < limit);
+ if (is_active) {
+ vals[i] *= inv_sum;
+ }
}
+}
- tmp = subgroupAdd(tmp);
+void main() {
+ const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
+ if (row >= n_rows) {
+ return;
+ }
- const float inv_sum = 1.0f / tmp;
+ const uint logits_offset = n_experts * row;
+ const uint weights_offset = n_expert_used * row;
+ const uint ids_offset = n_experts * row;
+
+ float wt[experts_per_thread];
[[unroll]]
- for (int i = 0; i < experts_per_thread; i++) {
- wt[i] = wt[i] * inv_sum;
+ for (uint i = 0; i < n_experts; i += WARP_SIZE) {
+ const uint expert = i + gl_LocalInvocationID.x;
+ wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
+ }
+
+ if (!late_softmax) {
+ softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
}
// at this point, each thread holds a portion of softmax,
@@ -82,6 +104,11 @@ void main() {
float output_weights[experts_per_thread];
+ [[unroll]]
+ for (int i = 0; i < experts_per_thread; i++) {
+ output_weights[i] = 0.f;
+ }
+
for (int k = 0; k < n_expert_used; k++) {
float max_val = wt[0];
uint max_expert = gl_LocalInvocationID.x;
@@ -121,6 +148,7 @@ void main() {
if (with_norm) {
wt_sum = subgroupAdd(wt_sum);
+ wt_sum = clamp(wt_sum, clamp_min, clamp_max);
const float inv_sum = 1.0f / wt_sum;
[[unroll]]
@@ -129,6 +157,10 @@ void main() {
}
}
+ if (late_softmax) {
+ softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
+ }
+
[[unroll]]
for (uint i = 0; i < experts_per_thread; ++i) {
uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;
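The rewritten shader keeps value j of a row in vals[j / WARP_SIZE] of lane j % WARP_SIZE; inactive slots (idx >= limit when use_limit is set) are skipped for the max, zeroed by the exp pass, and left untouched by the normalize pass. A scalar C++ reference of those semantics, useful for testing (a sketch under that layout assumption, not code from this patch):

#include <algorithm>
#include <cmath>
#include <vector>

// one row; limit plays the role of n_experts (early) or n_expert_used (late)
static void softmax_ref(std::vector<float> & v, size_t limit) {
    float mx = -INFINITY;
    for (size_t i = 0; i < limit; i++) mx = std::max(mx, v[i]);
    float sum = 0.0f;
    for (size_t i = 0; i < limit; i++) { v[i] = std::exp(v[i] - mx); sum += v[i]; }
    for (size_t i = 0; i < limit; i++) v[i] /= sum;
    for (size_t i = limit; i < v.size(); i++) v[i] = 0.0f; // inactive slots end up zero
}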

[diff for one file omitted: too large to display]

@@ -1,85 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Thu, 30 Oct 2025 01:27:41 -0500
Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 4 ++++
ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp | 16 ++++++++++++----
2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index db92a7901..e959674d1 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {
struct vk_op_argsort_push_constants {
uint32_t ncols;
+ uint32_t nrows;
int32_t order;
};
@@ -8710,6 +8711,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
break;
case GGML_OP_ARGSORT:
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+ elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
break;
case GGML_OP_IM2COL:
{
@@ -9952,9 +9954,11 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
int32_t * op_params = (int32_t *)dst->op_params;
uint32_t ncols = src0->ne[0];
+ uint32_t nrows = ggml_nrows(src0);
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
ncols,
+ nrows,
op_params[0],
}, dryrun);
}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
index c81b84452..c4e68bc02 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
@@ -14,6 +14,7 @@ layout (binding = 1) buffer D {int data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
+ uint nrows;
uint order;
} p;
@@ -26,10 +27,9 @@ void swap(uint idx0, uint idx1) {
dst_row[idx1] = tmp;
}
-void argsort(bool needs_bounds_check) {
+void argsort(bool needs_bounds_check, const uint row) {
// bitonic sort
const int col = int(gl_LocalInvocationID.x);
- const uint row = gl_WorkGroupID.y;
const uint row_offset = row * p.ncols;
@@ -72,8 +72,16 @@ void argsort(bool needs_bounds_check) {
void main() {
if (p.ncols == BLOCK_SIZE) {
- argsort(false);
+ uint row = gl_WorkGroupID.y;
+ while (row < p.nrows) {
+ argsort(false, row);
+ row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+ }
} else {
- argsort(true);
+ uint row = gl_WorkGroupID.y;
+ while (row < p.nrows) {
+ argsort(true, row);
+ row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
+ }
}
}
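The fix is a standard grid-stride loop over rows: the Y dispatch is clamped to maxComputeWorkGroupCount[1], and each workgroup keeps striding until it runs out of rows, so row counts beyond the device limit still get sorted. A CPU model of the added loop (argsort_row is a hypothetical stand-in for one per-row bitonic sort, and assuming the shader's local_size_y of 1, the stride equals the workgroup count):

// num_groups_y = min(nrows, maxComputeWorkGroupCount[1])
for (uint32_t row = group_y; row < nrows; row += num_groups_y) {
    argsort_row(row); // unchanged per-row bitonic sort
}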

@@ -1,77 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <picard12@live.de>
Date: Fri, 31 Oct 2025 08:14:49 +0100
Subject: [PATCH] vulkan: fix shmem overrun in mmq id shader (#16873)
* vulkan: fix shmem overrun in mmq id shader
* metal : fix mul_mm_id
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
ggml/src/ggml-metal/ggml-metal-device.cpp | 2 +-
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp | 4 ++++
ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl | 2 +-
tests/test-backend-ops.cpp | 3 +++
4 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index 758116342..c78082ac3 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -677,7 +677,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_
char name[256];
snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
- snprintf(name, 256, "%s", base);
+ snprintf(name, 256, "%s_ne02=%d", base, ne02);
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
index 8b238ac4b..d955b4fc7 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
@@ -82,9 +82,13 @@ layout (constant_id = 10) const uint WARP = 32;
#include "mul_mmq_shmem_types.glsl"
+#ifdef MUL_MAT_ID
+#define BK_STEP 1
+#else
#ifndef BK_STEP
#define BK_STEP 4
#endif
+#endif
// Shared memory cache
shared block_a_cache buf_a[BM * BK_STEP];
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
index 72fec4404..1c0f5306f 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
@@ -27,7 +27,7 @@ struct block_a_cache {
#elif defined(DATA_A_Q8_0)
#define QUANT_R_MMQ 1
// AMD likes 4, Intel likes 1 and Nvidia likes 2
-#define BK_STEP 1
+// #define BK_STEP 1
struct block_a_cache {
int32_t qs[32/4];
FLOAT_TYPE dm;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 657b6cc2f..1f8dda383 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6722,6 +6722,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
+ // gpt-oss issue with Vulkan mmq_id
+ test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
+
for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {4, 8}) {

@@ -1,80 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Masato Nakasaka <masato.nakasaka@intel.com>
Date: Fri, 31 Oct 2025 16:18:59 +0900
Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
supported (#16796)
* Experimental crash fix
* added assert for aborting and fixed comment
* changed to check if a pipeline is empty or not
* Moved function in class definition
* replaced with is_empty
* Modified is_empty to check only unaligned pipelines
---
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index e959674d1..903050b0b 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
struct vk_matmul_pipeline_struct {
vk_pipeline l, m, s;
vk_pipeline a_l, a_m, a_s;
+ // Returns true when all unaligned pipelines are null.
+ // We only check the unaligned variants, since at least one unaligned
+ // pipeline must exist, while the aligned pipelines are optional.
+ bool is_empty() const {
+ return l == nullptr && m == nullptr && s == nullptr;
+ }
};
-
typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;
struct vk_matmul_pipeline2 {
@@ -5080,7 +5085,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
if (src1_type == GGML_TYPE_Q8_1) {
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc;
- if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
+ if (pipelines->is_empty()) {
return nullptr;
}
@@ -5229,7 +5234,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
if (src1_type == GGML_TYPE_Q8_1) {
vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc;
- if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) {
+ if (pipelines->is_empty()) {
return nullptr;
}
@@ -5264,16 +5269,17 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
return nullptr;
}
+ vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type];
// XXX TODO 'prec' is not actually allowed in mul_mat_id.
bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/;
- bool support_fp16acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc != nullptr;
- bool support_fp32acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc != nullptr;
+ bool support_fp16acc = !mmp.f16acc->is_empty();
+ bool support_fp32acc = !mmp.f32acc->is_empty();
if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) {
- return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc;
+ return mmp.f16acc;
} else {
GGML_ASSERT(support_fp32acc);
- return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc;
+ return mmp.f32acc;
}
}