Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-25 07:58:01 +00:00)
Update GGML to b6646 (#12245)
Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
@@ -15,18 +15,18 @@ problem.
ggml/src/ggml-backend.cpp | 9 +++++++--
ggml/src/ggml-cann/ggml-cann.cpp | 2 ++
ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
ggml/src/ggml-metal/ggml-metal.m | 1 +
ggml/src/ggml-metal/ggml-metal.cpp | 2 ++
ggml/src/ggml-opencl/ggml-opencl.cpp | 1 +
ggml/src/ggml-rpc/ggml-rpc.cpp | 1 +
ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++
8 files changed, 20 insertions(+), 2 deletions(-)
8 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b9d29e9..97f47abd 100644
index ff9135fe..8ba86f82 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer);
}
@@ -34,7 +34,7 @@ index 1b9d29e9..97f47abd 100644
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -529,6 +528,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)

free(ctx->buffers);
free(ctx);
@@ -42,9 +42,9 @@ index 1b9d29e9..97f47abd 100644
}

static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1890,6 +1890,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size);
+ delete buffer;
+}
@@ -54,7 +54,7 @@ index 1b9d29e9..97f47abd 100644
}

static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1937,7 +1942,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};

static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,10 +64,10 @@ index 1b9d29e9..97f47abd 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cf575b36..ca1addfa 100755
index b51b554e..3ba0f5a6 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -826,6 +826,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
delete ctx;
@@ -75,7 +75,7 @@ index cf575b36..ca1addfa 100755
}

/**
@@ -1572,6 +1573,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,7 +84,7 @@ index cf575b36..ca1addfa 100755

/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d9110491..37ee2a6d 100644
index b7e81b21..fdf8c63d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -111,23 +111,31 @@ index d9110491..37ee2a6d 100644
}

static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cb8eff4a..7bccc7bf 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6032,6 +6032,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index e11555a7..909e17de 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));

free(ctx);
+ free(buffer);
ggml_metal_buffer_free(ctx);
+ delete buffer;
}

static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
@@ -99,6 +100,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));

ggml_metal_buffer_free(ctx);
+ delete buffer;
}

static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 8ba1e00d..8163e8dc 100644
index 0cf3b924..09d706b5 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2745,6 +2745,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3215,6 +3215,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -136,10 +144,10 @@ index 8ba1e00d..8163e8dc 100644

static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index df6ba540..2e395968 100644
index f99681c8..59591770 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -486,6 +486,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -505,6 +505,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
@@ -148,7 +156,7 @@ index df6ba540..2e395968 100644

static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3992dad0..67503951 100644
index 4ac919ea..447ea3c4 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -176,10 +184,10 @@ index 3992dad0..67503951 100644

static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 4070e248..394a2839 100644
index 2608cbd0..061cd078 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10209,6 +10209,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -11603,6 +11603,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -187,7 +195,7 @@ index 4070e248..394a2839 100644
}

static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -10352,6 +10353,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -11746,6 +11747,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);

@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f7e03e70..8ebe11cf 100644
index da938af0..2a38abf4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1804,16 +1804,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1811,16 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@@ -31,8 +31,8 @@ index f7e03e70..8ebe11cf 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1975,7 +1966,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -1987,7 +1978,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 20c21733..f4f69cfc 100644
index 210ecc88..355219a9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@
@@ -33,7 +33,7 @@ index 20c21733..f4f69cfc 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

enum ffn_op_type {
@@ -2597,7 +2610,29 @@ struct clip_model_loader {
@@ -2759,7 +2772,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 20c21733..f4f69cfc 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -2624,7 +2659,11 @@ struct clip_model_loader {
@@ -2786,7 +2821,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
|
||||
7 files changed, 248 insertions(+)
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 18dcc6dd..4b285646 100644
|
||||
index 4e8d54c4..f98a3574 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -78,6 +78,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@@ -26,15 +26,15 @@ index 18dcc6dd..4b285646 100644
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -164,6 +165,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
@@ -177,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -1794,6 +1796,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -1879,6 +1881,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@@ -59,7 +59,7 @@ index 18dcc6dd..4b285646 100644
|
||||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -2219,6 +2239,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -2368,6 +2388,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
@@ -68,10 +68,10 @@ index 18dcc6dd..4b285646 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 7af587e7..3ea994c7 100644
|
||||
index b5c6f3d7..aa8e0e7b 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -82,6 +82,7 @@ enum llm_arch {
|
||||
@@ -85,6 +85,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_GRANITE_HYBRID,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@@ -79,15 +79,15 @@ index 7af587e7..3ea994c7 100644
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
@@ -168,6 +169,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
@@ -181,6 +182,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -394,6 +396,7 @@ enum llm_tensor {
|
||||
@@ -417,6 +419,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
@@ -96,10 +96,10 @@ index 7af587e7..3ea994c7 100644
|
||||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
|
||||
index 7a06368d..35fc054f 100644
|
||||
index c04ac58f..24a515a0 100644
|
||||
--- a/src/llama-hparams.cpp
|
||||
+++ b/src/llama-hparams.cpp
|
||||
@@ -146,6 +146,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
@@ -147,6 +147,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
|
||||
}
|
||||
|
||||
@@ -115,10 +115,10 @@ index 7a06368d..35fc054f 100644
|
||||
if (il < n_layer) {
|
||||
return swa_layers[il];
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index bd231224..29bd9056 100644
|
||||
index 0fe4b569..eb13709f 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -62,6 +62,8 @@ struct llama_hparams {
|
||||
@@ -64,6 +64,8 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
@@ -127,7 +127,7 @@ index bd231224..29bd9056 100644
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -220,6 +222,9 @@ struct llama_hparams {
|
||||
@@ -236,6 +238,9 @@ struct llama_hparams {
|
||||
|
||||
uint32_t n_pos_per_embd() const;
|
||||
|
||||
@@ -135,10 +135,10 @@ index bd231224..29bd9056 100644
|
||||
+ bool n_bskcn(uint32_t n, uint32_t il) const;
|
||||
+
|
||||
bool is_swa(uint32_t il) const;
|
||||
};
|
||||
|
||||
bool has_kv(uint32_t il) const;
|
||||
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
||||
index f71c40f8..7eab9b68 100644
|
||||
index 8182a9ad..daef900c 100644
|
||||
--- a/src/llama-model-loader.cpp
|
||||
+++ b/src/llama-model-loader.cpp
|
||||
@@ -465,6 +465,7 @@ namespace GGUFMeta {
|
||||
@@ -150,10 +150,10 @@ index f71c40f8..7eab9b68 100644
|
||||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 58ca7df7..280129e1 100644
|
||||
index 2470f878..0398b553 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1706,6 +1706,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1845,6 +1845,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -175,7 +175,7 @@ index 58ca7df7..280129e1 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -4793,6 +4808,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -5113,6 +5128,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
@@ -210,7 +210,7 @@ index 58ca7df7..280129e1 100644
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -15495,6 +15538,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
||||
@@ -16273,6 +16316,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
||||
}
|
||||
};
|
||||
|
||||
@@ -229,7 +229,7 @@ index 58ca7df7..280129e1 100644
|
||||
+ struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
+
|
||||
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
+ auto * inp_attn = build_attn_inp_kv_unified();
|
||||
+ auto * inp_attn = build_attn_inp_kv();
|
||||
+
|
||||
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
+
|
||||
@@ -316,7 +316,7 @@ index 58ca7df7..280129e1 100644
|
||||
+
|
||||
+ cur = build_attn(inp_attn,
|
||||
+ model.layers[il].wo, model.layers[il].bo,
|
||||
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
||||
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
+ cb(cur, "attn_out", il);
|
||||
+ }
|
||||
+
|
||||
@@ -376,7 +376,7 @@ index 58ca7df7..280129e1 100644
|
||||
// ref: https://github.com/facebookresearch/chameleon
|
||||
// based on the original build_llama() function, changes:
|
||||
// * qk-norm
|
||||
@@ -18439,6 +18641,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
@@ -19552,6 +19754,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
||||
} break;
|
||||
@@ -387,7 +387,7 @@ index 58ca7df7..280129e1 100644
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
||||
@@ -18652,6 +18858,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -19770,6 +19976,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@@ -396,10 +396,10 @@ index 58ca7df7..280129e1 100644
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_SMOLLM3:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 6fcd74d5..09964533 100644
|
||||
index d73ce969..c086f94e 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -70,6 +70,7 @@ enum llm_type {
|
||||
@@ -76,6 +76,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
@@ -407,7 +407,7 @@ index 6fcd74d5..09964533 100644
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
@@ -367,6 +368,8 @@ struct llama_layer {
|
||||
@@ -380,6 +381,8 @@ struct llama_layer {
|
||||
// openai-moe
|
||||
struct ggml_tensor * attn_sinks = nullptr;
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ regex
|
||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 8ebe11cf..c011008f 100644
|
||||
index 2a38abf4..26fa9fad 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||
index 637891f5..98b8280f 100644
|
||||
index db1f0b23..f4de7e34 100644
|
||||
--- a/common/json-schema-to-grammar.cpp
|
||||
+++ b/common/json-schema-to-grammar.cpp
|
||||
@@ -307,7 +307,7 @@ private:
|
||||
@@ -308,7 +308,7 @@ private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
|
||||
@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
|
||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 6c315137..3040b2aa 100644
|
||||
index 136afec7..f794d9cf 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -162,7 +162,7 @@ struct ggml_backend_reg_entry {
|
||||
@@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_entry> backends;
|
||||
@@ -23,7 +23,7 @@ index 6c315137..3040b2aa 100644
|
||||
|
||||
ggml_backend_registry() {
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -207,7 +207,7 @@ struct ggml_backend_registry {
|
||||
@@ -223,7 +223,7 @@ struct ggml_backend_registry {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ index 6c315137..3040b2aa 100644
|
||||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
@@ -218,15 +218,20 @@ struct ggml_backend_registry {
|
||||
@@ -234,15 +234,20 @@ struct ggml_backend_registry {
|
||||
#endif
|
||||
backends.push_back({ reg, std::move(handle) });
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
@@ -56,7 +56,7 @@ index 6c315137..3040b2aa 100644
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
||||
@@ -270,7 +275,7 @@ struct ggml_backend_registry {
|
||||
@@ -286,7 +291,7 @@ struct ggml_backend_registry {
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
||||
|
||||
@@ -65,7 +65,7 @@ index 6c315137..3040b2aa 100644
|
||||
|
||||
return reg;
|
||||
}
|
||||
@@ -293,7 +298,7 @@ struct ggml_backend_registry {
|
||||
@@ -309,7 +314,7 @@ struct ggml_backend_registry {
|
||||
// remove devices
|
||||
devices.erase(
|
||||
std::remove_if(devices.begin(), devices.end(),
|
||||
@@ -74,7 +74,7 @@ index 6c315137..3040b2aa 100644
|
||||
devices.end());
|
||||
|
||||
// remove backend
|
||||
@@ -351,7 +356,7 @@ size_t ggml_backend_dev_count() {
|
||||
@@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 177fb282..f5a5079a 100644
|
||||
index c8f3d859..ff6229a0 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -304,6 +304,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
@@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
@@ -19,7 +19,7 @@ index 177fb282..f5a5079a 100644
|
||||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
@@ -314,6 +315,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index f5a5079a..5158acd6 100644
|
||||
index ff6229a0..33b3a15f 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -324,10 +324,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
|
||||
@@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
|
||||
// get ith C string from array with given key_id
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
||||
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
|
||||
index 53504399..0f71d5f3 100644
|
||||
index 8cc4ef1c..d950dbdf 100644
|
||||
--- a/ggml/src/gguf.cpp
|
||||
+++ b/ggml/src/gguf.cpp
|
||||
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
|
||||
@@ -53,10 +53,10 @@ index 53504399..0f71d5f3 100644
|
||||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index c011008f..fa388b03 100644
|
||||
index 26fa9fad..64c78a16 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1760,9 +1760,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||
if (precompiled_charsmap_keyidx != -1) {
|
||||
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||
@@ -66,4 +66,4 @@ index c011008f..fa388b03 100644
|
||||
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
|
||||
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index d89cd8f4..a5689c18 100644
|
||||
index dbc07301..f8574d01 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
@@ -20,7 +20,7 @@ index d89cd8f4..a5689c18 100644
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2858,6 +2860,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
|
||||
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index bfbf5fa2..11f93f42 100644
|
||||
index 2186f827..8fb86009 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
|
||||
@@ -196,7 +196,7 @@ index bfbf5fa2..11f93f42 100644
|
||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
||||
|
||||
@@ -4,17 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
|
||||
Subject: [PATCH] add argsort and cuda copy for i32
|
||||
|
||||
---
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++++
|
||||
4 files changed, 192 insertions(+), 2 deletions(-)
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 64 +++++++++++++++++
|
||||
5 files changed, 256 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 854f1c2b..a2924757 100644
|
||||
index 14f7dcf4..f7f8da35 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -8146,6 +8146,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
@@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,7 +61,7 @@ index 854f1c2b..a2924757 100644
|
||||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -8157,6 +8196,10 @@ void ggml_compute_forward_argsort(
|
||||
@@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
@@ -196,12 +197,12 @@ index 607ded85..53b02634 100644
|
||||
+ }
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
index 410c12b7..b8e9e107 100644
|
||||
index e621cb98..597c0c8b 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||
@@ -223,3 +223,9 @@ template<typename src_t, typename dst_t>
|
||||
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
+
|
||||
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
@@ -210,10 +211,10 @@ index 410c12b7..b8e9e107 100644
|
||||
+ *dst = *src;
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
||||
index f9bb0256..9c3774e5 100644
|
||||
index 746f4396..911220e9 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy.cu
|
||||
+++ b/ggml/src/ggml-cuda/cpy.cu
|
||||
@@ -278,6 +278,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
@@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
@@ -261,7 +262,7 @@ index f9bb0256..9c3774e5 100644
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
@@ -369,6 +410,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
@@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
@@ -270,3 +271,80 @@ index f9bb0256..9c3774e5 100644
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 96df6f0c..44dc31c0 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
|
||||
}
|
||||
}
|
||||
|
||||
+typedef void (i32_argsort_t)(
|
||||
+ constant ggml_metal_kargs_argsort & args,
|
||||
+ device const int32_t * x,
|
||||
+ device int32_t * dst,
|
||||
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]]);
|
||||
+
|
||||
+template<ggml_sort_order order>
|
||||
+kernel void kernel_argsort_i32_i32(
|
||||
+ constant ggml_metal_kargs_argsort & args,
|
||||
+ device const int32_t * x,
|
||||
+ device int32_t * dst,
|
||||
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
|
||||
+ // bitonic sort
|
||||
+ int col = tpitg[0];
|
||||
+ int row = tgpig[1];
|
||||
+
|
||||
+ if (col >= args.ncols_pad) return;
|
||||
+
|
||||
+ device const int32_t * x_row = x + row * args.ncols;
|
||||
+ threadgroup int32_t * dst_row = shared_values;
|
||||
+
|
||||
+ // initialize indices
|
||||
+ dst_row[col] = col;
|
||||
+
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
+
|
||||
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
|
||||
+ for (int j = k / 2; j > 0; j /= 2) {
|
||||
+ int ixj = col ^ j;
|
||||
+ if (ixj > col) {
|
||||
+ if ((col & k) == 0) {
|
||||
+ if (dst_row[col] >= args.ncols ||
|
||||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
||||
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
||||
+ ) {
|
||||
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (dst_row[ixj] >= args.ncols ||
|
||||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
||||
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
||||
+ ) {
|
||||
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // copy the result to dst without the padding
|
||||
+ if (col < args.ncols) {
|
||||
+ dst[row * args.ncols + col] = dst_row[col];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
|
||||
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
|
||||
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
|
||||
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
|
||||
|
||||
kernel void kernel_leaky_relu_f32(
|
||||
constant ggml_metal_kargs_leaky_relu & args,
|
||||
|
||||
@@ -6,12 +6,12 @@ Subject: [PATCH] graph memory reporting on failure
|
||||
---
|
||||
ggml/include/ggml-alloc.h | 1 +
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-alloc.c | 36 ++++++++++++++++++++++++++++++++----
|
||||
ggml/src/ggml-alloc.c | 34 +++++++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-backend.cpp | 7 +++++++
|
||||
4 files changed, 41 insertions(+), 4 deletions(-)
|
||||
4 files changed, 40 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
||||
index 2cb150fd2..7ab3f0192 100644
|
||||
index 2cb150fd..7ab3f019 100644
|
||||
--- a/ggml/include/ggml-alloc.h
|
||||
+++ b/ggml/include/ggml-alloc.h
|
||||
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
|
||||
@@ -23,31 +23,31 @@ index 2cb150fd2..7ab3f0192 100644
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index a2977ea2e..e8cf30841 100644
|
||||
index 62b6d65e..fe20dca3 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -303,6 +303,7 @@ extern "C" {
|
||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
||||
@@ -316,6 +316,7 @@ extern "C" {
|
||||
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index 8b6e60283..b58bd671d 100644
|
||||
index fa46f3b4..421ff7c7 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -350,6 +350,7 @@ struct node_alloc {
|
||||
@@ -492,6 +492,7 @@ struct node_alloc {
|
||||
struct ggml_gallocr {
|
||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
||||
struct vbuffer ** buffers; // [n_buffers]
|
||||
+ size_t *buffer_sizes; // [n_buffers]
|
||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||
int n_buffers;
|
||||
|
||||
@@ -373,6 +374,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
||||
@@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
||||
@@ -56,7 +56,7 @@ index 8b6e60283..b58bd671d 100644
|
||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||
|
||||
@@ -439,6 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
@@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
ggml_hash_set_free(&galloc->hash_set);
|
||||
free(galloc->hash_values);
|
||||
free(galloc->bufts);
|
||||
@@ -64,7 +64,7 @@ index 8b6e60283..b58bd671d 100644
|
||||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
@@ -734,6 +739,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,23 +73,21 @@ index 8b6e60283..b58bd671d 100644
|
||||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
// if the buffer type is used multiple times, we reuse the same buffer
|
||||
@@ -755,15 +762,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
|
||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
- if (galloc->buffers[i] == NULL) {
|
||||
+ if (galloc->buffers[i]) {
|
||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
||||
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
+ } else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
- return false;
|
||||
+ galloc->buffer_sizes[i] = new_size;
|
||||
+ success = false;
|
||||
}
|
||||
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
+ } else {
|
||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,8 +96,8 @@ index 8b6e60283..b58bd671d 100644
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||
@@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
@@ -122,10 +120,10 @@ index 8b6e60283..b58bd671d 100644
|
||||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 97f47abd2..d02a40e60 100644
|
||||
index 8ba86f82..cb2b9956 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
}
|
||||
|
||||
@@ -137,5 +135,5 @@ index 97f47abd2..d02a40e60 100644
|
||||
+}
|
||||
+
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
GGML_ASSERT(sched);
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
@@ -6,28 +6,28 @@ Subject: [PATCH] ggml: Export GPU UUIDs
|
||||
This enables matching up devices and information reported by the backend
|
||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-metal/ggml-metal.m | 1 +
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
|
||||
3 files changed, 63 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 8a91b381..9424394e 100644
|
||||
index fe20dca3..48777212 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -152,6 +152,7 @@ extern "C" {
|
||||
struct ggml_backend_dev_props {
|
||||
const char * name;
|
||||
@@ -158,6 +158,7 @@ extern "C" {
|
||||
const char * description;
|
||||
+ const char * id;
|
||||
// device free memory in bytes
|
||||
size_t memory_free;
|
||||
+ const char * id;
|
||||
// device total memory in bytes
|
||||
size_t memory_total;
|
||||
enum ggml_backend_dev_type type;
|
||||
// device type
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 37ee2a6d..57eae461 100644
|
||||
index fdf8c63d..ad389ece 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -179,6 +179,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
@@ -77,9 +77,9 @@ index 37ee2a6d..57eae461 100644
|
||||
+}
|
||||
+
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
@@ -267,22 +312,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
@@ -107,18 +107,18 @@ index 37ee2a6d..57eae461 100644
|
||||
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
@@ -3144,6 +3191,7 @@ struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string device_name(prop.name);
|
||||
if (device_name == "NVIDIA GeForce MX450") {
|
||||
turing_devices_without_mma.push_back({ id, device_name });
|
||||
@@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
+ std::string id;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -3156,6 +3204,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
@@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
@@ -130,31 +130,31 @@ index 37ee2a6d..57eae461 100644
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -3170,6 +3223,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
@@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
+ props->id = ggml_backend_cuda_device_get_id(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -3767,6 +3821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 7bccc7bf..fe7b2f0a 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -6522,6 +6522,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
index 909e17de..08ab4fc9 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
+ props->id = "0";
|
||||
props->type = ggml_backend_metal_device_get_type(dev);
|
||||
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = (struct ggml_backend_dev_caps) {
|
||||
|
||||
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
|
||||
2 files changed, 13 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
||||
index a05373d5..6f70f7f4 100644
|
||||
index cd022c5e..3d680945 100644
|
||||
--- a/tools/mtmd/mtmd.cpp
|
||||
+++ b/tools/mtmd/mtmd.cpp
|
||||
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index a5689c18..85af19a3 100644
|
||||
index f8574d01..530efce0 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -2412,7 +2412,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
@@ -2431,7 +2431,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
||||
// all our threads onto the first 4 cores which results in terrible performance with
|
||||
// n_threads > 4
|
||||
|
||||
@@ -5,23 +5,24 @@ Subject: [PATCH] BF16 macos version guard
|
||||
|
||||
Only enable BF16 on supported MacOS versions (v14+)
|
||||
---
|
||||
ggml/src/ggml-metal/ggml-metal.m | 6 +++++-
|
||||
1 file changed, 5 insertions(+), 1 deletion(-)
|
||||
ggml/src/ggml-metal/ggml-metal-context.m | 7 ++++++-
|
||||
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index fe7b2f0a..e4c31268 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -106,7 +106,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
||||
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
|
||||
index 052efb7a..b47dc787 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal-context.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
|
||||
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||
|
||||
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
- res->use_bfloat = props_dev->has_bfloat;
|
||||
+ if (@available(macOS 14.0, *)) {
|
||||
+ res->use_bfloat = props_dev->has_bfloat;
|
||||
+ } else {
|
||||
+ res->use_bfloat = false;
|
||||
+ }
|
||||
+
|
||||
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
|
||||
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
|
||||
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
- ctx->use_bfloat = ctx->has_bfloat;
|
||||
+ if (@available(macOS 14.0, *)) {
|
||||
+ ctx->use_bfloat = ctx->has_bfloat;
|
||||
+ } else {
|
||||
+ ctx->use_bfloat = false;
|
||||
+ }
|
||||
#else
|
||||
ctx->use_bfloat = false;
|
||||
#endif
|
||||
|
||||
@@ -13,10 +13,10 @@ checks.
|
||||
1 file changed, 18 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 57eae461..c7f9dc3a 100644
|
||||
index ad389ece..e51c5035 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
@@ -2686,14 +2686,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
||||
|
||||
@@ -36,12 +36,14 @@ index 57eae461..c7f9dc3a 100644
|
||||
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
|
||||
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
|
||||
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
|
||||
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
|
||||
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
|
||||
|
||||
+
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2700,6 +2712,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
@@ -2717,6 +2729,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
|
||||
if (node->op == GGML_OP_ADD &&
|
||||
node->src[1] && node->src[1]->ne[1] > 1 &&
|
||||
|
||||
@@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+)

diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index aeac2e57..40738d5b 100644
index 5b888cdd..2a9ff7f6 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
@@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};

ggml_backend_reg_t ggml_backend_blas_reg(void) {

@@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2773cc310..ae94887dd 100644
index 48777212..d4352663 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -291,6 +291,7 @@ extern "C" {
@@ -303,6 +303,7 @@ extern "C" {

// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
@@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644

// Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d65..369e9e25a 100644
index 07784d6f..869dc07d 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
@@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644
};

GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -114,6 +120,16 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
@@ -117,6 +123,16 @@ extern "C" {

// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // (optional) reserves intermediate buffers needed for the compution
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
@@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644

struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644
index cb2b9956..6ef5eeaf 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644
+ return buf;
+ }
+
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size);
}

@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
@@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644
};

return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}

@@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644
void * base = buffer->iface.get_base(buffer);

GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched {
@@ -723,6 +743,12 @@ struct ggml_backend_sched {
bool op_offload;

int debug;
@@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644
};

#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644

ggml_backend_sched_reset(sched);

@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}

@@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched);

return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

@@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644

void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644
index c4246b65..448badf0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
@@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

@@ -771,6 +796,9 @@ struct ggml_cuda_pool {
@@ -880,6 +905,9 @@ struct ggml_cuda_pool {

virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
@@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644
};

template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
@@ -1023,11 +1051,11 @@ struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

@@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644
}
return *pools[device];
}
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
@@ -1035,4 +1063,20 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
@@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644
index e51c5035..d324bc68 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644
};

ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -3008,6 +3070,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,

static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3023,6 +3086,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

@@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644
+
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

@@ -3149,6 +3217,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644

ggml_cuda_set_device(cuda_ctx->device);

@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3228,6 +3297,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}

@@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
@@ -3268,6 +3402,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset,

@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
index d8a8b5e6..09247cef 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;


@@ -15,10 +15,10 @@ unused then it can be reset to free these data structures.
5 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644
index d4352663..0a2dae26 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" {
@@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
@@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644
index 869dc07d..4889df79 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" {
@@ -195,6 +195,10 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
@@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644

struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644
index 6ef5eeaf..0b757af5 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params);
}

@@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644
+}
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644
index d324bc68..531d6e27 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
return id;
}

@@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
@@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644

bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}

@@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644
};

// backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);

@@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644
index 37386afc..06f9e7c1 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@
@@ -41,6 +41,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t

@@ -6,25 +6,25 @@ Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available.
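For orientation only, a minimal sketch in C of how a caller could read the extra discovery fields once this patch is applied; the enumeration loop and the particular fields printed are illustrative assumptions, not part of the patch:

#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    ggml_backend_load_all();                               // load any dynamically built backends
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        // memory_free/memory_total are intentionally left at 0 by the patched
        // CUDA backend; query them explicitly when the numbers are needed
        size_t mem_free = 0, mem_total = 0;
        ggml_backend_dev_memory(dev, &mem_free, &mem_total);
        printf("%s [%s] driver %d.%d pci %04x:%02x:%02x integrated=%d free=%zu total=%zu\n",
               props.name, props.library, props.driver_major, props.driver_minor,
               props.pci_domain_id, props.pci_bus_id, props.pci_device_id,
               props.integrated, mem_free, mem_total);
    }
    return 0;
}

The memory query stays a separate call because, as the ggml-cuda hunk further below notes, get_props no longer touches device memory.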
---
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 +
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++
8 files changed, 717 insertions(+), 1 deletion(-)
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++
ggml/src/ggml-cuda/vendors/hip.h | 4 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.cpp | 3 +-
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 +++++++++++
8 files changed, 718 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644
index 0a2dae26..a6bf3378 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
@@ -169,6 +169,15 @@ extern "C" {
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
+ int driver_major;
+ int driver_minor;
@@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644

GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644
index 33b3a15f..86191ef2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
@@ -206,6 +206,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
@@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644

target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644
index 531d6e27..3fa3a057 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;

@@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
+
#endif // defined(GGML_USE_HIP)
}

@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
@@ -3481,6 +3496,14 @@ struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
std::string id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+ int pciBusID;
+ int pciDeviceID;
+ int pciDomainID;
};

static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -3501,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();
@@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644
CUDA_CHECK(cudaMemGetInfo(free, total));
}

@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
@@ -3509,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}

+#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;

@@ -3522,6 +3568,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;

+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100;
@@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME;
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4084,6 +4146,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644

for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
@@ -4099,6 +4163,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;

+ dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID;
+ dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644
index 06f9e7c1..eb8f66cb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@
@@ -5,6 +5,9 @@
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+

#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -43,6 +46,7 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
@@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644
index 86a1ebf6..9fc9fbfc 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true;
@@ -635,6 +635,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
}

+// Management libraries for fetching more accurate free VRAM data
@@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644
#ifdef __cplusplus
}
#endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c31268f..ec6b385ba 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 08ab4fc9..17999a61 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev);
}

+#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0";
@@ -542,7 +543,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
props->type = ggml_backend_metal_device_get_type(dev);

ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
+ props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) {
/* .async = */ false,
props->caps = {
/* .async = */ true,
/* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
index 00000000..8ef19b8c
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@
@@ -697,7 +703,7 @@ index 000000000..8ef19b8cf
\ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644
index 000000000..aa05e9dc1
index 00000000..aa05e9dc
--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@

@@ -1,57 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes

The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.

Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it happens every time mistral-small is loaded as
we reserve using a worst case batch.

This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".

Fixes #10388
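To make the overflow concrete before the hunk below, a tiny standalone sketch in C (the tensor size is a made-up example; any element count above INT_MAX triggers the same wrap-around):

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical oversized activation tensor: 2560 * 1024 * 1024 floats
    int64_t nelements = 2560LL * 1024 * 1024;   // 2,684,354,560 elements
    int k = (int) nelements;                    // the old kernel parameter type
    printf("nelements = %" PRId64 "\n", nelements);
    printf("as 32-bit int = %d (INT_MAX = %d)\n", k, INT_MAX);
    // on typical two's-complement targets k comes out negative, so both the old
    // bounds check (i >= k) and the block-count arithmetic misbehave
    return 0;
}

The patched kernel avoids this by taking the element count as int64_t and using a grid-stride loop capped at MAX_GRIDDIM_X blocks, as the scale.cu hunk below shows.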
---
ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
#include "scale.cuh"

-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF

- if (i >= k) {
- return;
- }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+ int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+ int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;

- dst[i] = scale * x[i] + bias;
+ for (int64_t i = tid; i < nelements; i += stride) {
+ dst[i] = scale * x[i] + bias;
+ }
}

-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+ const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {