ggml update to b7108 (#12992)

* Revert "vulkan: temporary cary of vulkan fixes (#12971)"

This reverts commit 3a9e8e9fd4.

* ggml update to b7087

* fix argsort on metal

* update to b7108

* fix bakllava regression

This model lacks the metadata for the projector type.
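For illustration, a minimal sketch of this kind of fallback, using the gguf.h lookup API from ggml but a hypothetical helper name; the actual clip loader logic differs:

    #include "gguf.h"
    #include <string>

    // Hypothetical sketch: older BakLLaVA-era mmproj files predate the
    // clip.projector_type key, so a loader has to assume the original
    // LLaVA-style MLP projector when the key is absent.
    static std::string projector_type_or_default(const gguf_context * ctx) {
        const int64_t kid = gguf_find_key(ctx, "clip.projector_type");
        if (kid < 0) {
            return "mlp"; // key missing: fall back to the default projector
        }
        return gguf_get_val_str(ctx, kid); // key present: honor the metadata
    }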

* update to b7209

* fix TopK perf

* only build arm code on arm
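A common pattern for this kind of gating, shown as an illustrative C++ sketch (not the commit's actual build change), is to guard NEON paths behind the AArch64 predefined macros with a portable fallback elsewhere:

    #if defined(__aarch64__) || defined(_M_ARM64)
    #include <arm_neon.h>
    // ARM-only path: horizontal add of four floats with NEON
    static float sum4(const float * x) {
        return vaddvq_f32(vld1q_f32(x));
    }
    #else
    // Portable scalar fallback compiled on non-ARM targets
    static float sum4(const float * x) {
        return x[0] + x[1] + x[2] + x[3];
    }
    #endif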
Author: Daniel Hiltgen
Date: 2025-12-03 19:43:29 -08:00
Committed by: GitHub
Parent: 854d40edc5
Commit: 0cf7794b16
303 changed files with 32711 additions and 23435 deletions


@@ -5,20 +5,36 @@ Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture
---
-src/llama-arch.cpp | 21 ++++
+src/CMakeLists.txt | 1 +
+src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
-src/llama-hparams.h | 5 +
+src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 2 +-
-src/llama-model.cpp | 207 +++++++++++++++++++++++++++++++++++++
+src/llama-model.cpp | 48 +++++++++++
src/llama-model.h | 3 +
-7 files changed, 248 insertions(+), 1 deletion(-)
+src/models/models.h | 5 ++
+src/models/solar.cpp | 158 +++++++++++++++++++++++++++++++++++++
+10 files changed, 253 insertions(+), 1 deletion(-)
+create mode 100644 src/models/solar.cpp
+diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+index 67c7807e0..fda881640 100644
+--- a/src/CMakeLists.txt
++++ b/src/CMakeLists.txt
+@@ -125,6 +125,7 @@ add_library(llama
+ models/seed-oss.cpp
+ models/smallthinker.cpp
+ models/smollm3.cpp
++ models/solar.cpp
+ models/stablelm.cpp
+ models/starcoder.cpp
+ models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 8ca769c5f..ab262ec0c 100644
+index 8571a2e02..b6bde25d5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
-@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,7 +42,7 @@ index 8ca769c5f..ab262ec0c 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
-@@ -183,6 +184,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -34,7 +50,7 @@ index 8ca769c5f..ab262ec0c 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-@@ -1901,6 +1903,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -2023,6 +2025,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +75,7 @@ index 8ca769c5f..ab262ec0c 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
-@@ -2469,6 +2489,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -2681,6 +2701,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +84,10 @@ index 8ca769c5f..ab262ec0c 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index dea725c1a..ea2b4ffb9 100644
+index 150646478..3936a4687 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
-@@ -86,6 +86,7 @@ enum llm_arch {
+@@ -89,6 +89,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
@@ -79,7 +95,7 @@ index dea725c1a..ea2b4ffb9 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
-@@ -187,6 +188,7 @@ enum llm_kv {
+@@ -208,6 +209,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -87,7 +103,7 @@ index dea725c1a..ea2b4ffb9 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-@@ -436,6 +438,7 @@ enum llm_tensor {
+@@ -459,6 +461,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -96,11 +112,11 @@ index dea725c1a..ea2b4ffb9 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index db65d69ea..b6bf6bbf2 100644
+index 8cdbaf69f..41127bf91 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
-@@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
- return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
+@@ -161,6 +161,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
+ return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
@@ -115,7 +131,7 @@ index db65d69ea..b6bf6bbf2 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 6fcf91b7d..24569a258 100644
+index c3a53be79..2ffe7dd30 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -64,6 +64,8 @@ struct llama_hparams {
@@ -127,7 +143,7 @@ index 6fcf91b7d..24569a258 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -250,6 +252,9 @@ struct llama_hparams {
+@@ -256,6 +258,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
@@ -151,10 +167,10 @@ index aa3a65f87..ee303bd58 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 2a83d6627..54621ea39 100644
+index c2a545531..4468de2f9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -1890,6 +1890,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1961,6 +1961,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -176,7 +192,7 @@ index 2a83d6627..54621ea39 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -5224,6 +5239,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -5350,6 +5365,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -211,12 +227,71 @@ index 2a83d6627..54621ea39 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -16515,6 +16558,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
-}
+@@ -7425,6 +7468,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+ {
+ llm = std::make_unique<llm_build_chameleon>(*this, params);
+ } break;
++ case LLM_ARCH_SOLAR:
++ {
++ llm = std::make_unique<llm_build_solar>(*this, params);
++ } break;
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
+@@ -7684,6 +7731,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+ case LLM_ARCH_GRANITE_MOE:
+ case LLM_ARCH_GRANITE_HYBRID:
+ case LLM_ARCH_CHAMELEON:
++ case LLM_ARCH_SOLAR:
+ case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_NEO_BERT:
+ case LLM_ARCH_SMOLLM3:
+diff --git a/src/llama-model.h b/src/llama-model.h
+index f8342cf2c..cbf4e1bfa 100644
+--- a/src/llama-model.h
++++ b/src/llama-model.h
+@@ -76,6 +76,7 @@ enum llm_type {
+ LLM_TYPE_15B,
+ LLM_TYPE_16B,
+ LLM_TYPE_20B,
++ LLM_TYPE_22B,
+ LLM_TYPE_26B,
+ LLM_TYPE_27B,
+ LLM_TYPE_30B,
+@@ -404,6 +405,8 @@ struct llama_layer {
+ struct ggml_tensor * ffn_act_beta = nullptr;
+ struct ggml_tensor * ffn_act_eps = nullptr;
++ struct ggml_tensor * bskcn_tv = nullptr;
++
+ struct llama_layer_posnet posnet;
+ struct llama_layer_convnext convnext;
+diff --git a/src/models/models.h b/src/models/models.h
+index 7ba225b47..71fea796d 100644
+--- a/src/models/models.h
++++ b/src/models/models.h
+@@ -510,6 +510,11 @@ struct llm_build_smollm3 : public llm_graph_context {
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
+ };
++struct llm_build_solar : public llm_graph_context {
++ llm_build_solar(const llama_model & model, const llm_graph_params & params);
++};
++
++
+ struct llm_build_stablelm : public llm_graph_context {
+ llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
+ };
+diff --git a/src/models/solar.cpp b/src/models/solar.cpp
+new file mode 100644
+index 000000000..97383928c
+--- /dev/null
++++ b/src/models/solar.cpp
+@@ -0,0 +1,158 @@
++#include "models.h"
++
++llm_build_solar::llm_build_solar(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
++ const int64_t n_embd_head = hparams.n_embd_head_v;
++ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
++ GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -285,7 +360,7 @@ index 2a83d6627..54621ea39 100644
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-+ cb(Kcur, "Kcur", il);
++ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
@@ -371,49 +446,4 @@ index 2a83d6627..54621ea39 100644
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+};
+
-// ref: https://github.com/facebookresearch/chameleon
-// based on the original build_llama() function, changes:
-// * qk-norm
-@@ -20096,6 +20298,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
- {
- llm = std::make_unique<llm_build_chameleon>(*this, params);
- } break;
-+ case LLM_ARCH_SOLAR:
-+ {
-+ llm = std::make_unique<llm_build_solar>(*this, params);
-+ } break;
- case LLM_ARCH_WAVTOKENIZER_DEC:
- {
- llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-@@ -20331,6 +20537,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
- case LLM_ARCH_GRANITE_MOE:
- case LLM_ARCH_GRANITE_HYBRID:
- case LLM_ARCH_CHAMELEON:
-+ case LLM_ARCH_SOLAR:
- case LLM_ARCH_BAILINGMOE:
- case LLM_ARCH_NEO_BERT:
- case LLM_ARCH_SMOLLM3:
-diff --git a/src/llama-model.h b/src/llama-model.h
-index 248f85410..4a7924aaa 100644
---- a/src/llama-model.h
-+++ b/src/llama-model.h
-@@ -76,6 +76,7 @@ enum llm_type {
- LLM_TYPE_15B,
- LLM_TYPE_16B,
- LLM_TYPE_20B,
-+ LLM_TYPE_22B,
- LLM_TYPE_27B,
- LLM_TYPE_30B,
- LLM_TYPE_32B,
-@@ -390,6 +391,8 @@ struct llama_layer {
- struct ggml_tensor * ffn_act_beta = nullptr;
- struct ggml_tensor * ffn_act_eps = nullptr;
-+ struct ggml_tensor * bskcn_tv = nullptr;
-+
- struct llama_layer_posnet posnet;
- struct llama_layer_convnext convnext;
++}
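For context on what the patch above implements: Solar Pro's "block skip connection" (bskcn) saves the hidden state at configured layers and blends it back in at a later layer, with the per-layer bskcn_tv tensor supplying the two mixing weights. An illustrative scalar sketch of that blend (assumed semantics, not the patch's actual ggml graph code):

    #include <cstddef>
    #include <vector>

    // Blend a saved earlier hidden state into the current one:
    // out[i] = t0 * saved[i] + t1 * cur[i], where (t0, t1) come from bskcn_tv.
    static std::vector<float> bskcn_blend(const std::vector<float> & saved,
                                          const std::vector<float> & cur,
                                          float t0, float t1) {
        std::vector<float> out(cur.size());
        for (size_t i = 0; i < cur.size(); ++i) {
            out[i] = t0 * saved[i] + t1 * cur[i];
        }
        return out;
    }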