Merge remote-tracking branch 'upstream/main'

2025-12-22 23:03:55 +00:00 · 2024-10-21 15:22:06 +08:00
parent eec4cd6b52 48708ca0d5
commit 80ed8f850e
331 changed files with 19407 additions and 11604 deletions
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -22,7 +22,8 @@

 #include "common.h"
 #include "llama.h"
-#include "grammar-parser.h"
+#include "log.h"
+#include "sampling.h"
 #include "utils.hpp"

 #include "../llava/clip.h"
@@ -137,7 +138,6 @@ struct server_slot {

    json prompt;
    std::string generated_text;
-    llama_token sampled;
    std::vector<llama_token> cache_tokens;
    std::vector<completion_token_output> generated_token_probs;

@@ -151,8 +151,9 @@ struct server_slot {
    std::string stopping_word;

    // sampling
-    struct llama_sampling_params sparams;
-    llama_sampling_context *ctx_sampling = nullptr;
+    struct gpt_sampler_params sparams;
+    struct gpt_sampler * smpl = nullptr;
+    llama_token sampled;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -416,7 +417,7 @@ struct llama_server_context
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG_WRN("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -551,7 +552,7 @@ struct llama_server_context

    bool launch_slot_with_data(server_slot* &slot, json data) {
        slot_params default_params;
-        llama_sampling_params default_sparams;
+        gpt_sampler_params default_sparams;

        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -560,7 +561,7 @@ struct llama_server_context
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
+        slot->sparams.typ_p             = json_value(data, "typical_p",         json_value(data, "typ_p", default_sparams.typ_p)); // request key stays "typical_p" (as read before this change and as reported back); "typ_p" is accepted as an alias
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -605,47 +606,11 @@ struct llama_server_context
            slot->prompt = "";
        }

-        slot->sparams.penalty_prompt_tokens.clear();
-        slot->sparams.use_penalty_prompt_tokens = false;
-        const auto &penalty_prompt = data.find("penalty_prompt");
-        if (penalty_prompt != data.end())
-        {
-            if (penalty_prompt->is_string())
-            {
-                const auto penalty_prompt_string = penalty_prompt->get<std::string>();
-                auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
-                slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
-                if (slot->params.n_predict > 0)
-                {
-                    slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
-                }
-                slot->sparams.use_penalty_prompt_tokens = true;
-            }
-            else if (penalty_prompt->is_array())
-            {
-                const auto n_tokens = penalty_prompt->size();
-                slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
-                const int n_vocab = llama_n_vocab(model);
-                for (const auto &penalty_token : *penalty_prompt)
-                {
-                    if (penalty_token.is_number_integer())
-                    {
-                        const auto tok = penalty_token.get<llama_token>();
-                        if (tok >= 0 && tok < n_vocab)
-                        {
-                            slot->sparams.penalty_prompt_tokens.push_back(tok);
-                        }
-                    }
-                }
-                slot->sparams.use_penalty_prompt_tokens = true;
-            }
-        }
-
        slot->sparams.logit_bias.clear();

        if (json_value(data, "ignore_eos", false))
        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+            slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
        }

        const auto &logit_bias = data.find("logit_bias");
@@ -675,7 +640,7 @@ struct llama_server_context
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                            slot->sparams.logit_bias[tok] = bias;
+                            slot->sparams.logit_bias.push_back({tok, bias});
                        }
                    }
                    else if (el[0].is_string())
@@ -683,7 +648,7 @@ struct llama_server_context
                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
-                            slot->sparams.logit_bias[tok] = bias;
+                            slot->sparams.logit_bias.push_back({tok, bias});
                        }
                    }
                }
@@ -704,22 +669,22 @@ struct llama_server_context
            }
        }

-        const auto &samplers_sequence = data.find("samplers");
-        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        const auto &samplers = data.find("samplers");
+        if (samplers != data.end() && samplers->is_array())
        {
            std::vector<std::string> sampler_names;
-            for (const auto &sampler_name : *samplers_sequence)
+            for (const auto &name : *samplers)
            {
-                if (sampler_name.is_string())
+                if (name.is_string())
                {
-                    sampler_names.emplace_back(sampler_name);
+                    sampler_names.emplace_back(name);
                }
            }
-            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
+            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
        }
        else
        {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+            slot->sparams.samplers = default_sparams.samplers;
        }

        if (multimodal)
@@ -777,12 +742,12 @@ struct llama_server_context
                                    }
                                }
                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                                    LOG_WRN("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
+                                LOG_WRN("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
@@ -795,11 +760,11 @@ struct llama_server_context
            }
        }

-        if (slot->ctx_sampling != nullptr)
+        if (slot->smpl != nullptr)
        {
-            llama_sampling_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->smpl);
        }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        slot->smpl = gpt_sampler_init(model, slot->sparams);
        slot->command = LOAD_PROMPT;

        all_slots_are_idle = false;
@@ -847,7 +812,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_WRN("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -859,7 +824,7 @@ struct llama_server_context
            }
        }

-        LOG_TEE("system prompt updated\n");
+        LOG_INF("system prompt updated\n");
        system_need_update = false;
    }

@@ -918,12 +883,6 @@ struct llama_server_context

        slot.has_next_token = true;

-        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
-        {
-            // we can change penalty_prompt_tokens because it is always created from scratch each request
-            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
-        }
-
        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
@@ -1040,7 +999,7 @@ struct llama_server_context
            }

            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
+                LOG_WRN("Error processing the given image");
                return false;
            }

@@ -1053,7 +1012,7 @@ struct llama_server_context

    void send_error(task_server& task, const std::string &error)
    {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+        LOG_WRN("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1065,13 +1024,10 @@ struct llama_server_context

    json get_formated_generation(server_slot &slot)
    {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
-        std::vector<std::string> samplers_sequence;
-        for (const auto &sampler_type : slot.sparams.samplers_sequence)
-        {
-            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers) {
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
        }

        return json {
@@ -1086,13 +1042,11 @@ struct llama_server_context
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typical_p},
+            {"typical_p",         slot.sparams.typ_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
            {"presence_penalty",  slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
-            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
@@ -1100,13 +1054,13 @@ struct llama_server_context
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
-            {"ignore_eos",        ignore_eos},
+            {"ignore_eos",        slot.sparams.ignore_eos},
            {"stream",            slot.params.stream},
-            {"logit_bias",        slot.sparams.logit_bias},
+            //{"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers_sequence}
+            {"samplers",          samplers}
        };
    }

@@ -1302,7 +1256,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view))
                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                    LOG_WRN("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1330,7 +1284,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_img))
                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
+                    LOG_WRN("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1735,7 +1689,7 @@ struct llama_server_context

                    if (!slot.params.cache_prompt)
                    {
-                        llama_sampling_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.smpl);

                        slot.n_past    = 0;
                        slot.n_past_se = 0;
@@ -1747,7 +1701,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                            gpt_sampler_accept(slot.smpl, token, false);
                        }

                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1890,10 +1844,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;

-                        LOG_TEE("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG_DBG("\n");
+                        LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1903,7 +1857,7 @@ struct llama_server_context

                        slot.ga_i += slot.ga_w / slot.ga_n;

-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1928,11 +1882,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_WRN("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -1957,9 +1911,9 @@ struct llama_server_context
                }

                completion_token_output result;
-                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+                const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);

-                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+                gpt_sampler_accept(slot.smpl, id, true);

                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -1969,20 +1923,15 @@ struct llama_server_context
                    metrics.on_prompt_eval(slot);
                }

-                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;
+                const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);

-                const int32_t n_probs = slot.sparams.n_probs;
-                if (slot.sparams.temp <= 0 && n_probs > 0)
-                {
-                    // for llama_sample_token_greedy we need to sort candidates
-                    llama_sample_softmax(ctx, &cur_p);
-                }
-
-                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
-                {
-                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
-                }
+                // report at most n_probs candidates, clamped to cur_p->size so data[] is never read out of bounds
+                const size_t n_want  = slot.sparams.n_probs > 0 ? (size_t) slot.sparams.n_probs : 0;
+                const size_t n_probs = n_want < cur_p->size ? n_want : cur_p->size;
+                for (size_t i = 0; i < n_probs; ++i) {
+                    result.probs.push_back({cur_p->data[i].id, cur_p->data[i].p});
+                }

                if (!process_token(result, slot))
                {
@@ -2552,8 +2501,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
        }
        else if (arg == "--log-disable")
        {
-            log_set_target(stdout);
-            LOG_DEBUG("logging to file is disabled.", {});
+            LOG_WARNING("DEPRECATED: --log-disable does nothing anymore", {});
        }
        else if (arg == "--slots-endpoint-disable")
        {
@@ -2763,7 +2711,7 @@ int main(int argc, char **argv) {
 #endif

 #if SERVER_VERBOSE != 1
-    log_disable();
+    gpt_log_set_verbosity_thold(-1);
 #endif
    // own arguments required by this example
    gpt_params params;
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@@ -28,6 +28,9 @@
 #include <mutex>
 #include <condition_variable>
 #include <unordered_map>
+#include <random>
+#include <iostream>
+#include <thread>

 #include "json.hpp"

--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -51,8 +51,8 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }

-func (llm *ggla) Tensors() Tensors {
-	return Tensors{
+func (llm *ggla) Tensors() *Tensors {
+	return &Tensors{
 		Items:  llm.tensors,
 		Offset: llm.tensorOffset,
 	}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -5,7 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"slices"
 	"strings"
+	"sync"

 	"github.com/ollama/ollama/util/bufioutil"
 )
@@ -17,7 +19,7 @@ type GGML struct {

 type model interface {
 	KV() KV
-	Tensors() Tensors
+	Tensors() *Tensors
 }

 type KV map[string]any
@@ -123,25 +125,34 @@ func (kv KV) ChatTemplate() string {
 type Tensors struct {
 	Items  []*Tensor
 	Offset uint64
+
+	layers     map[string]Layer
+	layersOnce sync.Once
 }

-func (ts Tensors) Layers() map[string]Layer {
-	layers := make(map[string]Layer)
-	for _, t := range ts.Items {
-		parts := strings.Split(t.Name, ".")
-		if parts[0] == "blk" {
-			// join first and second part, e.g. blk.%d
-			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
+func (ts *Tensors) Layers() map[string]Layer {
+	ts.layersOnce.Do(func() {
+		ts.layers = make(map[string]Layer)
+		for _, t := range ts.Items {
+			parts := strings.Split(t.Name, ".")
+			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
+				if len(parts) > index+2 {
+					// blk and mm should have a number after them, join it
+					parts = append(
+						[]string{strings.Join(parts[:index+2], ".")},
+						parts[index+2:]...)
+				}
+			}
+
+			if _, ok := ts.layers[parts[0]]; !ok {
+				ts.layers[parts[0]] = make(Layer)
+			}
+
+			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
 		}
+	})

-		if _, ok := layers[parts[0]]; !ok {
-			layers[parts[0]] = make(Layer)
-		}
-
-		layers[parts[0]][strings.Join(parts[1:], ".")] = t
-	}
-
-	return layers
+	return ts.layers
 }

 type Layer map[string]*Tensor
@@ -244,6 +255,8 @@ func (t Tensor) typeSize() uint64 {
 		return 8
 	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
+	case 30: // BF16
+		return 2
 	default:
 		return 0
 	}
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -110,8 +110,8 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }

-func (llm *gguf) Tensors() Tensors {
-	return Tensors{
+func (llm *gguf) Tensors() *Tensors {
+	return &Tensors{
 		Items:  llm.tensors,
 		Offset: llm.tensorOffset,
 	}
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -3,17 +3,18 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"os"
 	"strconv"
 	"strings"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@@ -63,11 +64,13 @@ type MemoryEstimate struct {
 	memoryLayerOutput   uint64
 	graphFullOffload    uint64
 	graphPartialOffload uint64
+
+	projectorWeights, projectorGraph uint64
 }

 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64

@@ -78,7 +81,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var graphOffload uint64

 	// Projectors loaded into GPU0 only
-	var projectorSize uint64
+	var projectorWeights uint64
+	var projectorGraph uint64

 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -103,7 +107,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

 	for _, projector := range projectors {
-		projectorSize += projectorMemoryRequirements(projector)
+		weight, graph := projectorMemoryRequirements(projector)
+		projectorWeights += weight
+		projectorGraph += graph

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -149,7 +155,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}

 	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorSize
+	gpuZeroOverhead := projectorWeights + projectorGraph

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -157,7 +163,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	gpuAllocations := make([]uint64, len(gpus))
 	type gs struct {
 		i int
-		g *gpu.GpuInfo
+		g *discover.GpuInfo
 	}
 	gpusWithSpace := []gs{}
 	for i := range gpus {
@@ -303,6 +309,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
+		projectorWeights:    projectorWeights,
+		projectorGraph:      projectorGraph,
 	}

 	if gpus[0].Library == "cpu" {
@@ -323,7 +331,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts

 func (m MemoryEstimate) log() {
 	overhead := envconfig.GpuOverhead()
-	slog.Info(
+
+	log := slog.With()
+	if m.projectorWeights > 0 {
+		log = log.With(
+			slog.Group(
+				"projector",
+				"weights", format.HumanBytes2(m.projectorWeights),
+				"graph", format.HumanBytes2(m.projectorGraph),
+			),
+		)
+	}
+
+	log.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
@@ -371,3 +391,74 @@ func (m MemoryEstimate) log() {
 		),
 	)
 }
+
+// projectorMemoryRequirements estimates the memory needed by the projector
+// model stored in filename.
+//
+// weights is the summed size of every tensor layer in the file. graphSize is
+// only computed for the "mllama" architecture, from its "mllama.vision.*"
+// metadata keys, and stays 0 for every other architecture.
+//
+// Estimation is best effort: if the file cannot be opened or decoded, both
+// results are 0.
+func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0, 0
+	}
+	defer file.Close()
+
+	ggml, _, err := DecodeGGML(file, 0)
+	if err != nil {
+		return 0, 0
+	}
+
+	// Build the layer map once and reuse it for both the weight total and the
+	// class_embd lookup below.
+	layers := ggml.Tensors().Layers()
+	for _, layer := range layers {
+		weights += layer.size()
+	}
+
+	switch arch := ggml.KV().Architecture(); arch {
+	case "mllama":
+		// kv reads a uint32 vision hyperparameter; a missing key or a value of
+		// another type yields 0.
+		kv := func(n string) uint64 {
+			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
+				return uint64(v)
+			}
+
+			return 0
+		}
+
+		imageSize := kv("image_size")
+		patchSize := kv("patch_size")
+		if patchSize == 0 {
+			// No usable patch size: the patch count is undefined and dividing
+			// by it would panic, so report the weights only.
+			break
+		}
+
+		maxNumTiles := kv("max_num_tiles")
+		embeddingLength := kv("embedding_length")
+		headCount := kv("attention.head_count")
+
+		numPatches := (imageSize / patchSize) * (imageSize / patchSize)
+		if _, ok := layers["v"]["class_embd"]; ok {
+			numPatches++
+		}
+
+		// Round the patch count up to the next multiple of 8; a count that is
+		// already a multiple of 8 gets no extra padding.
+		numPaddedPatches := numPatches + (8-numPatches%8)%8
+
+		graphSize = 4 * (8 +
+			imageSize*imageSize*kv("num_channels")*maxNumTiles +
+			embeddingLength*numPatches*maxNumTiles +
+			9*embeddingLength*numPaddedPatches*maxNumTiles +
+			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
+	}
+
+	return weights, graphSize
+}
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 )

 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	}

 	// Simple CPU scenario
-	gpus := []gpu.GpuInfo{
+	gpus := []discover.GpuInfo{
 		{
 			Library: "cpu",
 		},
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {

 	// Dual CUDA scenario with assymetry
 	gpuMinimumMemory := uint64(2048)
-	gpus = []gpu.GpuInfo{
+	gpus = []discover.GpuInfo{
 		{
 			Library:       "cuda",
 			MinimumMemory: gpuMinimumMemory,
--- a/llm/patches/0000-cmakelist.patch
+++ b/llm/patches/0000-cmakelist.patch
@@ -1,4 +1,4 @@
-From 8b8d83ffca775840acc5dc700f3b3703e9f5cfe4 Mon Sep 17 00:00:00 2001
+From 7a3555098d4591c9b329c677654497ed8cee07ec Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Fri, 23 Aug 2024 11:27:48 -0700
 Subject: [PATCH] patch cmakelist
@@ -8,15 +8,15 @@ Subject: [PATCH] patch cmakelist
 1 file changed, 2 insertions(+)

 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index a3132063..6a2a9912 100644
+index 415743c2..aaadd13e 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -199,3 +199,5 @@ if (LLAMA_BUILD_EXAMPLES)
+@@ -210,3 +210,5 @@ if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
 endif()
 +
 +add_subdirectory(../ext_server ext_server) # ollama
 -- 
-2.45.2
+2.39.3 (Apple Git-146)

--- a/llm/patches/0001-load-progress.patch
+++ b/llm/patches/0001-load-progress.patch
@@ -1,7 +1,7 @@
-From 2cfaa0a04faa9c87ba8f1ac8527eb953e69c6cde Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:10 -0700
-Subject: [PATCH] 01-load-progress.diff
+From c97ed60c3369294d5551ba099a88ddc509687df1 Mon Sep 17 00:00:00 2001
+From: Gabe Goodhart <ghart@us.ibm.com>
+Date: Thu, 19 Sep 2024 16:55:15 -0600
+Subject: [PATCH] patch load progress

 ---
 common/common.cpp | 2 ++
@@ -9,10 +9,10 @@ Subject: [PATCH] 01-load-progress.diff
 2 files changed, 9 insertions(+)

 diff --git a/common/common.cpp b/common/common.cpp
-index 9fa18472..48ff41e9 100644
+index 8d0ed4f9..a09e8a53 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2573,6 +2573,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -955,6 +955,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
@@ -22,11 +22,11 @@ index 9fa18472..48ff41e9 100644
         mparams.kv_overrides = NULL;
     } else {
 diff --git a/common/common.h b/common/common.h
-index cb5e7f6d..d8f043f7 100644
+index cb87c447..818a4a4a 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -204,6 +204,13 @@ struct gpt_params {
-     std::string mmproj = "";        // path to multimodal projector
+@@ -266,6 +266,13 @@ struct gpt_params {
+     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
 +    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -40,5 +40,5 @@ index cb5e7f6d..d8f043f7 100644
     bool embedding         = false; // get only sentence embedding
     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0002-clip-log.patch
+++ b/llm/patches/0002-clip-log.patch
@@ -1,14 +1,14 @@
-From ba4bba80a744f76ac67b8234451c259a3c5da83b Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:11 -0700
-Subject: [PATCH] 02-clip-log.diff
+From 6fdf4268e13e56f0050fa6a29b029cbd54be49d2 Mon Sep 17 00:00:00 2001
+From: Gabe Goodhart <ghart@us.ibm.com>
+Date: Thu, 19 Sep 2024 16:58:03 -0600
+Subject: [PATCH] clip log

 ---
 examples/llava/clip.cpp | 1 +
 1 file changed, 1 insertion(+)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 9b890571..cb51793d 100644
+index 8aa7b075..b8941c74 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
@@ -16,9 +16,9 @@ index 9b890571..cb51793d 100644
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 +#include "common.h"
- #include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
+ #include "ggml-backend.h"
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0003-load_exception.patch
+++ b/llm/patches/0003-load_exception.patch
@@ -1,17 +1,17 @@
-From e43bfd3f607a6dfcaba2d490d35f412a52e55e30 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:12 -0700
-Subject: [PATCH] 03-load_exception.diff
+From 4f2b9cd0f012c49f40d0784454864ad41ca418b2 Mon Sep 17 00:00:00 2001
+From: Gabe Goodhart <ghart@us.ibm.com>
+Date: Thu, 19 Sep 2024 17:00:28 -0600
+Subject: [PATCH] load exception

 ---
 src/llama.cpp | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index 88355971..926bb71a 100644
+index af8afd84..4d1db3d5 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -8635,7 +8635,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+@@ -8871,7 +8871,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@@ -19,8 +19,8 @@ index 88355971..926bb71a 100644
 +        throw;
     }
 
-     return 0;
-@@ -18022,16 +18022,23 @@ struct llama_model * llama_load_model_from_file(
+     // loading time will be recalculate after the first eval, so
+@@ -18675,16 +18675,23 @@ struct llama_model * llama_load_model_from_file(
         }
         model->rpc_servers.push_back(servers);
     }
@@ -53,5 +53,5 @@ index 88355971..926bb71a 100644
 
     return model;
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0004-metal.patch
+++ b/llm/patches/0004-metal.patch
@@ -1,24 +1,24 @@
-From 29411d9a9d2b6a0af6425ffe88498f17f71f7d5d Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:12 -0700
-Subject: [PATCH] 04-metal.diff
+From 91d3f886f1645b38d9658c0e125603e8d5338146 Mon Sep 17 00:00:00 2001
+From: nobody <>
+Date: Tue, 1 Oct 2024 13:55:01 -0600
+Subject: [PATCH] metal

 ---
 ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

 diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 91b5e61b..9cfa72ac 100644
+index 9da08fe2..3a433703 100644
 --- a/ggml/src/ggml-metal.m
 +++ b/ggml/src/ggml-metal.m
-@@ -1734,27 +1734,23 @@ static enum ggml_status ggml_metal_graph_compute(
-                         // to the matrix-vector kernel
-                         int ne11_mm_min = 1;
+@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
+                 // to the matrix-vector kernel
+                 int ne11_mm_min = 1;
 
 -#if 0
-                         // the numbers below are measured on M2 Ultra for 7B and 13B models
-                         // these numbers do not translate to other devices or model sizes
-                         // TODO: need to find a better approach
+                 // the numbers below are measured on M2 Ultra for 7B and 13B models
+                 // these numbers do not translate to other devices or model sizes
+                 // TODO: need to find a better approach
 -                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
 -                            switch (src0t) {
 -                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
@@ -53,5 +53,5 @@ index 91b5e61b..9cfa72ac 100644
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0005-default-pretokenizer.patch
+++ b/llm/patches/0005-default-pretokenizer.patch
@@ -1,4 +1,4 @@
-From b298ac8614d1e38da28f760eb1d2ae8af0fbbe62 Mon Sep 17 00:00:00 2001
+From 0e531d69786c4a96a3a2bcf7b2d576bd6f7edf25 Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Mon, 16 Sep 2024 15:53:13 -0700
 Subject: [PATCH] 05-default-pretokenizer.diff
@@ -8,10 +8,10 @@ Subject: [PATCH] 05-default-pretokenizer.diff
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index 926bb71a..d1e959fc 100644
+index 4c0a1bb6..800dfb95 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
+@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@@ -29,9 +29,9 @@ index 926bb71a..d1e959fc 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "exaone") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
+                 vocab.tokenizer_add_bos = true;
+                 vocab.tokenizer_clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
@@ -40,5 +40,5 @@ index 926bb71a..d1e959fc 100644
         } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0006-embeddings.patch
+++ b/llm/patches/0006-embeddings.patch
@@ -1,17 +1,17 @@
-From c9a6ca9fc039233dee746a4da9705762cd9e515d Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:14 -0700
-Subject: [PATCH] 06-embeddings.diff
+From 235b6d876a74cb09abe26985fa89ebe5bfc9f562 Mon Sep 17 00:00:00 2001
+From: Gabe Goodhart <ghart@us.ibm.com>
+Date: Thu, 19 Sep 2024 17:06:17 -0600
+Subject: [PATCH] embeddings

 ---
- src/llama.cpp | 17 ++++++++++-------
- 1 file changed, 10 insertions(+), 7 deletions(-)
+ src/llama.cpp | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index d1e959fc..f79bd782 100644
+index 1a8e0c51..e55ec3f8 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -15898,7 +15898,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -16516,7 +16516,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
@@ -20,7 +20,7 @@ index d1e959fc..f79bd782 100644
     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -16167,20 +16167,23 @@ static int llama_decode_internal(
+@@ -16794,20 +16794,23 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
@@ -30,11 +30,10 @@ index d1e959fc..f79bd782 100644
 +        }
 +
 +        if (cparams.embeddings) {
-             for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
-+                embd = gf->nodes[i];
-+                if (strcmp(embd->name, "result_embd_pooled") == 0) {
+             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                embd = ggml_graph_node(gf, i);
+                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+-                    embd = ggml_graph_node(gf, i);
                     break;
                 }
             }
@@ -51,5 +50,5 @@ index d1e959fc..f79bd782 100644
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0007-clip-unicode.patch
+++ b/llm/patches/0007-clip-unicode.patch
@@ -1,17 +1,17 @@
-From ae2b188a679c83ce105aa1e823499441dfab3c57 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:15 -0700
-Subject: [PATCH] 07-clip-unicode.diff
+From 01c42149cbdc194644a2f138598029938e0dd447 Mon Sep 17 00:00:00 2001
+From: Gabe Goodhart <ghart@us.ibm.com>
+Date: Thu, 19 Sep 2024 17:09:57 -0600
+Subject: [PATCH] clip unicode

 ---
 examples/llava/clip.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index cb51793d..8716472b 100644
+index b8941c74..3a735f17 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -41,6 +41,14 @@
+@@ -40,6 +40,14 @@
 #include <cinttypes>
 #include <limits>
 
@@ -23,10 +23,10 @@ index cb51793d..8716472b 100644
 +#include <windows.h>
 +#endif
 +
- //#define CLIP_DEBUG_FUNCTIONS
- 
- // RGB uint8 image
-@@ -1223,7 +1231,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+ #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+ #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+ #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+@@ -1227,7 +1235,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
 
@@ -47,8 +47,8 @@ index cb51793d..8716472b 100644
         auto fin = std::ifstream(fname, std::ios::binary);
 +#endif
         if (!fin) {
-             LOG_TEE("cannot open model file for loading tensors\n");
+             LOG_ERR("cannot open model file for loading tensors\n");
             clip_free(new_clip);
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/patches/0008-solar-pro.patch
+++ b/llm/patches/0008-solar-pro.patch
@@ -1,4 +1,4 @@
-From 8313ce5f43f11f3d84f352f97f3802792e90e18c Mon Sep 17 00:00:00 2001
+From a8fe40fa7b026d2db9bb6aeecd24fcd2027110ec Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Mon, 16 Sep 2024 15:53:16 -0700
 Subject: [PATCH] add solar-pro support
@@ -11,40 +11,40 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
 tensor with 2 elements dervied from the model's bskcn_tv configuration.
 in general, the values are (bskcn_tv, 1 - bskcn_tv)
 ---
- src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 254 insertions(+), 13 deletions(-)
+ src/llama.cpp | 270 +++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 255 insertions(+), 15 deletions(-)

 diff --git a/src/llama.cpp b/src/llama.cpp
-index f79bd782..b7771f53 100644
+index 4c0a1bb6..c6fc0c3f 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -213,6 +213,7 @@ enum llm_arch {
-     LLM_ARCH_NEMOTRON,
-     LLM_ARCH_EXAONE,
-     LLM_ARCH_RWKV6,
+@@ -217,6 +217,7 @@ enum llm_arch {
+     LLM_ARCH_GRANITE,
+     LLM_ARCH_GRANITE_MOE,
+     LLM_ARCH_CHAMELEON,
 +    LLM_ARCH_SOLAR,
     LLM_ARCH_UNKNOWN,
 };
 
-@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_NEMOTRON,        "nemotron"     },
-     { LLM_ARCH_EXAONE,          "exaone"       },
-     { LLM_ARCH_RWKV6,           "rwkv6"        },
+@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+     { LLM_ARCH_GRANITE,         "granite"      },
+     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
+     { LLM_ARCH_CHAMELEON,       "chameleon"    },
 +    { LLM_ARCH_SOLAR,           "solar"        },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
-@@ -314,6 +316,7 @@ enum llm_kv {
-     LLM_KV_ATTENTION_KV_LORA_RANK,
+@@ -327,6 +329,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+     LLM_KV_ATTENTION_SCALE,
 +    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
-@@ -405,19 +408,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_TIME_MIX_EXTRA_DIM,                "%s.time_mix_extra_dim"                },
-     { LLM_KV_TIME_DECAY_EXTRA_DIM,              "%s.time_decay_extra_dim"              },
+@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+     { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
+     { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
 
 -    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
 -    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
@@ -59,6 +59,7 @@ index f79bd782..b7771f53 100644
 -    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
 -    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
 -    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+-    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
 +    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"               },
 +    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"            },
 +    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"           },
@@ -72,22 +73,24 @@ index f79bd782..b7771f53 100644
 +    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"             },
 +    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count"   },
 +    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"           },
+    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                    },
 +    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection.%d" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-@@ -589,6 +593,7 @@ enum llm_tensor {
-     LLM_TENSOR_ENC_FFN_DOWN,
-     LLM_TENSOR_ENC_FFN_UP,
+@@ -608,6 +612,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
+     LLM_TENSOR_CLS,
+     LLM_TENSOR_CLS_OUT,
 +    LLM_TENSOR_BSKCN_TV,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -1408,6 +1413,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
-             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,    "blk.%d.channel_mix_receptance" },
+@@ -1527,6 +1532,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
+             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
+
 +    {
 +        LLM_ARCH_SOLAR,
 +        {
@@ -109,7 +112,7 @@ index f79bd782..b7771f53 100644
     {
         LLM_ARCH_UNKNOWN,
         {
-@@ -2237,6 +2260,7 @@ enum e_model {
+@@ -2360,6 +2384,7 @@ enum e_model {
     MODEL_15B,
     MODEL_16B,
     MODEL_20B,
@@ -117,7 +120,7 @@ index f79bd782..b7771f53 100644
     MODEL_30B,
     MODEL_34B,
     MODEL_35B,
-@@ -2284,6 +2308,8 @@ struct llama_hparams {
+@@ -2409,6 +2434,8 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
@@ -126,7 +129,7 @@ index f79bd782..b7771f53 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
-@@ -2349,6 +2375,7 @@ struct llama_hparams {
+@@ -2479,6 +2506,7 @@ struct llama_hparams {
         if (this->n_head_arr    != other.n_head_arr)    return true;
         if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
         if (this->n_ff_arr      != other.n_ff_arr)      return true;
@@ -134,7 +137,7 @@ index f79bd782..b7771f53 100644
 
         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2455,6 +2482,14 @@ struct llama_hparams {
+@@ -2588,6 +2616,14 @@ struct llama_hparams {
             return ssm_d_state * ssm_d_inner;
         }
     }
@@ -149,7 +152,7 @@ index f79bd782..b7771f53 100644
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2635,6 +2670,8 @@ struct llama_layer {
+@@ -2769,6 +2805,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_gate_scale;
     struct ggml_tensor * ffn_up_scale;
     struct ggml_tensor * ffn_down_scale;
@@ -158,9 +161,9 @@ index f79bd782..b7771f53 100644
 };
 
 // very similar to llama_batch,
-@@ -5937,6 +5974,21 @@ static void llm_load_hparams(
+@@ -6134,6 +6172,21 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
-                 }
+                }
             } break;
 +        case LLM_ARCH_SOLAR:
 +            {
@@ -180,10 +183,15 @@ index f79bd782..b7771f53 100644
         default: (void)0;
     }
 
-@@ -8420,6 +8472,38 @@ static bool llm_load_tensors(
-                     }
+@@ -8839,6 +8892,37 @@ static bool llm_load_tensors(
 
-                 } break;
+                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
 +            case LLM_ARCH_SOLAR:
 +                {
 +                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -201,7 +209,6 @@ index f79bd782..b7771f53 100644
 +                        auto & layer = model.layers[i];
 +
 +                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-+
 +                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
 +                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
 +                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
@@ -211,15 +218,18 @@ index f79bd782..b7771f53 100644
 +
 +                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 +
-+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-+                    }
-+                } break;
-             default:
-                 throw std::runtime_error("unknown architecture");
-         }
-@@ -15173,6 +15257,158 @@ struct llm_build_context {
+                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+@@ -16009,7 +16093,6 @@ struct llm_build_context {
+ 
+         return gf;
+     }
+-
+     // ref: https://github.com/facebookresearch/chameleon
+     // based on the original build_llama() function, changes:
+     //   * qk-norm
+@@ -16187,6 +16270,158 @@ struct llm_build_context {
 
         return gf;
     }
@@ -378,9 +388,9 @@ index f79bd782..b7771f53 100644
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -15423,6 +15659,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16451,6 +16686,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
-                 result = llm.build_rwkv6();
+                 result = llm.build_chameleon();
             } break;
 +        case LLM_ARCH_SOLAR:
 +            {
@@ -389,14 +399,14 @@ index f79bd782..b7771f53 100644
         default:
             GGML_ABORT("fatal error");
     }
-@@ -18503,6 +18743,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_ARCTIC:
-         case LLM_ARCH_DEEPSEEK2:
-         case LLM_ARCH_CHATGLM:
+@@ -19594,6 +19833,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+         case LLM_ARCH_GRANITE:
+         case LLM_ARCH_GRANITE_MOE:
+         case LLM_ARCH_CHAMELEON:
 +        case LLM_ARCH_SOLAR:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
 -- 
-2.46.0
+2.39.3 (Apple Git-146)

--- a/llm/server.go
+++ b/llm/server.go
@@ -26,9 +26,9 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
 	estimate    MemoryEstimate
 	totalLayers uint64
 	// gpuCount     int
-	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-	loadDuration time.Duration   // Record how long it took the model to load
+	gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+	loadDuration time.Duration        // Record how long it took the model to load
 	loadProgress float32

 	sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {

 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -98,19 +98,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64

-	systemMemInfo, err := gpu.GetCPUMem()
-	if err != nil {
-		slog.Error("failed to lookup system memory", "error", err)
-	} else {
-		systemTotalMemory = systemMemInfo.TotalMemory
-		systemFreeMemory = systemMemInfo.FreeMemory
-		systemSwapFreeMemory = systemMemInfo.FreeSwap
-		slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
-	}
+	systemInfo := discover.GetSystemInfo()
+	systemTotalMemory = systemInfo.System.TotalMemory
+	systemFreeMemory = systemInfo.System.FreeMemory
+	systemSwapFreeMemory = systemInfo.System.FreeSwap
+	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
-		gpus = gpu.GetCPUInfo()
+		gpus = discover.GetCPUInfo()
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = runners.ServerForCpu()
@@ -126,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = runners.ServerForCpu()
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
 		}
@@ -193,8 +189,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--embedding",
 	}

-	params = append(params, "--log-disable")
-
 	if opts.NumGPU >= 0 {
 		params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
 	}
@@ -217,8 +211,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mmproj", projectors[0])
 	}

+	defaultThreads := systemInfo.GetOptimalThreadCount()
 	if opts.NumThread > 0 {
 		params = append(params, "--threads", strconv.Itoa(opts.NumThread))
+	} else if defaultThreads > 0 {
+		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}

 	if !opts.F16KV {
@@ -260,15 +257,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}

-	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
-		numaMode := "distribute"
-		if runtime.GOOS == "linux" {
-			if _, err := exec.LookPath("numactl"); err == nil {
-				numaMode = "numactl"
-			}
-		}
-		params = append(params, "--numa", numaMode)
-	}
+	// TODO - NUMA support currently doesn't work properly

 	params = append(params, "--parallel", strconv.Itoa(numParallel))

@@ -290,7 +279,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		if strings.HasPrefix(servers[i], "cpu") {
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		}

 		// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race
@@ -453,26 +442,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	return nil, finalErr
 }

-func projectorMemoryRequirements(filename string) uint64 {
-	file, err := os.Open(filename)
-	if err != nil {
-		return 0
-	}
-	defer file.Close()
-
-	ggml, _, err := DecodeGGML(file, 0)
-	if err != nil {
-		return 0
-	}
-
-	var mem uint64
-	for _, layer := range ggml.Tensors().Layers() {
-		mem += layer.size()
-	}
-
-	return mem
-}
-
 type ServerStatus int

 const ( // iota is reset to 0
@@ -684,8 +653,9 @@ ws ::= ([ \t\n] ws)?
 const maxBufferSize = 512 * format.KiloByte

 type ImageData struct {
-	Data []byte `json:"data"`
-	ID   int    `json:"id"`
+	Data          []byte `json:"data"`
+	ID            int    `json:"id"`
+	AspectRatioID int    `json:"aspect_ratio_id"`
 }

 type completion struct {