Merge branch 'ollama:main' into main

2025-12-23 23:18:26 +00:00 · 2024-07-23 14:49:32 +08:00
parent c44ff579a3 c78089263a
commit fbfc13b6ca
23 changed files with 594 additions and 141 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -31,7 +31,7 @@ jobs:
          security set-keychain-settings -lut 3600 build.keychain
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: Build Darwin
        env:
@@ -87,7 +87,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: go get ./...
      - run: |
@@ -141,7 +141,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install ROCm'
        run: |
@@ -218,7 +218,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install CUDA'
        run: |
@@ -306,7 +306,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: go get
      - uses: actions/download-artifact@v4
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -63,7 +63,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: go get ./...
      - run: |
@@ -163,7 +163,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install ROCm'
        run: |
@@ -200,7 +200,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - name: 'Install CUDA'
        run: |
@@ -255,7 +255,7 @@ jobs:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: false
      - run: |
          case ${{ matrix.arch }} in
@@ -297,7 +297,7 @@ jobs:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: "stable"
          cache: true
      - run: |
          case ${{ matrix.arch }} in
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG GOLANG_VERSION=1.22.1
+ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
-				envVars["OLLAMA_MAX_VRAM"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
 		"tokenizer.ggml.unknown_token_id": uint32(0),
 	}

+	if m.Params.HeadDimension > 0 {
+		kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension)
+		kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension)
+	}
+
 	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }

--- a/docs/api.md
+++ b/docs/api.md
@@ -1026,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object:
 ## Generate Embeddings

 ```shell
-POST /api/embeddings
+POST /api/embed
 ```

 Generate embeddings from a model
@@ -1034,10 +1034,11 @@ Generate embeddings from a model
 ### Parameters

 - `model`: name of model to generate embeddings from
- `prompt`: text to generate embeddings for
+- `input`: text or list of text to generate embeddings for

 Advanced parameters:

+- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

@@ -1046,9 +1047,9 @@ Advanced parameters:
 #### Request

 ```shell
-curl http://localhost:11434/api/embeddings -d '{
+curl http://localhost:11434/api/embed -d '{
  "model": "all-minilm",
-  "prompt": "Here is an article about llamas..."
+  "input": "Why is the sky blue?"
 }'
 ```

@@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{

 ```json
 {
-  "embedding": [
-    0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
-    0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
-  ]
+  "model": "all-minilm",
+  "embeddings": [[
+    0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
+    0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
+  ]]
+}
+```
+
+#### Request (Multiple input)
+
+```shell
+curl http://localhost:11434/api/embed -d '{
+  "model": "all-minilm",
+  "input": ["Why is the sky blue?", "Why is the grass green?"]
+}'
+```
+
+#### Response
+
+```json
+{
+  "model": "all-minilm",
+  "embeddings": [[
+    0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
+    0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
+  ],[
+    -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725,
+    0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481
+  ]]
 }
 ```

@@ -1106,3 +1132,45 @@ A single JSON object will be returned.
  ]
 }
 ```
+
+## Generate Embedding
+
+> Note: this endpoint has been superseded by `/api/embed`
+
+```shell
+POST /api/embeddings
+```
+
+Generate embeddings from a model
+
+### Parameters
+
+- `model`: name of model to generate embeddings from
+- `prompt`: text to generate embeddings for
+
+Advanced parameters:
+
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/embeddings -d '{
+  "model": "all-minilm",
+  "prompt": "Here is an article about llamas..."
+}'
+```
+
+#### Response
+
+```json
+{
+  "embedding": [
+    0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
+    0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
+  ]
+}
+```
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -43,8 +43,6 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MAX_VRAM in the environment
-	MaxVRAM uint64
 	// Set via OLLAMA_MODELS in the environment
 	ModelsDir string
 	// Set via OLLAMA_NOHISTORY in the environment
@@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
-		"OLLAMA_MAX_VRAM":          {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
@@ -194,16 +191,6 @@ func LoadConfig() {

 	TmpDir = clean("OLLAMA_TMPDIR")

-	userLimit := clean("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			MaxVRAM = avail
-		}
-	}
-
 	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

 	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
 	reqLimit := len(req)
 	iterLimit := 5

-	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if vram != "" {
 		max, err := strconv.ParseUint(vram, 10, 64)
 		require.NoError(t, err)
@@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {

 // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
 func TestMultiModelStress(t *testing.T) {
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if vram == "" {
 		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
 	}
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 2b9ace28..172640e2 100644
+index 8fe51971..7113ba64 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
+@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "jais") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                 vocab.tokenizer_clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
--- a/llm/patches/06-embeddings.diff
+++ b/llm/patches/06-embeddings.diff
--- a/llm/patches/06-qwen2.diff
+++ b/llm/patches/06-qwen2.diff
@@ -1,13 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 40d2ec2c..f34eb79a 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
-         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
-         cb(kq, "kq", il);
- 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
-+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
-             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
-             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
-             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
--- a/llm/patches/07-clip-unicode.diff
+++ b/llm/patches/07-clip-unicode.diff
--- a/llm/patches/08-pooling.diff
+++ b/llm/patches/08-pooling.diff
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@@ -0,0 +1,360 @@
+diff --git a/common/common.cpp b/common/common.cpp
+index dbb724fb..c26fe6ee 100644
+--- a/common/common.cpp
+++ b/common/common.cpp
+@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+         float lora_scale = std::get<1>(params.lora_adapter[i]);
+
+        // try to load as gguf
+         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+         if (adapter == nullptr) {
+-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+-            llama_free(lctx);
+-            llama_free_model(model);
+-            return std::make_tuple(nullptr, nullptr);
+            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+
+            // if that fails, try loading as ggla for compatibility
+            int err = llama_model_apply_lora_from_file(model,
+                                                    lora_adapter.c_str(),
+                                                    lora_scale,
+                                                    ((i > 0) || params.lora_base.empty())
+                                                        ? NULL
+                                                        : params.lora_base.c_str(),
+                                                    params.n_threads);
+            if (err != 0) {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                llama_free(lctx);
+                llama_free_model(model);
+                return std::make_tuple(nullptr, nullptr);
+            }
+        } else {
+            llama_lora_adapter_set(lctx, adapter, lora_scale);
+         }
+-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+     }
+ 
+     if (params.ignore_eos) {
+diff --git a/include/llama.h b/include/llama.h
+index 93fd77ca..b0fb37a6 100644
+--- a/include/llama.h
+++ b/include/llama.h
+@@ -1160,6 +1160,20 @@ extern "C" {
+ 
+     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+ 
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                            const char * path_lora,
+                                float   scale,
+                            const char * path_base_model,
+                                int32_t   n_threads);
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 80a0dd0f..9d7b0e17 100644
+--- a/src/llama.cpp
+++ b/src/llama.cpp
+@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
+     fputs(text, stderr);
+     fflush(stderr);
+ }
+
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
+) {
+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    llama_file fin(path_lora, "rb");
+
+    // verify the GGLA file magic and that the format version is 1; anything else is rejected with return code 1
+    {
+        uint32_t magic = fin.read_u32();
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
+
+        uint32_t format_version = fin.read_u32();
+        if (format_version != 1) {
+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r = fin.read_u32();
+    int32_t lora_alpha = fin.read_u32();
+    float scaling = scale * (float)lora_alpha / (float)lora_r; // NOTE(review): lora_r comes straight from the file and is not checked for 0 before this division
+
+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+    // optionally open a separate base model (path_base_model != NULL) to take the original weights from
+    std::unique_ptr<llama_model_loader> ml;
+    if (path_base_model) {
+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
+        ml->init_mappings(/*prefetch*/ false); // no prefetching
+    }
+
+    struct tensor_meta {
+        std::string name;
+        ggml_type type;
+        int32_t ne[2];
+        size_t offset;
+    };
+    std::map<std::string, tensor_meta> tensor_meta_map;
+
+    // first pass over the file: record name/type/shape/data offset of every tensor, skipping the tensor data itself
+    while (true) {
+        if (fin.tell() == fin.size) {
+            // eof
+            break;
+        }
+
+        int32_t n_dims;
+        int32_t name_len;
+        int32_t ftype;
+
+        fin.read_raw(&n_dims, sizeof(n_dims));
+        fin.read_raw(&name_len, sizeof(name_len));
+        fin.read_raw(&ftype, sizeof(ftype));
+
+        if (n_dims != 1 && n_dims != 2) {
+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read_raw(&ne[i], sizeof(ne[i]));
+        }
+
+        std::string name;
+        {
+            GGML_ASSERT(name_len < GGML_MAX_NAME); // NOTE(review): only the upper bound is checked; name_len is signed and read from the file - confirm a negative value cannot slip through
+            char buf[GGML_MAX_NAME];
+            fin.read_raw(buf, name_len);
+            name = std::string(buf, name_len);
+        }
+
+        // only tensors whose name ends in ".loraA" or ".loraB" are accepted
+        std::string lora_suffix;
+        if (name.length() > 6) {
+            lora_suffix = name.substr(name.length() - 6);
+        }
+        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        // tensor type: ftype 0 maps to F32, 1 maps to F16, everything else is an error
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32;  break;
+            case 1: wtype = GGML_TYPE_F16;  break;
+            default:
+                    {
+                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return 1;
+                    }
+        }
+
+        // data offset, rounded up to the next multiple of 32 bytes
+        size_t offset = fin.tell();
+        offset = (offset + 31) & -32;
+
+        // skip tensor data
+        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
+
+        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
+    }
+
+    bool warned = false;
+    int n_tensors = 0;
+
+    // apply: for every model tensor that has both a ".loraA" and a ".loraB" entry, compute the merged weights on the CPU backend and write them back
+    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+    if (backend_cpu == nullptr) {
+        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
+        return 1;
+    }
+    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
+
+    std::vector<no_init<uint8_t>> read_buf;
+    for (const auto & it : model.tensors_by_name) {
+        const std::string & base_name = it.first;
+        ggml_tensor * model_t = it.second;
+
+        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
+            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
+            continue;
+        }
+
+        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
+        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
+
+        ggml_init_params lora_init_params = {
+            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
+            /* .mem_buffer */ nullptr,
+            /* .no_alloc   */ true,
+        };
+        ggml_context * lora_ctx = ggml_init(lora_init_params);
+        if (lora_ctx == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        // create the loraA/loraB tensors with the type and shape recorded during the metadata pass
+        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
+        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
+        ggml_set_name(loraA, metaA.name.c_str());
+        ggml_set_name(loraB, metaB.name.c_str());
+
+        ggml_tensor * base_t;
+        if (ml) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
+                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                return 1; // NOTE(review): lora_ctx and backend_cpu are not released on this path
+            }
+            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
+        } else {
+            base_t = ggml_dup_tensor(lora_ctx, model_t);
+        }
+        ggml_set_name(base_t, base_name.c_str());
+
+        // allocate in backend buffer
+        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+        if (lora_buf == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
+            return 1; // NOTE(review): lora_ctx and backend_cpu are not released on this path
+        }
+
+        // load tensor data: seek to the offset recorded in the metadata pass and copy ggml_nbytes(tensor) bytes into the tensor
+        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
+            read_buf.resize(ggml_nbytes(tensor));
+            fin.seek(tensor_meta.offset, SEEK_SET);
+            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
+            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
+        };
+        load_tensor(metaA, loraA);
+        load_tensor(metaB, loraB);
+
+        // load base model tensor data: from the separate base model if one was given, otherwise copied from the loaded model tensor
+        if (ml) {
+            ml->load_data_for(base_t);
+        } else {
+            ggml_backend_tensor_copy(model_t, base_t);
+        }
+
+        if (ggml_is_quantized(base_t->type) && !warned) {
+            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                            "use a f16 or f32 base model with --lora-base\n", __func__);
+            warned = true;
+        }
+
+        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+            ggml_free(lora_ctx);
+            ggml_backend_buffer_free(lora_buf);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        auto build_lora_graph = [&]() {
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            ggml_set_name(BA, "BA");
+
+            if (scaling != 1.0f) {
+                BA = ggml_scale(lora_ctx, BA, scaling);
+                ggml_set_name(BA, "BA_scaled");
+            }
+
+            ggml_tensor * r;
+            r = ggml_add_inplace(lora_ctx, base_t, BA);
+            ggml_set_name(r, "r_add");
+
+            if (base_t->type != model_t->type) {
+                // convert the result to the model type
+                r = ggml_cast(lora_ctx, r, model_t->type);
+                ggml_set_name(r, "r_cast");
+            }
+
+            return r;
+        };
+
+        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_tensor * r = build_lora_graph();
+        ggml_build_forward_expand(gf, r);
+
+        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+        if (graph_buf == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
+            ggml_free(lora_ctx);
+            ggml_backend_buffer_free(lora_buf);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        ggml_backend_graph_compute(backend_cpu, gf);
+
+        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); // write the computed result back into the model's own tensor
+
+#if 0
+        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
+        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
+
+        // sched compute
+        ggml_build_forward_expand(gf, build_graph());
+        ggml_backend_sched_init_measure(sched, gf);
+
+        // create the graph again, since the previous one was destroyed by the measure
+        ggml_graph_clear(gf);
+        ggml_build_forward_expand(gf, build_graph());
+        ggml_backend_sched_graph_compute(sched, gf);
+        ggml_backend_sched_free(sched);
+#endif
+
+        ggml_backend_buffer_free(lora_buf);
+        ggml_backend_buffer_free(graph_buf);
+        ggml_free(lora_ctx);
+
+        n_tensors++;
+        if (n_tensors % 4 == 0) {
+            LLAMA_LOG_INFO("."); // progress indicator: one dot per four patched tensors
+        }
+    }
+
+    ggml_backend_free(backend_cpu);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
+    try { // public entry point: forwards to the internal implementation and returns its status code
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
+    } catch (const std::exception & err) { // any std::exception from the internal call is logged and reported as return code 1
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+\ No newline at end of file
--- a/llm/patches/10-tekken.diff
+++ b/llm/patches/10-tekken.diff
@@ -1,43 +0,0 @@
-diff --git a/include/llama.h b/include/llama.h
-index bb4b05ba..a92174e0 100644
--- a/include/llama.h
-+++ b/include/llama.h
-@@ -92,6 +92,7 @@ extern "C" {
-         LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-         LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-         LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-+        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-     };
- 
-     // note: these values should be synchronized with ggml_rope
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 18364976..435b6fe5 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
-             } else if (
-                 tokenizer_pre == "jais") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
-+            } else if (
-+                tokenizer_pre == "tekken") {
-+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
-+                vocab.tokenizer_clean_spaces = false;
-+                vocab.tokenizer_ignore_merges = true;
-+                vocab.tokenizer_add_bos = true;
-             } else {
-                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
-                     " ?[^(\\s|.,!?…。，、।۔،)]+",
-                 };
-                 break;
-+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
-+                    // original regex from tokenizer.json
-+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-+                regex_exprs = {
-+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-+                };
-+                break;
-             default:
-                 // default regex for BPE tokenization pre-processing
-                 regex_exprs = {
--- a/llm/patches/11-embd_kv.diff
+++ b/llm/patches/11-embd_kv.diff
@@ -1,19 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 2b9ace28..e60d3d8d 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6052,10 +6052,10 @@ static bool llm_load_tensors(
- 
-                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd,  n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
- 
-                         // optional bias tensors
-                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
--- a/llm/server.go
+++ b/llm/server.go
@@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 		// reap subprocess when it exits
 		go func() {
-			s.done <- s.cmd.Wait()
+			err := s.cmd.Wait()
+			// Prefer the runner's last reported error message over the bare process exit status.
+			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+				slog.Debug("llama runner terminated", "error", err)
+				if strings.Contains(s.status.LastErrMsg, "unknown model") {
+					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+				}
+				s.done <- fmt.Errorf("%s", s.status.LastErrMsg) // the message is data, not a format string: a literal '%' in it must not be parsed as a verb
+			} else {
+				s.done <- err
+			}
 		}()

 		return s, nil
@@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			slog.Warn("client connection closed before server finished loading, aborting load")
 			return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
 		case err := <-s.done:
-			msg := ""
-			if s.status != nil && s.status.LastErrMsg != "" {
-				msg = s.status.LastErrMsg
-			}
-			if strings.Contains(msg, "unknown model") {
-				return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade")
-			}
-			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
+			return fmt.Errorf("llama runner process has terminated: %w", err)
 		default:
 		}
 		if time.Now().After(stallTimer) {
--- a/server/model.go
+++ b/server/model.go
@@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 		}
 	}

+	if name == "" || arguments == "" {
+		return nil, false
+	}
+
 	var objs []map[string]any
 	for offset := 0; offset < len(s); {
 		var obj map[string]any
@@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 			return nil, false
 		} else {
 			offset += int(decoder.InputOffset())
-			objs = append(objs, obj)
+
+			// collect all nested objects
+			var collect func(any) []map[string]any
+			collect = func(obj any) (all []map[string]any) {
+				switch o := obj.(type) {
+				case map[string]any:
+					all = append(all, o)
+					for _, v := range o {
+						all = append(all, collect(v)...)
+					}
+				case []any:
+					for _, v := range o {
+						all = append(all, collect(v)...)
+					}
+				}
+
+				return all
+			}
+			objs = append(objs, collect(obj)...)
 		}
 	}

 	var toolCalls []api.ToolCall
 	for _, kv := range objs {
-		var call api.ToolCall
-		for k, v := range kv {
-			switch k {
-			case name:
-				call.Function.Name = v.(string)
-			case arguments:
-				call.Function.Arguments = v.(map[string]any)
-			}
+		n, nok := kv[name].(string)
+		a, aok := kv[arguments].(map[string]any)
+		if nok && aok {
+			toolCalls = append(toolCalls, api.ToolCall{
+				Function: api.ToolCallFunction{
+					Name:      n,
+					Arguments: a,
+				},
+			})
 		}
-
-		toolCalls = append(toolCalls, call)
 	}

 	return toolCalls, len(toolCalls) > 0
--- a/server/model_test.go
+++ b/server/model_test.go
@@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
 {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
 {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
 </tool_call>`, true},
+		{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
 	}

 	var tools []api.Tool
--- a/server/routes.go
+++ b/server/routes.go
@@ -609,12 +609,11 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 		defer cancel()

 		quantization := cmp.Or(r.Quantize, r.Quantization)
-		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil {
-			if errors.Is(err, errBadTemplate) {
-			  ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
-			}
+		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) {
+			ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
+		} else if err != nil {
 			ch <- gin.H{"error": err.Error()}
-		  }
+		}
 	}()

 	if r.Stream != nil && !*r.Stream {
--- a/server/testdata/tools/xlam.gotmpl
+++ b/server/testdata/tools/xlam.gotmpl
@@ -0,0 +1,45 @@
+{{- if .System }}{{ .System }}
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- if eq .Role "user" }}### Instruction:
+{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }}
+[BEGIN OF TASK INSTRUCTION]
+You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
+If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out.
+[END OF TASK INSTRUCTION]
+
+[BEGIN OF AVAILABLE TOOLS]
+{{ $.Tools }}
+[END OF AVAILABLE TOOLS]
+
+[BEGIN OF FORMAT INSTRUCTION]
+The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
+The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
+```
+{
+    "tool_calls": [
+    {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
+    ... (more tool calls as required)
+    ]
+}
+```
+[END OF FORMAT INSTRUCTION]
+
+[BEGIN OF QUERY]
+{{ .Content }}
+[END OF QUERY]
+
+
+{{ else }}
+{{ .Content }}
+{{ end }}
+{{- else if .ToolCalls }}### Response:
+{{/* comma-separate entries so a multi-call history renders as valid JSON */}}{"tool_calls": [{{ range $j, $_ := .ToolCalls }}{{ if $j }}, {{ end }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]}
+<|EOT|>
+{{ else if eq .Role "assistant" }}### Response:
+{{ .Content }}
+<|EOT|>
+{{ end }}
+{{- end }}### Response:
--- a/server/testdata/tools/xlam.out
+++ b/server/testdata/tools/xlam.out
@@ -0,0 +1,40 @@
+You are a knowledgable assistant. You can answer questions and perform tasks.
+### Instruction:
+What's the weather like today in Paris?
+### Response:
+{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]}
+<|EOT|>
+### Response:
+The current temperature in Paris, France is 22 degrees Celsius.
+<|EOT|>
+### Instruction:
+[BEGIN OF TASK INSTRUCTION]
+You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
+If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out.
+[END OF TASK INSTRUCTION]
+
+[BEGIN OF AVAILABLE TOOLS]
+[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]
+[END OF AVAILABLE TOOLS]
+
+[BEGIN OF FORMAT INSTRUCTION]
+The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
+The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
+```
+{
+    "tool_calls": [
+    {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
+    ... (more tool calls as required)
+    ]
+}
+```
+[END OF FORMAT INSTRUCTION]
+
+[BEGIN OF QUERY]
+What's the weather like today in San Francisco and Toronto?
+[END OF QUERY]
+
+
+### Response: