diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c3..f0c6db5d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -218,7 +218,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -306,7 +306,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get - uses: actions/download-artifact@v4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 90fef6e5..5e002a22 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: | case ${{ matrix.arch }} in diff --git a/Dockerfile b/Dockerfile index ca393496..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 diff --git a/cmd/cmd.go b/cmd/cmd.go index 2252a905..b761d018 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], - envVars["OLLAMA_MAX_VRAM"], }) default: appendEnvDocs(cmd, envs) diff --git a/convert/mistral.go b/convert/mistral.go index da6874cf..8fe066d6 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { "tokenizer.ggml.unknown_token_id": uint32(0), } + if m.Params.HeadDimension > 0 { + kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) + kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) + } + return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) } diff --git a/docs/api.md b/docs/api.md index c577bb1a..4381c376 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1026,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object: ## Generate Embeddings ```shell -POST 
/api/embeddings +POST /api/embed ``` Generate embeddings from a model @@ -1034,10 +1034,11 @@ Generate embeddings from a model ### Parameters - `model`: name of model to generate embeddings from -- `prompt`: text to generate embeddings for +- `input`: text or list of text to generate embeddings for Advanced parameters: +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) @@ -1046,9 +1047,9 @@ Advanced parameters: #### Request ```shell -curl http://localhost:11434/api/embeddings -d '{ +curl http://localhost:11434/api/embed -d '{ "model": "all-minilm", - "prompt": "Here is an article about llamas..." + "input": "Why is the sky blue?" }' ``` @@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { - "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 - ] + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ]] +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ],[ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ]] } ``` @@ -1106,3 +1132,45 @@ A single JSON object will be returned. ] } ``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +```shell +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." 
+}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, + 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + ] +} +``` \ No newline at end of file diff --git a/envconfig/config.go b/envconfig/config.go index 62d661eb..0abc6968 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -43,8 +43,6 @@ var ( MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MAX_VRAM in the environment - MaxVRAM uint64 // Set via OLLAMA_MODELS in the environment ModelsDir string // Set via OLLAMA_NOHISTORY in the environment @@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, - "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, @@ -194,16 +191,6 @@ func LoadConfig() { TmpDir = clean("OLLAMA_TMPDIR") - userLimit := clean("OLLAMA_MAX_VRAM") - if userLimit != "" { - avail, err := strconv.ParseUint(userLimit, 10, 64) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err) - } else { - MaxVRAM = avail - } - } - LLMLibrary = clean("OLLAMA_LLM_LIBRARY") if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index d66ba9f0..8593285b 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram != "" { max, err := strconv.ParseUint(vram, 10, 64) require.NoError(t, err) @@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit func TestMultiModelStress(t *testing.T) { - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram == "" { t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test") } diff --git a/llm/llama.cpp b/llm/llama.cpp index a8db2a9c..d94c6e0c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 +Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 341a6f59..646bc49c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..172640e2 100644 +index 8fe51971..7113ba64 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5357,16 +5357,7 @@ static void llm_load_vocab( +@@ -5433,16 +5433,7 @@ static void llm_load_vocab( if (vocab.type == 
LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5439,7 +5430,8 @@ static void llm_load_vocab( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; +@@ -5526,7 +5517,8 @@ static void llm_load_vocab( + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; + vocab.tokenizer_clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/07-embeddings.diff b/llm/patches/06-embeddings.diff similarity index 100% rename from llm/patches/07-embeddings.diff rename to llm/patches/06-embeddings.diff diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff deleted file mode 100644 index 1c7109f6..00000000 --- a/llm/patches/06-qwen2.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 40d2ec2c..f34eb79a 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv( - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - -- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { -+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { - // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs - // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/07-clip-unicode.diff similarity index 100% rename from llm/patches/08-clip-unicode.diff rename to llm/patches/07-clip-unicode.diff diff --git a/llm/patches/09-pooling.diff b/llm/patches/08-pooling.diff similarity index 100% rename from llm/patches/09-pooling.diff rename to llm/patches/08-pooling.diff diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff new file mode 100644 index 00000000..fc1017a6 --- /dev/null +++ b/llm/patches/09-lora.diff @@ -0,0 +1,360 @@ +diff --git a/common/common.cpp b/common/common.cpp +index dbb724fb..c26fe6ee 100644 +--- a/common/common.cpp ++++ b/common/common.cpp +@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); ++ ++ // try to load as gguf + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { +- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +- llama_free(lctx); +- llama_free_model(model); +- return std::make_tuple(nullptr, nullptr); ++ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); ++ ++ // if that fails, try loading as ggla for compatibility ++ int err = llama_model_apply_lora_from_file(model, ++ lora_adapter.c_str(), ++ lora_scale, ++ ((i > 0) || params.lora_base.empty()) ++ ? 
NULL ++ : params.lora_base.c_str(), ++ params.n_threads); ++ if (err != 0) { ++ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); ++ llama_free(lctx); ++ llama_free_model(model); ++ return std::make_tuple(nullptr, nullptr); ++ } ++ } else { ++ llama_lora_adapter_set(lctx, adapter, lora_scale); + } +- llama_lora_adapter_set(lctx, adapter, lora_scale); + } + + if (params.ignore_eos) { +diff --git a/include/llama.h b/include/llama.h +index 93fd77ca..b0fb37a6 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -1160,6 +1160,20 @@ extern "C" { + + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + ++ // Apply a LoRA adapter to a loaded model ++ // path_base_model is the path to a higher quality model to use as a base for ++ // the layers modified by the adapter. Can be NULL to use the current loaded model. ++ // The model needs to be reloaded before applying a new adapter, otherwise the adapter ++ // will be applied on top of the previous one ++ // Returns 0 on success ++ LLAMA_API int32_t llama_model_apply_lora_from_file( ++ const struct llama_model * model, ++ const char * path_lora, ++ float scale, ++ const char * path_base_model, ++ int32_t n_threads); ++ ++ + #ifdef __cplusplus + } + #endif +diff --git a/src/llama.cpp b/src/llama.cpp +index 80a0dd0f..9d7b0e17 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, + fputs(text, stderr); + fflush(stderr); + } ++ ++static int llama_apply_lora_from_file_internal( ++ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ++) { ++ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); ++ ++ const int64_t t_start_lora_us = ggml_time_us(); ++ ++ llama_file fin(path_lora, "rb"); ++ ++ // verify magic and version ++ { ++ uint32_t magic = fin.read_u32(); ++ if (magic != LLAMA_FILE_MAGIC_GGLA) { ++ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); ++ return 1; ++ } ++ ++ uint32_t format_version = fin.read_u32(); ++ if (format_version != 1) { ++ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); ++ return 1; ++ } ++ } ++ ++ int32_t lora_r = fin.read_u32(); ++ int32_t lora_alpha = fin.read_u32(); ++ float scaling = scale * (float)lora_alpha / (float)lora_r; ++ ++ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); ++ ++ // load base model ++ std::unique_ptr ml; ++ if (path_base_model) { ++ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ++ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); ++ ml->init_mappings(/*prefetch*/ false); // no prefetching ++ } ++ ++ struct tensor_meta { ++ std::string name; ++ ggml_type type; ++ int32_t ne[2]; ++ size_t offset; ++ }; ++ std::map tensor_meta_map; ++ ++ // load all tensor meta ++ while (true) { ++ if (fin.tell() == fin.size) { ++ // eof ++ break; ++ } ++ ++ int32_t n_dims; ++ int32_t name_len; ++ int32_t ftype; ++ ++ fin.read_raw(&n_dims, sizeof(n_dims)); ++ fin.read_raw(&name_len, sizeof(name_len)); ++ fin.read_raw(&ftype, sizeof(ftype)); ++ ++ if (n_dims != 1 && n_dims != 2) { ++ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); ++ return 1; ++ } ++ ++ int32_t ne[2] = { 1, 1 }; ++ for (int i = 0; i < n_dims; ++i) { ++ fin.read_raw(&ne[i], sizeof(ne[i])); ++ 
} ++ ++ std::string name; ++ { ++ GGML_ASSERT(name_len < GGML_MAX_NAME); ++ char buf[GGML_MAX_NAME]; ++ fin.read_raw(buf, name_len); ++ name = std::string(buf, name_len); ++ } ++ ++ // check for lora suffix ++ std::string lora_suffix; ++ if (name.length() > 6) { ++ lora_suffix = name.substr(name.length() - 6); ++ } ++ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { ++ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); ++ return 1; ++ } ++ ++ // tensor type ++ ggml_type wtype; ++ switch (ftype) { ++ case 0: wtype = GGML_TYPE_F32; break; ++ case 1: wtype = GGML_TYPE_F16; break; ++ default: ++ { ++ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", ++ __func__, ftype); ++ return 1; ++ } ++ } ++ ++ // data offset ++ size_t offset = fin.tell(); ++ offset = (offset + 31) & -32; ++ ++ // skip tensor data ++ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); ++ ++ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); ++ } ++ ++ bool warned = false; ++ int n_tensors = 0; ++ ++ // apply ++ ggml_backend_t backend_cpu = ggml_backend_cpu_init(); ++ if (backend_cpu == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); ++ return 1; ++ } ++ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); ++ ++ std::vector> read_buf; ++ for (const auto & it : model.tensors_by_name) { ++ const std::string & base_name = it.first; ++ ggml_tensor * model_t = it.second; ++ ++ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || ++ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { ++ continue; ++ } ++ ++ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); ++ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); ++ ++ ggml_init_params lora_init_params = { ++ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), ++ /* .mem_buffer */ nullptr, ++ /* .no_alloc */ true, ++ }; ++ ggml_context * lora_ctx = ggml_init(lora_init_params); ++ if (lora_ctx == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ // create tensors ++ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); ++ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); ++ ggml_set_name(loraA, metaA.name.c_str()); ++ ggml_set_name(loraB, metaB.name.c_str()); ++ ++ ggml_tensor * base_t; ++ if (ml) { ++ if (!ml->get_tensor_meta(base_name.c_str())) { ++ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); ++ return 1; ++ } ++ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); ++ } else { ++ base_t = ggml_dup_tensor(lora_ctx, model_t); ++ } ++ ggml_set_name(base_t, base_name.c_str()); ++ ++ // allocate in backend buffer ++ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (lora_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); ++ return 1; ++ } ++ ++ // load tensor data ++ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { ++ read_buf.resize(ggml_nbytes(tensor)); ++ fin.seek(tensor_meta.offset, SEEK_SET); ++ fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); ++ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); ++ }; ++ load_tensor(metaA, loraA); ++ 
load_tensor(metaB, loraB); ++ ++ // load base model tensor data ++ if (ml) { ++ ml->load_data_for(base_t); ++ } else { ++ ggml_backend_tensor_copy(model_t, base_t); ++ } ++ ++ if (ggml_is_quantized(base_t->type) && !warned) { ++ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " ++ "use a f16 or f32 base model with --lora-base\n", __func__); ++ warned = true; ++ } ++ ++ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { ++ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" ++ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ auto build_lora_graph = [&]() { ++ // w = w + BA*s ++ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ++ ggml_set_name(BA, "BA"); ++ ++ if (scaling != 1.0f) { ++ BA = ggml_scale(lora_ctx, BA, scaling); ++ ggml_set_name(BA, "BA_scaled"); ++ } ++ ++ ggml_tensor * r; ++ r = ggml_add_inplace(lora_ctx, base_t, BA); ++ ggml_set_name(r, "r_add"); ++ ++ if (base_t->type != model_t->type) { ++ // convert the result to the model type ++ r = ggml_cast(lora_ctx, r, model_t->type); ++ ggml_set_name(r, "r_cast"); ++ } ++ ++ return r; ++ }; ++ ++ ggml_cgraph * gf = ggml_new_graph(lora_ctx); ++ ggml_tensor * r = build_lora_graph(); ++ ggml_build_forward_expand(gf, r); ++ ++ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (graph_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ ggml_backend_graph_compute(backend_cpu, gf); ++ ++ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); ++ ++#if 0 ++ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU ++ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); ++ ++ // sched compute ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_init_measure(sched, gf); ++ ++ // create the graph again, since the previous one was destroyed by the measure ++ ggml_graph_clear(gf); ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_graph_compute(sched, gf); ++ ggml_backend_sched_free(sched); ++#endif ++ ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_buffer_free(graph_buf); ++ ggml_free(lora_ctx); ++ ++ n_tensors++; ++ if (n_tensors % 4 == 0) { ++ LLAMA_LOG_INFO("."); ++ } ++ } ++ ++ ggml_backend_free(backend_cpu); ++ ++ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; ++ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); ++ ++ return 0; ++} ++ ++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { ++ try { ++ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); ++ } catch (const std::exception & err) { ++ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); ++ return 1; ++ } ++} +\ No newline at end of file diff --git a/llm/patches/10-tekken.diff b/llm/patches/10-tekken.diff deleted file mode 100644 index 56a583e0..00000000 --- a/llm/patches/10-tekken.diff +++ /dev/null @@ -1,43 +0,0 @@ -diff --git 
a/include/llama.h b/include/llama.h -index bb4b05ba..a92174e0 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -92,6 +92,7 @@ extern "C" { - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, -+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - }; - - // note: these values should be synchronized with ggml_rope -diff --git a/src/llama.cpp b/src/llama.cpp -index 18364976..435b6fe5 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -5429,6 +5429,12 @@ static void llm_load_vocab( - } else if ( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; -+ } else if ( -+ tokenizer_pre == "tekken") { -+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; -+ vocab.tokenizer_clean_spaces = false; -+ vocab.tokenizer_ignore_merges = true; -+ vocab.tokenizer_add_bos = true; - } else { - LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }; - break; -+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN: -+ // original regex from tokenizer.json -+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" -+ regex_exprs = { -+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", -+ }; -+ break; - default: - // default regex for BPE tokenization pre-processing - regex_exprs = { diff --git a/llm/patches/11-embd_kv.diff b/llm/patches/11-embd_kv.diff deleted file mode 100644 index ad17a700..00000000 --- a/llm/patches/11-embd_kv.diff +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..e60d3d8d 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6052,10 +6052,10 @@ static bool llm_load_tensors( - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - -- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); -- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); -- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); -- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); -+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); -+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); -+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); -+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - - // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); diff --git a/llm/server.go b/llm/server.go index ba7eab03..08463ef0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // reap subprocess when it exits go func() { - s.done <- s.cmd.Wait() + err := s.cmd.Wait() + // Favor a more detailed message 
over the process exit status + if err != nil && s.status != nil && s.status.LastErrMsg != "" { + slog.Debug("llama runner terminated", "error", err) + if strings.Contains(s.status.LastErrMsg, "unknown model") { + s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade" + } + s.done <- fmt.Errorf(s.status.LastErrMsg) + } else { + s.done <- err + } }() return s, nil @@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { slog.Warn("client connection closed before server finished loading, aborting load") return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err()) case err := <-s.done: - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - if strings.Contains(msg, "unknown model") { - return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade") - } - return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) + return fmt.Errorf("llama runner process has terminated: %w", err) default: } if time.Now().After(stallTimer) { diff --git a/server/model.go b/server/model.go index a084dd8c..bf38c415 100644 --- a/server/model.go +++ b/server/model.go @@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { } } + if name == "" || arguments == "" { + return nil, false + } + var objs []map[string]any for offset := 0; offset < len(s); { var obj map[string]any @@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { return nil, false } else { offset += int(decoder.InputOffset()) - objs = append(objs, obj) + + // collect all nested objects + var collect func(any) []map[string]any + collect = func(obj any) (all []map[string]any) { + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + } + + return all + } + objs = append(objs, collect(obj)...) 
} } var toolCalls []api.ToolCall for _, kv := range objs { - var call api.ToolCall - for k, v := range kv { - switch k { - case name: - call.Function.Name = v.(string) - case arguments: - call.Function.Arguments = v.(map[string]any) - } + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) } - - toolCalls = append(toolCalls, call) } return toolCalls, len(toolCalls) > 0 diff --git a/server/model_test.go b/server/model_test.go index 7c826b06..5829adfc 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} `, true}, + {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, } var tools []api.Tool diff --git a/server/routes.go b/server/routes.go index 85db7924..e6ffe526 100644 --- a/server/routes.go +++ b/server/routes.go @@ -609,12 +609,11 @@ func (s *Server) CreateModelHandler(c *gin.Context) { defer cancel() quantization := cmp.Or(r.Quantize, r.Quantization) - if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { - if errors.Is(err, errBadTemplate) { - ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} - } + if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) { + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + } else if err != nil { ch <- gin.H{"error": err.Error()} - } + } }() if r.Stream != nil && !*r.Stream { diff --git a/server/testdata/tools/xlam.gotmpl b/server/testdata/tools/xlam.gotmpl new file mode 100644 index 00000000..51513d69 --- /dev/null +++ b/server/testdata/tools/xlam.gotmpl @@ -0,0 +1,45 @@ +{{- if .System }}{{ .System }} +{{ end }} +{{- range $i, $_ := .Messages }} +{{- if eq .Role "user" }}### Instruction: +{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }} +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +{{ $.Tools }} +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... 
(more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +{{ .Content }} +[END OF QUERY] + + +{{ else }} +{{ .Content }} +{{ end }} +{{- else if .ToolCalls }}### Response: +{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]} +<|EOT|> +{{ else if eq .Role "assistant" }}### Response: +{{ .Content }} +<|EOT|> +{{ end }} +{{- end }}### Response: \ No newline at end of file diff --git a/server/testdata/tools/xlam.out b/server/testdata/tools/xlam.out new file mode 100644 index 00000000..a4a9952f --- /dev/null +++ b/server/testdata/tools/xlam.out @@ -0,0 +1,40 @@ +You are a knowledgable assistant. You can answer questions and perform tasks. +### Instruction: +What's the weather like today in Paris? +### Response: +{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]} +<|EOT|> +### Response: +The current temperature in Paris, France is 22 degrees Celsius. +<|EOT|> +### Instruction: +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}] +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +What's the weather like today in San Francisco and Toronto? +[END OF QUERY] + + +### Response: \ No newline at end of file
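
The docs/api.md changes above introduce the new `/api/embed` endpoint (batched `input`, the `truncate` flag, and an `embeddings` array in the response). As a quick illustration of the documented request/response shape, here is a minimal Go sketch of a client call; the struct and variable names are illustrative, assume a local Ollama server on the default port, and are not part of this repository's Go API client.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// Field names follow the /api/embed documentation added in docs/api.md.
type embedRequest struct {
	Model string   `json:"model"`
	Input []string `json:"input"`
}

type embedResponse struct {
	Model      string      `json:"model"`
	Embeddings [][]float64 `json:"embeddings"`
}

func main() {
	body, err := json.Marshal(embedRequest{
		Model: "all-minilm",
		Input: []string{"Why is the sky blue?", "Why is the grass green?"},
	})
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post("http://localhost:11434/api/embed", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out embedResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}

	// One embedding vector is returned per input string, in order.
	fmt.Println(out.Model, len(out.Embeddings))
}
```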
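
The server/model.go change replaces the flat field loop in `parseToolCalls` with a recursive walk that gathers every nested JSON object before matching the template's name/arguments keys, which is what lets the new xlam-style `{"tool_calls": [...]}` wrapper in the added test case parse. Below is a self-contained sketch of that walk, assuming hard-coded `"name"`/`"arguments"` keys (the real code derives them from the model template).

```go
package main

import (
	"encoding/json"
	"fmt"
)

// collect mirrors the recursive helper added to parseToolCalls: it walks a
// decoded JSON value and returns every object found at any nesting depth,
// so wrappers such as {"tool_calls": [...]} still expose the inner objects.
func collect(v any) []map[string]any {
	var all []map[string]any
	switch o := v.(type) {
	case map[string]any:
		all = append(all, o)
		for _, inner := range o {
			all = append(all, collect(inner)...)
		}
	case []any:
		for _, inner := range o {
			all = append(all, collect(inner)...)
		}
	}
	return all
}

func main() {
	raw := `{"tool_calls": [{"name": "get_current_weather", "arguments": {"location": "Paris, France", "format": "celsius"}}]}`

	var obj map[string]any
	if err := json.Unmarshal([]byte(raw), &obj); err != nil {
		panic(err)
	}

	// Only objects carrying both keys with the expected types become tool calls,
	// matching the nok/aok type assertions in the patched loop.
	for _, kv := range collect(obj) {
		if name, nok := kv["name"].(string); nok {
			if args, aok := kv["arguments"].(map[string]any); aok {
				fmt.Println(name, args)
			}
		}
	}
}
```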