From f02f83660c2e6f0741932bb31a28b82950144dfc Mon Sep 17 00:00:00 2001 From: lreed Date: Wed, 17 Jul 2024 21:44:19 +0000 Subject: [PATCH 1/9] bump go version to 1.22.5 to fix security vulnerabilities --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ca393496..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 From a3c20e3f181607760ee86893baaf31b3c7fd3012 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 08:52:16 -0700 Subject: [PATCH 2/9] Refine error reporting for subprocess crash On windows, the exit status winds up being the search term many users search for and end up piling in on issues that are unrelated. This refines the reporting so that if we have a more detailed message we'll suppress the exit status portion of the message. --- llm/server.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/llm/server.go b/llm/server.go index ba7eab03..08463ef0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // reap subprocess when it exits go func() { - s.done <- s.cmd.Wait() + err := s.cmd.Wait() + // Favor a more detailed message over the process exit status + if err != nil && s.status != nil && s.status.LastErrMsg != "" { + slog.Debug("llama runner terminated", "error", err) + if strings.Contains(s.status.LastErrMsg, "unknown model") { + s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade" + } + s.done <- fmt.Errorf(s.status.LastErrMsg) + } else { + s.done <- err + } }() return s, nil @@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { slog.Warn("client connection closed before server finished loading, aborting load") return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err()) case err := <-s.done: - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - if strings.Contains(msg, "unknown model") { - return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade") - } - return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) + return fmt.Errorf("llama runner process has terminated: %w", err) default: } if time.Now().After(stallTimer) { From cc269ba0943ee1fa0bddcce8027d0a6d1b86fec5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 09:08:11 -0700 Subject: [PATCH 3/9] Remove no longer supported max vram var The OLLAMA_MAX_VRAM env var was a temporary workaround for OOM scenarios. With Concurrency this was no longer wired up, and the simplistic value doesn't map to multi-GPU setups. Users can still set `num_gpu` to limit memory usage to avoid OOM if we get our predictions wrong. 
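For reference, a minimal sketch of the `num_gpu` alternative mentioned above (the model name and layer count are illustrative, not taken from this change): the option can be passed per request to cap how many layers are offloaded to GPU memory, which bounds VRAM use without a global limit.

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "options": {"num_gpu": 20}
}'
```

The same option can be set persistently in a Modelfile with `PARAMETER num_gpu 20`.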
--- cmd/cmd.go | 1 - envconfig/config.go | 13 ------------- integration/concurrency_test.go | 4 ++-- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 2252a905..b761d018 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], - envVars["OLLAMA_MAX_VRAM"], }) default: appendEnvDocs(cmd, envs) diff --git a/envconfig/config.go b/envconfig/config.go index 62d661eb..0abc6968 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -43,8 +43,6 @@ var ( MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MAX_VRAM in the environment - MaxVRAM uint64 // Set via OLLAMA_MODELS in the environment ModelsDir string // Set via OLLAMA_NOHISTORY in the environment @@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, - "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, @@ -194,16 +191,6 @@ func LoadConfig() { TmpDir = clean("OLLAMA_TMPDIR") - userLimit := clean("OLLAMA_MAX_VRAM") - if userLimit != "" { - avail, err := strconv.ParseUint(userLimit, 10, 64) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err) - } else { - MaxVRAM = avail - } - } - LLMLibrary = clean("OLLAMA_LLM_LIBRARY") if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index d66ba9f0..8593285b 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram != "" { max, err := strconv.ParseUint(vram, 10, 64) require.NoError(t, err) @@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit func TestMultiModelStress(t *testing.T) { - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram == "" { t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test") } From b3e5491e41811294de9d81649a96581af6522d08 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 12:38:03 -0400 Subject: [PATCH 4/9] server: collect nested tool call objects when parsing (#5824) --- server/model.go | 43 +++++++++++++++++++++-------- server/model_test.go | 1 + server/routes.go | 4 +-- server/testdata/tools/xlam.gotmpl | 45 +++++++++++++++++++++++++++++++ server/testdata/tools/xlam.out | 40 +++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 13 deletions(-) create mode 100644 server/testdata/tools/xlam.gotmpl create mode 100644 server/testdata/tools/xlam.out diff 
--git a/server/model.go b/server/model.go index a084dd8c..bf38c415 100644 --- a/server/model.go +++ b/server/model.go @@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { } } + if name == "" || arguments == "" { + return nil, false + } + var objs []map[string]any for offset := 0; offset < len(s); { var obj map[string]any @@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { return nil, false } else { offset += int(decoder.InputOffset()) - objs = append(objs, obj) + + // collect all nested objects + var collect func(any) []map[string]any + collect = func(obj any) (all []map[string]any) { + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + } + + return all + } + objs = append(objs, collect(obj)...) } } var toolCalls []api.ToolCall for _, kv := range objs { - var call api.ToolCall - for k, v := range kv { - switch k { - case name: - call.Function.Name = v.(string) - case arguments: - call.Function.Arguments = v.(map[string]any) - } + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) } - - toolCalls = append(toolCalls, call) } return toolCalls, len(toolCalls) > 0 diff --git a/server/model_test.go b/server/model_test.go index 7c826b06..5829adfc 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} `, true}, + {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, } var tools []api.Tool diff --git a/server/routes.go b/server/routes.go index 85db7924..0d7ca003 100644 --- a/server/routes.go +++ b/server/routes.go @@ -611,10 +611,10 @@ func (s *Server) CreateModelHandler(c *gin.Context) { quantization := cmp.Or(r.Quantize, r.Quantization) if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { if errors.Is(err, errBadTemplate) { - ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} } ch <- gin.H{"error": err.Error()} - } + } }() if r.Stream != nil && !*r.Stream { diff --git a/server/testdata/tools/xlam.gotmpl b/server/testdata/tools/xlam.gotmpl new file mode 100644 index 00000000..51513d69 --- /dev/null +++ b/server/testdata/tools/xlam.gotmpl @@ -0,0 +1,45 @@ +{{- if .System }}{{ .System }} +{{ end }} +{{- range $i, $_ := .Messages }} +{{- if eq .Role "user" }}### Instruction: +{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }} +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +{{ $.Tools }} +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +{{ .Content }} +[END OF QUERY] + + +{{ else }} +{{ .Content }} +{{ end }} +{{- else if .ToolCalls }}### Response: +{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]} +<|EOT|> +{{ else if eq .Role "assistant" }}### Response: +{{ .Content }} +<|EOT|> +{{ end }} +{{- end }}### Response: \ No newline at end of file diff --git a/server/testdata/tools/xlam.out b/server/testdata/tools/xlam.out new file mode 100644 index 00000000..a4a9952f --- /dev/null +++ b/server/testdata/tools/xlam.out @@ -0,0 +1,40 @@ +You are a knowledgable assistant. You can answer questions and perform tasks. +### Instruction: +What's the weather like today in Paris? +### Response: +{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]} +<|EOT|> +### Response: +The current temperature in Paris, France is 22 degrees Celsius. +<|EOT|> +### Instruction: +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}] +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +What's the weather like today in San Francisco and Toronto? 
+[END OF QUERY] + + +### Response: \ No newline at end of file From f8fedbda20b1b2531499ba64758642b0568b6f01 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 12:42:00 -0400 Subject: [PATCH 5/9] Update llama.cpp submodule commit to `d94c6e0c` (#5805) --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 +- ...{07-embeddings.diff => 06-embeddings.diff} | 0 llm/patches/06-qwen2.diff | 13 - ...clip-unicode.diff => 07-clip-unicode.diff} | 0 .../{09-pooling.diff => 08-pooling.diff} | 0 llm/patches/09-lora.diff | 360 ++++++++++++++++++ llm/patches/10-tekken.diff | 43 --- llm/patches/11-embd_kv.diff | 19 - 9 files changed, 366 insertions(+), 81 deletions(-) rename llm/patches/{07-embeddings.diff => 06-embeddings.diff} (100%) delete mode 100644 llm/patches/06-qwen2.diff rename llm/patches/{08-clip-unicode.diff => 07-clip-unicode.diff} (100%) rename llm/patches/{09-pooling.diff => 08-pooling.diff} (100%) create mode 100644 llm/patches/09-lora.diff delete mode 100644 llm/patches/10-tekken.diff delete mode 100644 llm/patches/11-embd_kv.diff diff --git a/llm/llama.cpp b/llm/llama.cpp index a8db2a9c..d94c6e0c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 +Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 341a6f59..646bc49c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..172640e2 100644 +index 8fe51971..7113ba64 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5357,16 +5357,7 @@ static void llm_load_vocab( +@@ -5433,16 +5433,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5439,7 +5430,8 @@ static void llm_load_vocab( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; +@@ -5526,7 +5517,8 @@ static void llm_load_vocab( + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; + vocab.tokenizer_clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/07-embeddings.diff b/llm/patches/06-embeddings.diff similarity index 100% rename from llm/patches/07-embeddings.diff rename to llm/patches/06-embeddings.diff diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff deleted file mode 100644 index 1c7109f6..00000000 --- a/llm/patches/06-qwen2.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 40d2ec2c..f34eb79a 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv( - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - -- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { -+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { - // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs - // ref: 
https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/07-clip-unicode.diff similarity index 100% rename from llm/patches/08-clip-unicode.diff rename to llm/patches/07-clip-unicode.diff diff --git a/llm/patches/09-pooling.diff b/llm/patches/08-pooling.diff similarity index 100% rename from llm/patches/09-pooling.diff rename to llm/patches/08-pooling.diff diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff new file mode 100644 index 00000000..fc1017a6 --- /dev/null +++ b/llm/patches/09-lora.diff @@ -0,0 +1,360 @@ +diff --git a/common/common.cpp b/common/common.cpp +index dbb724fb..c26fe6ee 100644 +--- a/common/common.cpp ++++ b/common/common.cpp +@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); ++ ++ // try to load as gguf + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { +- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +- llama_free(lctx); +- llama_free_model(model); +- return std::make_tuple(nullptr, nullptr); ++ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); ++ ++ // if that fails, try loading as ggla for compatibility ++ int err = llama_model_apply_lora_from_file(model, ++ lora_adapter.c_str(), ++ lora_scale, ++ ((i > 0) || params.lora_base.empty()) ++ ? NULL ++ : params.lora_base.c_str(), ++ params.n_threads); ++ if (err != 0) { ++ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); ++ llama_free(lctx); ++ llama_free_model(model); ++ return std::make_tuple(nullptr, nullptr); ++ } ++ } else { ++ llama_lora_adapter_set(lctx, adapter, lora_scale); + } +- llama_lora_adapter_set(lctx, adapter, lora_scale); + } + + if (params.ignore_eos) { +diff --git a/include/llama.h b/include/llama.h +index 93fd77ca..b0fb37a6 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -1160,6 +1160,20 @@ extern "C" { + + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + ++ // Apply a LoRA adapter to a loaded model ++ // path_base_model is the path to a higher quality model to use as a base for ++ // the layers modified by the adapter. Can be NULL to use the current loaded model. 
++ // The model needs to be reloaded before applying a new adapter, otherwise the adapter ++ // will be applied on top of the previous one ++ // Returns 0 on success ++ LLAMA_API int32_t llama_model_apply_lora_from_file( ++ const struct llama_model * model, ++ const char * path_lora, ++ float scale, ++ const char * path_base_model, ++ int32_t n_threads); ++ ++ + #ifdef __cplusplus + } + #endif +diff --git a/src/llama.cpp b/src/llama.cpp +index 80a0dd0f..9d7b0e17 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, + fputs(text, stderr); + fflush(stderr); + } ++ ++static int llama_apply_lora_from_file_internal( ++ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ++) { ++ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); ++ ++ const int64_t t_start_lora_us = ggml_time_us(); ++ ++ llama_file fin(path_lora, "rb"); ++ ++ // verify magic and version ++ { ++ uint32_t magic = fin.read_u32(); ++ if (magic != LLAMA_FILE_MAGIC_GGLA) { ++ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); ++ return 1; ++ } ++ ++ uint32_t format_version = fin.read_u32(); ++ if (format_version != 1) { ++ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); ++ return 1; ++ } ++ } ++ ++ int32_t lora_r = fin.read_u32(); ++ int32_t lora_alpha = fin.read_u32(); ++ float scaling = scale * (float)lora_alpha / (float)lora_r; ++ ++ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); ++ ++ // load base model ++ std::unique_ptr ml; ++ if (path_base_model) { ++ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ++ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); ++ ml->init_mappings(/*prefetch*/ false); // no prefetching ++ } ++ ++ struct tensor_meta { ++ std::string name; ++ ggml_type type; ++ int32_t ne[2]; ++ size_t offset; ++ }; ++ std::map tensor_meta_map; ++ ++ // load all tensor meta ++ while (true) { ++ if (fin.tell() == fin.size) { ++ // eof ++ break; ++ } ++ ++ int32_t n_dims; ++ int32_t name_len; ++ int32_t ftype; ++ ++ fin.read_raw(&n_dims, sizeof(n_dims)); ++ fin.read_raw(&name_len, sizeof(name_len)); ++ fin.read_raw(&ftype, sizeof(ftype)); ++ ++ if (n_dims != 1 && n_dims != 2) { ++ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); ++ return 1; ++ } ++ ++ int32_t ne[2] = { 1, 1 }; ++ for (int i = 0; i < n_dims; ++i) { ++ fin.read_raw(&ne[i], sizeof(ne[i])); ++ } ++ ++ std::string name; ++ { ++ GGML_ASSERT(name_len < GGML_MAX_NAME); ++ char buf[GGML_MAX_NAME]; ++ fin.read_raw(buf, name_len); ++ name = std::string(buf, name_len); ++ } ++ ++ // check for lora suffix ++ std::string lora_suffix; ++ if (name.length() > 6) { ++ lora_suffix = name.substr(name.length() - 6); ++ } ++ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { ++ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); ++ return 1; ++ } ++ ++ // tensor type ++ ggml_type wtype; ++ switch (ftype) { ++ case 0: wtype = GGML_TYPE_F32; break; ++ case 1: wtype = GGML_TYPE_F16; break; ++ default: ++ { ++ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", ++ __func__, ftype); ++ return 1; ++ } ++ } ++ ++ // data offset ++ size_t offset = fin.tell(); ++ offset = (offset + 31) & -32; ++ ++ // skip tensor data ++ fin.seek(offset + 
ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); ++ ++ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); ++ } ++ ++ bool warned = false; ++ int n_tensors = 0; ++ ++ // apply ++ ggml_backend_t backend_cpu = ggml_backend_cpu_init(); ++ if (backend_cpu == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); ++ return 1; ++ } ++ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); ++ ++ std::vector> read_buf; ++ for (const auto & it : model.tensors_by_name) { ++ const std::string & base_name = it.first; ++ ggml_tensor * model_t = it.second; ++ ++ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || ++ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { ++ continue; ++ } ++ ++ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); ++ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); ++ ++ ggml_init_params lora_init_params = { ++ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), ++ /* .mem_buffer */ nullptr, ++ /* .no_alloc */ true, ++ }; ++ ggml_context * lora_ctx = ggml_init(lora_init_params); ++ if (lora_ctx == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ // create tensors ++ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); ++ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); ++ ggml_set_name(loraA, metaA.name.c_str()); ++ ggml_set_name(loraB, metaB.name.c_str()); ++ ++ ggml_tensor * base_t; ++ if (ml) { ++ if (!ml->get_tensor_meta(base_name.c_str())) { ++ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); ++ return 1; ++ } ++ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); ++ } else { ++ base_t = ggml_dup_tensor(lora_ctx, model_t); ++ } ++ ggml_set_name(base_t, base_name.c_str()); ++ ++ // allocate in backend buffer ++ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (lora_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); ++ return 1; ++ } ++ ++ // load tensor data ++ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { ++ read_buf.resize(ggml_nbytes(tensor)); ++ fin.seek(tensor_meta.offset, SEEK_SET); ++ fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); ++ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); ++ }; ++ load_tensor(metaA, loraA); ++ load_tensor(metaB, loraB); ++ ++ // load base model tensor data ++ if (ml) { ++ ml->load_data_for(base_t); ++ } else { ++ ggml_backend_tensor_copy(model_t, base_t); ++ } ++ ++ if (ggml_is_quantized(base_t->type) && !warned) { ++ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " ++ "use a f16 or f32 base model with --lora-base\n", __func__); ++ warned = true; ++ } ++ ++ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { ++ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" ++ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ auto build_lora_graph = [&]() { ++ // w = w + BA*s ++ ggml_tensor * BA = 
ggml_mul_mat(lora_ctx, loraA, loraB); ++ ggml_set_name(BA, "BA"); ++ ++ if (scaling != 1.0f) { ++ BA = ggml_scale(lora_ctx, BA, scaling); ++ ggml_set_name(BA, "BA_scaled"); ++ } ++ ++ ggml_tensor * r; ++ r = ggml_add_inplace(lora_ctx, base_t, BA); ++ ggml_set_name(r, "r_add"); ++ ++ if (base_t->type != model_t->type) { ++ // convert the result to the model type ++ r = ggml_cast(lora_ctx, r, model_t->type); ++ ggml_set_name(r, "r_cast"); ++ } ++ ++ return r; ++ }; ++ ++ ggml_cgraph * gf = ggml_new_graph(lora_ctx); ++ ggml_tensor * r = build_lora_graph(); ++ ggml_build_forward_expand(gf, r); ++ ++ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (graph_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ ggml_backend_graph_compute(backend_cpu, gf); ++ ++ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); ++ ++#if 0 ++ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU ++ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); ++ ++ // sched compute ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_init_measure(sched, gf); ++ ++ // create the graph again, since the previous one was destroyed by the measure ++ ggml_graph_clear(gf); ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_graph_compute(sched, gf); ++ ggml_backend_sched_free(sched); ++#endif ++ ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_buffer_free(graph_buf); ++ ggml_free(lora_ctx); ++ ++ n_tensors++; ++ if (n_tensors % 4 == 0) { ++ LLAMA_LOG_INFO("."); ++ } ++ } ++ ++ ggml_backend_free(backend_cpu); ++ ++ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; ++ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); ++ ++ return 0; ++} ++ ++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { ++ try { ++ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); ++ } catch (const std::exception & err) { ++ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); ++ return 1; ++ } ++} +\ No newline at end of file diff --git a/llm/patches/10-tekken.diff b/llm/patches/10-tekken.diff deleted file mode 100644 index 56a583e0..00000000 --- a/llm/patches/10-tekken.diff +++ /dev/null @@ -1,43 +0,0 @@ -diff --git a/include/llama.h b/include/llama.h -index bb4b05ba..a92174e0 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -92,6 +92,7 @@ extern "C" { - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, -+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - }; - - // note: these values should be synchronized with ggml_rope -diff --git a/src/llama.cpp b/src/llama.cpp -index 18364976..435b6fe5 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -5429,6 +5429,12 @@ static void llm_load_vocab( - } else if ( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; -+ } else if ( -+ tokenizer_pre == "tekken") { -+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; -+ vocab.tokenizer_clean_spaces = false; -+ vocab.tokenizer_ignore_merges = true; -+ vocab.tokenizer_add_bos = true; - } else { - LLAMA_LOG_WARN("%s: missing or 
unrecognized pre-tokenizer type, using: 'default'\n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }; - break; -+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN: -+ // original regex from tokenizer.json -+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" -+ regex_exprs = { -+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", -+ }; -+ break; - default: - // default regex for BPE tokenization pre-processing - regex_exprs = { diff --git a/llm/patches/11-embd_kv.diff b/llm/patches/11-embd_kv.diff deleted file mode 100644 index ad17a700..00000000 --- a/llm/patches/11-embd_kv.diff +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..e60d3d8d 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6052,10 +6052,10 @@ static bool llm_load_tensors( - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - -- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); -- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); -- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); -- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); -+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); -+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); -+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); -+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - - // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); From d835368eb8599b4f4c2f8a766bad5b57498a988d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 16:16:22 -0400 Subject: [PATCH 6/9] convert: capture `head_dim` for mistral (#5818) --- convert/mistral.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert/mistral.go b/convert/mistral.go index da6874cf..8fe066d6 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { "tokenizer.ggml.unknown_token_id": uint32(0), } + if m.Params.HeadDimension > 0 { + kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) + kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) + } + return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) } From c0648233f2236f82f6830d2aaed552ae0f72379b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:37:08 -0700 Subject: [PATCH 7/9] api embed docs (#5282) --- docs/api.md | 84 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/docs/api.md b/docs/api.md index c577bb1a..4381c376 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1026,7 
+1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object: ## Generate Embeddings ```shell -POST /api/embeddings +POST /api/embed ``` Generate embeddings from a model @@ -1034,10 +1034,11 @@ Generate embeddings from a model ### Parameters - `model`: name of model to generate embeddings from -- `prompt`: text to generate embeddings for +- `input`: text or list of text to generate embeddings for Advanced parameters: +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) @@ -1046,9 +1047,9 @@ Advanced parameters: #### Request ```shell -curl http://localhost:11434/api/embeddings -d '{ +curl http://localhost:11434/api/embed -d '{ "model": "all-minilm", - "prompt": "Here is an article about llamas..." + "input": "Why is the sky blue?" }' ``` @@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { - "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 - ] + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ]] +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ],[ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ]] } ``` @@ -1106,3 +1132,45 @@ A single JSON object will be returned. ] } ``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +```shell +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." 
+}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, + 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + ] +} +``` \ No newline at end of file From db0968f30c895b9f2059da48800018739ef9bca7 Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:48:15 -0700 Subject: [PATCH 8/9] fix dupe err message (#5857) --- server/routes.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/server/routes.go b/server/routes.go index 0d7ca003..e6ffe526 100644 --- a/server/routes.go +++ b/server/routes.go @@ -609,10 +609,9 @@ func (s *Server) CreateModelHandler(c *gin.Context) { defer cancel() quantization := cmp.Or(r.Quantize, r.Quantization) - if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { - if errors.Is(err, errBadTemplate) { - ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} - } + if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) { + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + } else if err != nil { ch <- gin.H{"error": err.Error()} } }() From 5d604eec5bbaba840fcee8cac8574807f3656ea8 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 16:16:28 -0700 Subject: [PATCH 9/9] Bump Go patch version --- .github/workflows/release.yaml | 10 +++++----- .github/workflows/test.yaml | 10 +++++----- Dockerfile | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c3..f0c6db5d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -31,7 +31,7 @@ jobs: security set-keychain-settings -lut 3600 build.keychain - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: Build Darwin env: @@ -87,7 +87,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... - run: | @@ -141,7 +141,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -218,7 +218,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -306,7 +306,7 @@ jobs: write-host "plugin installed" - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get - uses: actions/download-artifact@v4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 90fef6e5..5e002a22 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: go get ./... 
- run: | @@ -163,7 +163,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install ROCm' run: | @@ -200,7 +200,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - name: 'Install CUDA' run: | @@ -255,7 +255,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: false - run: | case ${{ matrix.arch }} in @@ -297,7 +297,7 @@ jobs: submodules: recursive - uses: actions/setup-go@v5 with: - go-version-file: go.mod + go-version: "stable" cache: true - run: | case ${{ matrix.arch }} in diff --git a/Dockerfile b/Dockerfile index ca393496..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1
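The bumped `GOLANG_VERSION` default can be confirmed, or overridden, at build time, since the Dockerfile declares it as a top-level build argument (the image tag below is illustrative):

```shell
# Build with the default (1.22.5) or pin a different patch release explicitly.
docker build --build-arg GOLANG_VERSION=1.22.5 -t ollama:dev .
```

In CI, `go-version: "stable"` tells actions/setup-go to resolve the latest stable Go release instead of the version pinned in go.mod.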