Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-22 06:43:57 +00:00)

Commit: Merge branch 'ollama:main' into main
@@ -370,12 +370,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
 - [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
-- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
 - [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
 - [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
+- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
+- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -385,3 +386,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 ### Supported backends
 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
 
@@ -5,12 +5,14 @@ import (
 "log/slog"
 "os"
 "path/filepath"
+
+"github.com/ollama/ollama/server/envconfig"
 )
 
 func InitLogging() {
 level := slog.LevelInfo
 
-if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+if envconfig.Debug {
 level = slog.LevelDebug
 }
 
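Note: this hunk is one instance of a change repeated throughout the commit: ad-hoc `os.Getenv("OLLAMA_DEBUG")` checks are replaced with the shared `server/envconfig` package. From the user's side the knob should stay the same; as a minimal sketch (assuming, as the removed code did, that any non-empty value enables debug logging):

```shell
# Enable debug-level logging for a manually started server (illustrative value).
OLLAMA_DEBUG=1 ollama serve
```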
@@ -31,16 +31,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 "/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 "/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
 }
-// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
-// TODO - temporarily disable since we're pinning in debug mode for the preview
-// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
+// make the upgrade as quiet as possible (no GUI, no prompts)
 installArgs = append(installArgs,
 "/SP", // Skip the "This will install... Do you wish to continue" prompt
 "/SUPPRESSMSGBOXES",
 "/SILENT",
 "/VERYSILENT",
 )
-// }
 
 // Safeguard in case we have requests in flight that need to drain...
 slog.Info("Waiting for server to shutdown")
cmd/cmd.go (19 changed lines)
@@ -34,7 +34,6 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/auth"
 "github.com/ollama/ollama/format"
-"github.com/ollama/ollama/parser"
 "github.com/ollama/ollama/progress"
 "github.com/ollama/ollama/server"
 "github.com/ollama/ollama/types/errtypes"
@@ -57,13 +56,13 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 p := progress.NewProgress(os.Stderr)
 defer p.Stop()
 
-modelfile, err := os.Open(filename)
+f, err := os.Open(filename)
 if err != nil {
 return err
 }
-defer modelfile.Close()
+defer f.Close()
 
-commands, err := parser.Parse(modelfile)
+modelfile, err := model.ParseFile(f)
 if err != nil {
 return err
 }
@@ -77,10 +76,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 spinner := progress.NewSpinner(status)
 p.Add(status, spinner)
 
-for i := range commands {
-switch commands[i].Name {
+for i := range modelfile.Commands {
+switch modelfile.Commands[i].Name {
 case "model", "adapter":
-path := commands[i].Args
+path := modelfile.Commands[i].Args
 if path == "~" {
 path = home
 } else if strings.HasPrefix(path, "~/") {
@@ -92,7 +91,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 }
 
 fi, err := os.Stat(path)
-if errors.Is(err, os.ErrNotExist) && commands[i].Name == "model" {
+if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
 continue
 } else if err != nil {
 return err
@@ -115,7 +114,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 return err
 }
 
-commands[i].Args = "@"+digest
+modelfile.Commands[i].Args = "@" + digest
 }
 }
 
@@ -145,7 +144,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 
 quantization, _ := cmd.Flags().GetString("quantization")
 
-request := api.CreateRequest{Name: args[0], Modelfile: parser.Format(commands), Quantization: quantization}
+request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
 if err := client.Create(cmd.Context(), &request, fn); err != nil {
 return err
 }
@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
 var err error
 t, offset, err = m.readTensors(f, offset, params)
 if err != nil {
-slog.Error("%v", err)
+slog.Error(err.Error())
 return nil, err
 }
 tensors = append(tensors, t...)
@@ -122,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 
 ggufName, err := m.GetLayerName(k)
 if err != nil {
-slog.Error("%v", err)
+slog.Error(err.Error())
 return nil, 0, err
 }
 
@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
 
 ggufName, err := tf.GetLayerName(k.(string))
 if err != nil {
-slog.Error("%v", err)
+slog.Error(err.Error())
 return nil, err
 }
 slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
docs/api.md (58 changed lines)
@@ -17,7 +17,7 @@
 
 ### Model names
 
-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
 
 ### Durations
 
@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama2",
+"model": "llama3",
 "prompt": "Why is the sky blue?"
 }'
 ```
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T08:52:19.385406455-07:00",
 "response": "The",
 "done": false
@@ -99,7 +99,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "response": "",
 "done": true,
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama2",
+"model": "llama3",
 "prompt": "Why is the sky blue?",
 "stream": false
 }'
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "response": "The sky is blue because it is the color of the sky.",
 "done": true,
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama2",
+"model": "llama3",
 "prompt": "What color is the sky at different times of the day? Respond using JSON",
 "format": "json",
 "stream": false
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-11-09T21:07:55.186497Z",
 "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
 "done": true,
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama2",
+"model": "llama3",
 "prompt": "Why is the sky blue?",
 "stream": false,
 "options": {
@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "response": "The sky is blue because it is the color of the sky.",
 "done": true,
@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.
 
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama2"
+"model": "llama3"
 }'
 ```
 
@@ -364,7 +364,7 @@ A single JSON object is returned:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-12-18T19:52:07.071755Z",
 "response": "",
 "done": true
@@ -407,7 +407,7 @@ Send a chat message with a streaming response.
 
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama2",
+"model": "llama3",
 "messages": [
 {
 "role": "user",
@@ -423,7 +423,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T08:52:19.385406455-07:00",
 "message": {
 "role": "assistant",
@@ -438,7 +438,7 @@ Final response:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "done": true,
 "total_duration": 4883583458,
@@ -456,7 +456,7 @@ Final response:
 
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama2",
+"model": "llama3",
 "messages": [
 {
 "role": "user",
@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{
 
 ```json
 {
-"model": "registry.ollama.ai/library/llama2:latest",
+"model": "registry.ollama.ai/library/llama3:latest",
 "created_at": "2023-12-12T14:13:43.416799Z",
 "message": {
 "role": "assistant",
@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach
 
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama2",
+"model": "llama3",
 "messages": [
 {
 "role": "user",
@@ -519,7 +519,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T08:52:19.385406455-07:00",
 "message": {
 "role": "assistant",
@@ -533,7 +533,7 @@ Final response:
 
 ```json
 {
-"model": "llama2",
+"model": "llama3",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "done": true,
 "total_duration": 8113331500,
@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{
 
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama2",
+"model": "llama3",
 "messages": [
 {
 "role": "user",
@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{
 
 ```json
 {
-"model": "registry.ollama.ai/library/llama2:latest",
+"model": "registry.ollama.ai/library/llama3:latest",
 "created_at": "2023-12-12T14:13:43.416799Z",
 "message": {
 "role": "assistant",
@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`.
 ```shell
 curl http://localhost:11434/api/create -d '{
 "name": "mario",
-"modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
+"modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```
 
@@ -758,7 +758,7 @@ A single JSON object will be returned.
 }
 },
 {
-"name": "llama2:latest",
+"name": "llama3:latest",
 "modified_at": "2023-12-07T09:32:18.757212583-08:00",
 "size": 3825819519,
 "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter
 
 ```shell
 curl http://localhost:11434/api/show -d '{
-"name": "llama2"
+"name": "llama3"
 }'
 ```
 
@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.
 
 ```shell
 curl http://localhost:11434/api/copy -d '{
-"source": "llama2",
-"destination": "llama2-backup"
+"source": "llama3",
+"destination": "llama3-backup"
 }'
 ```
 
@@ -854,7 +854,7 @@ Delete a model and its data.
 
 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-"name": "llama2:13b"
+"name": "llama3:13b"
 }'
 ```
 
@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 
 ```shell
 curl http://localhost:11434/api/pull -d '{
-"name": "llama2"
+"name": "llama3"
 }'
 ```
 
docs/faq.md (12 changed lines)
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
 
 ```
 curl http://localhost:11434/api/generate -d '{
-"model": "llama2",
+"model": "llama3",
 "prompt": "Why is the sky blue?",
 "options": {
 "num_ctx": 4096
@@ -221,14 +221,20 @@ The `keep_alive` parameter can be set to:
 
 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
 ```
 
 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
 ```
 
 Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
 
 If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
+
+## How do I manage the maximum number of requests the server can queue
+
+If too many requests are sent to the server, it will respond with a 503 error
+indicating the server is overloaded. You can adjust how many requests may be
+queue by setting `OLLAMA_MAX_QUEUE`
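The new FAQ entry above introduces `OLLAMA_MAX_QUEUE` without showing it in use. A minimal sketch, assuming it is set in the server's environment like the other `OLLAMA_*` settings in this FAQ (values are illustrative, not recommendations):

```shell
# Keep loaded models in memory for 24 hours and allow up to 512 queued requests.
# Both variables are read when the server starts.
OLLAMA_KEEP_ALIVE=24h OLLAMA_MAX_QUEUE=512 ollama serve
```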
@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
 - [FROM (Required)](#from-required)
-- [Build from llama2](#build-from-llama2)
+- [Build from llama3](#build-from-llama3)
 - [Build from a bin file](#build-from-a-bin-file)
 - [PARAMETER](#parameter)
 - [Valid Parameters and Values](#valid-parameters-and-values)
@@ -48,7 +48,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:
 
 ```modelfile
-FROM llama2
+FROM llama3
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -67,33 +67,25 @@ To use this:
 
 More examples are available in the [examples directory](../examples).
 
-### `Modelfile`s in [ollama.com/library][1]
-
-There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
-
-- Option 1: view a details page from a model's tags page:
-1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
-2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
-3. Scroll down to "Layers"
-- Note: if the [`FROM` instruction](#from-required) is not present,
-it means the model was created from a local file
-- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
+To view the Modelfile of a given model, use the `ollama show --modelfile` command.
 
 ```bash
-> ollama show --modelfile llama2:13b
+> ollama show --modelfile llama3
 # Modelfile generated by "ollama show"
 # To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama2:13b
+# FROM llama3:latest
+FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
+TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
 
-FROM /root/.ollama/models/blobs/sha256:123abc
-TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
 
-{{ end }}{{ .Prompt }} [/INST] """
-SYSTEM """"""
-PARAMETER stop [INST]
-PARAMETER stop [/INST]
-PARAMETER stop <<SYS>>
-PARAMETER stop <</SYS>>
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+{{ .Response }}<|eot_id|>"""
+PARAMETER stop "<|start_header_id|>"
+PARAMETER stop "<|end_header_id|>"
+PARAMETER stop "<|eot_id|>"
+PARAMETER stop "<|reserved_special_token"
 ```
 
 ## Instructions
@@ -106,10 +98,10 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```
 
-#### Build from llama2
+#### Build from llama3
 
 ```modelfile
-FROM llama2
+FROM llama3
 ```
 
 A list of available base models:
@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
 'content': 'Say this is a test',
 }
 ],
-model='llama2',
+model='llama3',
 )
 ```
 
@@ -43,7 +43,7 @@ const openai = new OpenAI({
 
 const chatCompletion = await openai.chat.completions.create({
 messages: [{ role: 'user', content: 'Say this is a test' }],
-model: 'llama2',
+model: 'llama3',
 })
 ```
 
@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
 curl http://localhost:11434/v1/chat/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "llama2",
+"model": "llama3",
 "messages": [
 {
 "role": "system",
@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
 Before using a model, pull it locally `ollama pull`:
 
 ```shell
-ollama pull llama2
+ollama pull llama3
 ```
 
 ### Default model names
@@ -121,7 +121,7 @@ ollama pull llama2
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
 
 ```
-ollama cp llama2 gpt-3.5-turbo
+ollama cp llama3 gpt-3.5-turbo
 ```
 
 Afterwards, this new model name can be specified the `model` field:
@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama";
 
 const ollama = new Ollama({
 baseUrl: "http://localhost:11434",
-model: "llama2",
+model: "llama3",
 });
 
 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```
 
-That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
 
 ```bash
 npm install cheerio
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 # Ollama Windows Preview
 
 Welcome to the Ollama Windows preview.
@@ -27,7 +28,7 @@ Logs will often be helpful in diagnosing the problem (see
 
 Here's a quick example showing API access from `powershell`
 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```
 
 ## Troubleshooting
@@ -45,3 +46,17 @@ the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
 - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+
+
+## Standalone CLI
+
+The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
+installer. It installs in your account without requiring Administrator rights.
+We update Ollama regularly to support the latest models, and this installer will
+help you keep up to date.
+
+If you'd like to install or integrate Ollama as a service, a standalone
+`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
+and GPU library dependencies for Nvidia and AMD. This allows for embedding
+Ollama in existing applications, or running it as a system service via `ollama
+serve` with tools such as [NSSM](https://nssm.cc/).
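The added Windows section mentions running the standalone CLI as a system service with `ollama serve` and [NSSM](https://nssm.cc/) but gives no command line. A hedged sketch of what that might look like, using NSSM's generic `install <service> <program> [arguments]` form from an elevated prompt (the extraction path is hypothetical):

```shell
# Assumes ollama-windows-amd64.zip was extracted to C:\ollama (hypothetical path).
nssm install Ollama "C:\ollama\ollama.exe" serve
nssm start Ollama
```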
@@ -2,7 +2,7 @@
 
 When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
 
-`ollama run llama2 < sourcequestions.txt`
+`ollama run llama3 < sourcequestions.txt`
 
 This concept is used in the following example.
 
@@ -35,7 +35,7 @@ func main() {
 
 ctx := context.Background()
 req := &api.ChatRequest{
-Model: "llama2",
+Model: "llama3",
 Messages: messages,
 }
 
@@ -51,7 +51,7 @@ while True:
 template=template,
 )
 
-llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
 qa_chain = RetrievalQA.from_chain_type(
 llm,
 retriever=vectorstore.as_retriever(),
@@ -1,12 +1,12 @@
-from langchain.llms import Ollama
-from langchain.document_loaders import WebBaseLoader
+from langchain_community.llms import Ollama
+from langchain_community.document_loaders import WebBaseLoader
 from langchain.chains.summarize import load_summarize_chain
 
 loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()
 
-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
 chain = load_summarize_chain(llm, chain_type="stuff")
 
-result = chain.run(docs)
+result = chain.invoke(docs)
 print(result)
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
 
 ## Running the Example
 
-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:
 
 ```bash
-ollama pull llama2
+ollama pull llama3
 ```
 
 2. Install the Python Requirements.
@@ -21,4 +21,3 @@ This example is a basic "hello world" of using LangChain with Ollama.
 ```bash
 python main.py
 ```
-
@@ -1,6 +1,6 @@
 from langchain.llms import Ollama
 
 input = input("What is your question?")
-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
 res = llm.predict(input)
 print (res)
@@ -1,4 +1,4 @@
-FROM llama2
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from super mario bros, acting as an assistant.
@@ -2,12 +2,12 @@
 
 # Example character: Mario
 
-This example shows how to create a basic character using Llama2 as the base model.
+This example shows how to create a basic character using Llama3 as the base model.
 
 To run this example:
 
 1. Download the Modelfile
-2. `ollama pull llama2` to get the base model used in the model file.
+2. `ollama pull llama3` to get the base model used in the model file.
 3. `ollama create NAME -f ./Modelfile`
 4. `ollama run NAME`
 
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
 What the model file looks like:
 
 ```
-FROM llama2
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from Super Mario Bros, acting as an assistant.
@@ -2,7 +2,7 @@ import requests
 import json
 import random
 
-model = "llama2"
+model = "llama3"
 template = {
 "firstName": "",
 "lastName": "",
@@ -12,7 +12,7 @@ countries = [
 "France",
 ]
 country = random.choice(countries)
-model = "llama2"
+model = "llama3"
 
 prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
 
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
 
 ## Running the Example
 
-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:
 
 ```bash
-ollama pull llama2
+ollama pull llama3
 ```
 
 2. Install the Python Requirements.
@@ -2,7 +2,7 @@ import requests
 import json
 
 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama2" # TODO: update this for whatever model you wish to use
+model = "llama3" # TODO: update this for whatever model you wish to use
 
 
 def chat(messages):
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
 
 ## Running the Example
 
-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:
 
 ```bash
-ollama pull llama2
+ollama pull llama3
 ```
 
 2. Install the Python Requirements.
@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
 
 ## Usage
 
-1. Add llama2 to have the mentors ask your questions:
+1. Add llama3 to have the mentors ask your questions:
 
 ```bash
-ollama pull llama2
+ollama pull llama3
 ```
 
 2. Install prerequisites:
@@ -15,7 +15,7 @@ async function characterGenerator() {
 ollama.setModel("stablebeluga2:70b-q4_K_M");
 const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);
 
-const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
+const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
 
 fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
 if (err) throw err;
@@ -1,6 +1,6 @@
 import * as readline from "readline";
 
-const model = "llama2";
+const model = "llama3";
 type Message = {
 role: "assistant" | "user" | "system";
 content: string;
@@ -12,6 +12,8 @@ import (
 "sync"
 "syscall"
 "time"
+
+"github.com/ollama/ollama/server/envconfig"
 )
 
 var (
@@ -24,45 +26,8 @@ func PayloadsDir() (string, error) {
 defer lock.Unlock()
 var err error
 if payloadsDir == "" {
-runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-// On Windows we do not carry the payloads inside the main executable
-if runtime.GOOS == "windows" && runnersDir == "" {
-appExe, err := os.Executable()
-if err != nil {
-slog.Error("failed to lookup executable path", "error", err)
-return "", err
-}
-
-cwd, err := os.Getwd()
-if err != nil {
-slog.Error("failed to lookup working directory", "error", err)
-return "", err
-}
-
-var paths []string
-for _, root := range []string{filepath.Dir(appExe), cwd} {
-paths = append(paths,
-filepath.Join(root),
-filepath.Join(root, "windows-"+runtime.GOARCH),
-filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-)
-}
-
-// Try a few variations to improve developer experience when building from source in the local tree
-for _, p := range paths {
-candidate := filepath.Join(p, "ollama_runners")
-_, err := os.Stat(candidate)
-if err == nil {
-runnersDir = candidate
-break
-}
-}
-if runnersDir == "" {
-err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-slog.Error("incomplete distribution", "error", err)
-return "", err
-}
-}
+runnersDir := envconfig.RunnersDir
 if runnersDir != "" {
 payloadsDir = runnersDir
 return payloadsDir, nil
@@ -70,7 +35,7 @@ func PayloadsDir() (string, error) {
 
 // The remainder only applies on non-windows where we still carry payloads in the main executable
 cleanupTmpDirs()
-tmpDir := os.Getenv("OLLAMA_TMPDIR")
+tmpDir := envconfig.TmpDir
 if tmpDir == "" {
 tmpDir, err = os.MkdirTemp("", "ollama")
 if err != nil {
@@ -133,7 +98,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 lock.Lock()
 defer lock.Unlock()
-runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
+runnersDir := envconfig.RunnersDir
 if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 // We want to fully clean up the tmpdir parent of the payloads dir
 tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
@@ -21,6 +21,7 @@ import (
 "unsafe"
 
 "github.com/ollama/ollama/format"
+"github.com/ollama/ollama/server/envconfig"
 )
 
 type handles struct {
@@ -268,7 +269,7 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 }
 
 func getVerboseState() C.uint16_t {
-if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+if envconfig.Debug {
 return C.uint16_t(1)
 }
 return C.uint16_t(0)
integration/max_queue_test.go (new file, 117 lines)
@@ -0,0 +1,117 @@
//go:build integration

package integration

import (
    "context"
    "errors"
    "fmt"
    "log/slog"
    "os"
    "strconv"
    "strings"
    "sync"
    "testing"
    "time"

    "github.com/ollama/ollama/api"
    "github.com/stretchr/testify/require"
)

func TestMaxQueue(t *testing.T) {
    // Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
    // Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
    threadCount := 32
    mq := os.Getenv("OLLAMA_MAX_QUEUE")
    if mq != "" {
        var err error
        threadCount, err = strconv.Atoi(mq)
        require.NoError(t, err)
    } else {
        os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
    }

    req := api.GenerateRequest{
        Model: "orca-mini",
        Prompt: "write a long historical fiction story about christopher columbus. use at least 10 facts from his actual journey",
        Options: map[string]interface{}{
            "seed": 42,
            "temperature": 0.0,
        },
    }
    resp := []string{"explore", "discover", "ocean"}

    // CPU mode takes much longer at the limit with a large queue setting
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()
    client, _, cleanup := InitServerConnection(ctx, t)
    defer cleanup()

    require.NoError(t, PullIfMissing(ctx, client, req.Model))

    // Context for the worker threads so we can shut them down
    // embedCtx, embedCancel := context.WithCancel(ctx)
    embedCtx := ctx

    var genwg sync.WaitGroup
    go func() {
        genwg.Add(1)
        defer genwg.Done()
        slog.Info("Starting generate request")
        DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
        slog.Info("generate completed")
    }()

    // Give the generate a chance to get started before we start hammering on embed requests
    time.Sleep(5 * time.Millisecond)

    threadCount += 10 // Add a few extra to ensure we push the queue past its limit
    busyCount := 0
    resetByPeerCount := 0
    canceledCount := 0
    succesCount := 0
    counterMu := sync.Mutex{}
    var embedwg sync.WaitGroup
    for i := 0; i < threadCount; i++ {
        go func(i int) {
            embedwg.Add(1)
            defer embedwg.Done()
            slog.Info("embed started", "id", i)
            embedReq := api.EmbeddingRequest{
                Model: req.Model,
                Prompt: req.Prompt,
                Options: req.Options,
            }
            // Fresh client for every request
            client, _ = GetTestEndpoint()

            resp, genErr := client.Embeddings(embedCtx, &embedReq)
            counterMu.Lock()
            defer counterMu.Unlock()
            switch {
            case genErr == nil:
                succesCount++
                require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
            case errors.Is(genErr, context.Canceled):
                canceledCount++
            case strings.Contains(genErr.Error(), "busy"):
                busyCount++
            case strings.Contains(genErr.Error(), "connection reset by peer"):
                resetByPeerCount++
            default:
                require.NoError(t, genErr, "%d request failed", i)
            }

            slog.Info("embed finished", "id", i)
        }(i)
    }
    genwg.Wait()
    slog.Info("generate done, waiting for embeds")
    embedwg.Wait()

    require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
    require.True(t, busyCount > 0, "no requests hit busy error but some should have")
    require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")

    slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
}
llm/ext_server/server.cpp (vendored, 2 changed lines)
@@ -1186,8 +1186,6 @@ struct llama_server_context
 {"model", params.model_alias},
 {"tokens_predicted", slot.n_decoded},
 {"tokens_evaluated", slot.n_prompt_tokens},
-{"generation_settings", get_formated_generation(slot)},
-{"prompt", slot.prompt},
 {"truncated", slot.truncated},
 {"stopped_eos", slot.stopped_eos},
 {"stopped_word", slot.stopped_word},
@@ -3,12 +3,11 @@ package llm
 import (
 "fmt"
 "log/slog"
-"os"
-"strconv"
 
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/gpu"
+"github.com/ollama/ollama/server/envconfig"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -50,15 +49,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 for _, info := range gpus {
 memoryAvailable += info.FreeMemory
 }
-userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-if userLimit != "" {
-avail, err := strconv.ParseUint(userLimit, 10, 64)
-if err != nil {
-slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-} else {
-slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-memoryAvailable = avail
-}
+if envconfig.MaxVRAM > 0 {
+memoryAvailable = envconfig.MaxVRAM
 }
 
 slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
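This hunk folds the `OLLAMA_MAX_VRAM` override into `envconfig.MaxVRAM`. Judging from the removed `strconv.ParseUint` call and the direct assignment to `memoryAvailable`, the variable takes a raw byte count; a sketch under that assumption (the value is illustrative):

```shell
# Cap the VRAM ollama plans against at 4 GiB, expressed in bytes per the parsing above.
OLLAMA_MAX_VRAM=4294967296 ollama serve
```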
llm/patches/05-clip-fix.diff (new file, 24 lines)
@@ -0,0 +1,24 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4..b43f892d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 struct ggml_tensor * embeddings = inp;
 if (ctx->has_class_embedding) {
 embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+ }
+ ggml_set_name(embeddings, "embeddings");
+ ggml_set_input(embeddings);
+
+ if (ctx->has_class_embedding) {
 embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
 embeddings = ggml_acc(ctx0, embeddings, inp,
 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
 }
- ggml_set_name(embeddings, "embeddings");
- ggml_set_input(embeddings);
-

 struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
 ggml_set_name(positions, "positions");
llm/server.go

@@ -26,6 +26,7 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
+    "github.com/ollama/ollama/server/envconfig"
 )
 
 type LlamaServer interface {
@@ -124,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     } else {
         servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
     }
-    demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+    demandLib := envconfig.LLMLibrary
     if demandLib != "" {
         serverPath := availableServers[demandLib]
         if serverPath == "" {
@@ -145,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
         "--embedding",
     }
-    if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+    if envconfig.Debug {
         params = append(params, "--log-format", "json")
     } else {
         params = append(params, "--log-disable")
@@ -155,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
     }
 
-    if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+    if envconfig.Debug {
         params = append(params, "--verbose")
     }
 
@@ -193,16 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         params = append(params, "--numa")
     }
 
-    // "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-    numParallel := 1
-    if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-        numParallel, err = strconv.Atoi(onp)
-        if err != nil || numParallel <= 0 {
-            err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-            slog.Error("misconfiguration", "error", err)
-            return nil, err
-        }
+    numParallel := envconfig.NumParallel
+    // TODO (jmorganca): multimodal models don't support parallel yet
+    // see https://github.com/ollama/ollama/issues/4165
+    if len(projectors) > 0 {
+        numParallel = 1
+        slog.Warn("multimodal models don't support parallel requests yet")
     }
 
     params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
     for i := 0; i < len(servers); i++ {
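To make the new parallel-slot handling concrete, here is a small sketch of how a configured slot count could be turned into the llama.cpp --parallel flag and forced back to 1 when projector files are present. The helper name is hypothetical; the logic mirrors the hunk above.

package main

import (
    "fmt"
    "log/slog"
)

// parallelFlag starts from the configured slot count and falls back to a
// single slot when projector (multimodal) files are attached to the model.
func parallelFlag(configured int, projectors []string) []string {
    numParallel := configured
    if len(projectors) > 0 {
        numParallel = 1
        slog.Warn("multimodal models don't support parallel requests yet")
    }
    return []string{"--parallel", fmt.Sprintf("%d", numParallel)}
}

func main() {
    fmt.Println(parallelFlag(4, nil))                   // [--parallel 4]
    fmt.Println(parallelFlag(4, []string{"proj.gguf"})) // [--parallel 1]
}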
server/envconfig/config.go (new file, 174 lines)

@@ -0,0 +1,174 @@
package envconfig

import (
    "fmt"
    "log/slog"
    "os"
    "path/filepath"
    "runtime"
    "strconv"
    "strings"
)

var (
    // Set via OLLAMA_ORIGINS in the environment
    AllowOrigins []string
    // Set via OLLAMA_DEBUG in the environment
    Debug bool
    // Set via OLLAMA_LLM_LIBRARY in the environment
    LLMLibrary string
    // Set via OLLAMA_MAX_LOADED_MODELS in the environment
    MaxRunners int
    // Set via OLLAMA_MAX_QUEUE in the environment
    MaxQueuedRequests int
    // Set via OLLAMA_MAX_VRAM in the environment
    MaxVRAM uint64
    // Set via OLLAMA_NOPRUNE in the environment
    NoPrune bool
    // Set via OLLAMA_NUM_PARALLEL in the environment
    NumParallel int
    // Set via OLLAMA_RUNNERS_DIR in the environment
    RunnersDir string
    // Set via OLLAMA_TMPDIR in the environment
    TmpDir string
)

func AsMap() map[string]string {
    return map[string]string{
        "OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
        "OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
        "OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
        "OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
        "OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
        "OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
        "OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
        "OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
        "OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
        "OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
    }
}

var defaultAllowOrigins = []string{
    "localhost",
    "127.0.0.1",
    "0.0.0.0",
}

// Clean quotes and spaces from the value
func clean(key string) string {
    return strings.Trim(os.Getenv(key), "\"' ")
}

func init() {
    // default values
    NumParallel = 1
    MaxRunners = 1
    MaxQueuedRequests = 512

    LoadConfig()
}

func LoadConfig() {
    if debug := clean("OLLAMA_DEBUG"); debug != "" {
        d, err := strconv.ParseBool(debug)
        if err == nil {
            Debug = d
        } else {
            Debug = true
        }
    }

    RunnersDir = clean("OLLAMA_RUNNERS_DIR")
    if runtime.GOOS == "windows" && RunnersDir == "" {
        // On Windows we do not carry the payloads inside the main executable
        appExe, err := os.Executable()
        if err != nil {
            slog.Error("failed to lookup executable path", "error", err)
        }

        cwd, err := os.Getwd()
        if err != nil {
            slog.Error("failed to lookup working directory", "error", err)
        }

        var paths []string
        for _, root := range []string{filepath.Dir(appExe), cwd} {
            paths = append(paths,
                filepath.Join(root),
                filepath.Join(root, "windows-"+runtime.GOARCH),
                filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
            )
        }

        // Try a few variations to improve developer experience when building from source in the local tree
        for _, p := range paths {
            candidate := filepath.Join(p, "ollama_runners")
            _, err := os.Stat(candidate)
            if err == nil {
                RunnersDir = candidate
                break
            }
        }
        if RunnersDir == "" {
            slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
        }
    }

    TmpDir = clean("OLLAMA_TMPDIR")

    userLimit := clean("OLLAMA_MAX_VRAM")
    if userLimit != "" {
        avail, err := strconv.ParseUint(userLimit, 10, 64)
        if err != nil {
            slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
        } else {
            MaxVRAM = avail
        }
    }

    LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

    if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
        val, err := strconv.Atoi(onp)
        if err != nil || val <= 0 {
            slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
        } else {
            NumParallel = val
        }
    }

    if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
        NoPrune = true
    }

    if origins := clean("OLLAMA_ORIGINS"); origins != "" {
        AllowOrigins = strings.Split(origins, ",")
    }
    for _, allowOrigin := range defaultAllowOrigins {
        AllowOrigins = append(AllowOrigins,
            fmt.Sprintf("http://%s", allowOrigin),
            fmt.Sprintf("https://%s", allowOrigin),
            fmt.Sprintf("http://%s:*", allowOrigin),
            fmt.Sprintf("https://%s:*", allowOrigin),
        )
    }

    maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
    if maxRunners != "" {
        m, err := strconv.Atoi(maxRunners)
        if err != nil {
            slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
        } else {
            MaxRunners = m
        }
    }

    if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
        p, err := strconv.Atoi(onp)
        if err != nil || p <= 0 {
            slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
        } else {
            MaxQueuedRequests = p
        }
    }
}
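A minimal sketch of how a caller might consume the new package at startup, mirroring the Serve() changes later in this commit. It assumes the github.com/ollama/ollama/server/envconfig import path introduced above; importing the package runs its init(), which loads the environment.

package main

import (
    "log/slog"
    "os"

    "github.com/ollama/ollama/server/envconfig"
)

func main() {
    // Pick the log level from the already-loaded configuration and log the
    // effective settings once, the way the server entry point now does.
    level := slog.LevelInfo
    if envconfig.Debug {
        level = slog.LevelDebug
    }
    slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level})))
    slog.Info("server config", "env", envconfig.AsMap())
}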
server/envconfig/config_test.go (new file, 20 lines)

@@ -0,0 +1,20 @@
package envconfig

import (
    "os"
    "testing"

    "github.com/stretchr/testify/require"
)

func TestConfig(t *testing.T) {
    os.Setenv("OLLAMA_DEBUG", "")
    LoadConfig()
    require.False(t, Debug)
    os.Setenv("OLLAMA_DEBUG", "false")
    LoadConfig()
    require.False(t, Debug)
    os.Setenv("OLLAMA_DEBUG", "1")
    LoadConfig()
    require.True(t, Debug)
}
server/images.go

@@ -29,7 +29,7 @@ import (
     "github.com/ollama/ollama/convert"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/llm"
-    "github.com/ollama/ollama/parser"
+    "github.com/ollama/ollama/server/envconfig"
     "github.com/ollama/ollama/types/errtypes"
     "github.com/ollama/ollama/types/model"
     "github.com/ollama/ollama/version"
@@ -63,46 +63,74 @@ func (m *Model) IsEmbedding() bool {
     return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
 }
 
-func (m *Model) Commands() (cmds []parser.Command) {
-    cmds = append(cmds, parser.Command{Name: "model", Args: m.ModelPath})
+func (m *Model) String() string {
+    var modelfile model.File
+
+    modelfile.Commands = append(modelfile.Commands, model.Command{
+        Name: "model",
+        Args: m.ModelPath,
+    })
 
     if m.Template != "" {
-        cmds = append(cmds, parser.Command{Name: "template", Args: m.Template})
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "template",
+            Args: m.Template,
+        })
     }
 
     if m.System != "" {
-        cmds = append(cmds, parser.Command{Name: "system", Args: m.System})
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "system",
+            Args: m.System,
+        })
     }
 
     for _, adapter := range m.AdapterPaths {
-        cmds = append(cmds, parser.Command{Name: "adapter", Args: adapter})
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "adapter",
+            Args: adapter,
+        })
     }
 
     for _, projector := range m.ProjectorPaths {
-        cmds = append(cmds, parser.Command{Name: "projector", Args: projector})
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "projector",
+            Args: projector,
+        })
     }
 
     for k, v := range m.Options {
         switch v := v.(type) {
         case []any:
             for _, s := range v {
-                cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", s)})
+                modelfile.Commands = append(modelfile.Commands, model.Command{
+                    Name: k,
+                    Args: fmt.Sprintf("%v", s),
+                })
             }
         default:
-            cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", v)})
+            modelfile.Commands = append(modelfile.Commands, model.Command{
+                Name: k,
+                Args: fmt.Sprintf("%v", v),
+            })
         }
     }
 
     for _, license := range m.License {
-        cmds = append(cmds, parser.Command{Name: "license", Args: license})
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "license",
+            Args: license,
+        })
     }
 
     for _, msg := range m.Messages {
-        cmds = append(cmds, parser.Command{Name: "message", Args: fmt.Sprintf("%s %s", msg.Role, msg.Content)})
+        modelfile.Commands = append(modelfile.Commands, model.Command{
+            Name: "message",
+            Args: fmt.Sprintf("%s %s", msg.Role, msg.Content),
+        })
     }
 
-    return cmds
+    return modelfile.String()
 
 }
 
 type Message struct {
@@ -329,7 +357,7 @@ func realpath(mfDir, from string) string {
     return abspath
 }
 
-func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
+func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) error {
     deleteMap := make(map[string]struct{})
     if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
         for _, layer := range append(manifest.Layers, manifest.Config) {
@@ -351,7 +379,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
     params := make(map[string][]string)
     fromParams := make(map[string]any)
 
-    for _, c := range commands {
+    for _, c := range modelfile.Commands {
         mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
 
         switch c.Name {
@@ -668,7 +696,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
         return err
     }
 
-    if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+    if !envconfig.NoPrune {
         if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
             return err
         }
@@ -999,7 +1027,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
     // build deleteMap to prune unused layers
     deleteMap := make(map[string]struct{})
 
-    if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+    if !envconfig.NoPrune {
         manifest, _, err = GetManifest(mp)
         if err != nil && !errors.Is(err, os.ErrNotExist) {
             return err
server/modelpath.go

@@ -6,6 +6,7 @@ import (
     "net/url"
     "os"
     "path/filepath"
+    "regexp"
     "strings"
 )
 
@@ -25,9 +26,10 @@ const (
 )
 
 var (
     ErrInvalidImageFormat = errors.New("invalid image format")
     ErrInvalidProtocol    = errors.New("invalid protocol scheme")
     ErrInsecureProtocol   = errors.New("insecure protocol http")
+    ErrInvalidDigestFormat = errors.New("invalid digest format")
 )
 
 func ParseModelPath(name string) ModelPath {
@@ -149,6 +151,17 @@ func GetBlobsPath(digest string) (string, error) {
         return "", err
     }
 
+    // only accept actual sha256 digests
+    pattern := "^sha256[:-][0-9a-fA-F]{64}$"
+    re := regexp.MustCompile(pattern)
+    if err != nil {
+        return "", err
+    }
+
+    if digest != "" && !re.MatchString(digest) {
+        return "", ErrInvalidDigestFormat
+    }
+
     digest = strings.ReplaceAll(digest, ":", "-")
     path := filepath.Join(dir, "blobs", digest)
     dirPath := filepath.Dir(path)
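The digest check above can be exercised on its own. A small sketch using the same sha256 pattern; the error value and helper name are stand-ins so the snippet runs outside the server package:

package main

import (
    "errors"
    "fmt"
    "regexp"
    "strings"
)

var errInvalidDigest = errors.New("invalid digest format")

// digestPattern matches "sha256:<64 hex chars>" or "sha256-<64 hex chars>",
// the only forms GetBlobsPath now accepts before mapping ":" to "-" on disk.
var digestPattern = regexp.MustCompile("^sha256[:-][0-9a-fA-F]{64}$")

func blobName(digest string) (string, error) {
    if digest != "" && !digestPattern.MatchString(digest) {
        return "", errInvalidDigest
    }
    return strings.ReplaceAll(digest, ":", "-"), nil
}

func main() {
    fmt.Println(blobName("sha256:" + strings.Repeat("a", 64))) // sha256-aaaa... <nil>
    _, err := blobName("../sha256-escape")
    fmt.Println(err) // invalid digest format
}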
server/modelpath_test.go

@@ -1,6 +1,73 @@
 package server
 
-import "testing"
+import (
+    "os"
+    "path/filepath"
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+)
+
+func TestGetBlobsPath(t *testing.T) {
+    // GetBlobsPath expects an actual directory to exist
+    dir, err := os.MkdirTemp("", "ollama-test")
+    assert.Nil(t, err)
+    defer os.RemoveAll(dir)
+
+    tests := []struct {
+        name     string
+        digest   string
+        expected string
+        err      error
+    }{
+        {
+            "empty digest",
+            "",
+            filepath.Join(dir, "blobs"),
+            nil,
+        },
+        {
+            "valid with colon",
+            "sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
+            filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
+            nil,
+        },
+        {
+            "valid with dash",
+            "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
+            filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
+            nil,
+        },
+        {
+            "digest too short",
+            "sha256-45640291",
+            "",
+            ErrInvalidDigestFormat,
+        },
+        {
+            "digest too long",
+            "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9aaaaaaaaaa",
+            "",
+            ErrInvalidDigestFormat,
+        },
+        {
+            "digest invalid chars",
+            "../sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7a",
+            "",
+            ErrInvalidDigestFormat,
+        },
+    }
+    for _, tc := range tests {
+        t.Run(tc.name, func(t *testing.T) {
+            t.Setenv("OLLAMA_MODELS", dir)
+
+            got, err := GetBlobsPath(tc.digest)
+
+            assert.ErrorIs(t, tc.err, err, tc.name)
+            assert.Equal(t, tc.expected, got, tc.name)
+        })
+    }
+}
+
 func TestParseModelPath(t *testing.T) {
     tests := []struct {
server/routes.go

@@ -1,6 +1,7 @@
 package server
 
 import (
+    "cmp"
     "context"
     "encoding/json"
     "errors"
@@ -28,7 +29,7 @@ import (
     "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
-    "github.com/ollama/ollama/parser"
+    "github.com/ollama/ollama/server/envconfig"
     "github.com/ollama/ollama/types/model"
     "github.com/ollama/ollama/version"
 )
@@ -146,12 +147,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
     select {
     case runner = <-rCh:
     case err = <-eCh:
-        if errors.Is(err, context.Canceled) {
-            c.JSON(499, gin.H{"error": "request canceled"})
-            return
-        }
-
-        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+        handleErrorResponse(c, err)
         return
     }
 
@@ -394,12 +390,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
     select {
     case runner = <-rCh:
     case err = <-eCh:
-        if errors.Is(err, context.Canceled) {
-            c.JSON(499, gin.H{"error": "request canceled"})
-            return
-        }
-
-        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+        handleErrorResponse(c, err)
         return
     }
 
@@ -522,28 +513,17 @@ func (s *Server) PushModelHandler(c *gin.Context) {
 
 func (s *Server) CreateModelHandler(c *gin.Context) {
     var req api.CreateRequest
-    err := c.ShouldBindJSON(&req)
-    switch {
-    case errors.Is(err, io.EOF):
+    if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
         return
-    case err != nil:
+    } else if err != nil {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
         return
     }
 
-    var model string
-    if req.Model != "" {
-        model = req.Model
-    } else if req.Name != "" {
-        model = req.Name
-    } else {
-        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
-        return
-    }
-
-    if err := ParseModelPath(model).Validate(); err != nil {
-        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+    name := model.ParseName(cmp.Or(req.Model, req.Name))
+    if !name.IsValid() {
+        c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
         return
     }
 
@@ -552,19 +532,19 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
         return
     }
 
-    var modelfile io.Reader = strings.NewReader(req.Modelfile)
+    var r io.Reader = strings.NewReader(req.Modelfile)
     if req.Path != "" && req.Modelfile == "" {
-        mf, err := os.Open(req.Path)
+        f, err := os.Open(req.Path)
         if err != nil {
            c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
            return
        }
-        defer mf.Close()
+        defer f.Close()
 
-        modelfile = mf
+        r = f
     }
 
-    commands, err := parser.Parse(modelfile)
+    modelfile, err := model.ParseFile(r)
     if err != nil {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
         return
@@ -580,7 +560,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
     ctx, cancel := context.WithCancel(c.Request.Context())
     defer cancel()
 
-    if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
+    if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), req.Quantization, modelfile, fn); err != nil {
         ch <- gin.H{"error": err.Error()}
     }
 }()
@@ -732,7 +712,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
     fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
     fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
     fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
-    fmt.Fprint(&sb, parser.Format(model.Commands()))
+    fmt.Fprint(&sb, model.String())
     resp.Modelfile = sb.String()
 
     return resp, nil
@@ -880,12 +860,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
     c.Status(http.StatusCreated)
 }
 
-var defaultAllowOrigins = []string{
-    "localhost",
-    "127.0.0.1",
-    "0.0.0.0",
-}
-
 func isLocalIP(ip netip.Addr) bool {
     if interfaces, err := net.Interfaces(); err == nil {
         for _, iface := range interfaces {
@@ -969,19 +943,7 @@ func (s *Server) GenerateRoutes() http.Handler {
     config := cors.DefaultConfig()
     config.AllowWildcard = true
     config.AllowBrowserExtensions = true
+    config.AllowOrigins = envconfig.AllowOrigins
-    if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
-        config.AllowOrigins = strings.Split(allowedOrigins, ",")
-    }
-
-    for _, allowOrigin := range defaultAllowOrigins {
-        config.AllowOrigins = append(config.AllowOrigins,
-            fmt.Sprintf("http://%s", allowOrigin),
-            fmt.Sprintf("https://%s", allowOrigin),
-            fmt.Sprintf("http://%s:*", allowOrigin),
-            fmt.Sprintf("https://%s:*", allowOrigin),
-        )
-    }
 
     r := gin.Default()
     r.Use(
@@ -1020,10 +982,11 @@ func (s *Server) GenerateRoutes() http.Handler {
 
 func Serve(ln net.Listener) error {
     level := slog.LevelInfo
-    if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+    if envconfig.Debug {
         level = slog.LevelDebug
     }
 
+    slog.Info("server config", "env", envconfig.AsMap())
     handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
         Level:     level,
         AddSource: true,
@@ -1047,7 +1010,7 @@ func Serve(ln net.Listener) error {
         return err
     }
 
-    if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+    if !envconfig.NoPrune {
         // clean up unused layers and manifests
         if err := PruneLayers(); err != nil {
             return err
@@ -1223,12 +1186,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
     select {
     case runner = <-rCh:
     case err = <-eCh:
-        if errors.Is(err, context.Canceled) {
-            c.JSON(499, gin.H{"error": "request canceled"})
-            return
-        }
-
-        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+        handleErrorResponse(c, err)
         return
     }
 
@@ -1349,3 +1307,15 @@ func (s *Server) ChatHandler(c *gin.Context) {
 
     streamResponse(c, ch)
 }
+
+func handleErrorResponse(c *gin.Context, err error) {
+    if errors.Is(err, context.Canceled) {
+        c.JSON(499, gin.H{"error": "request canceled"})
+        return
+    }
+    if errors.Is(err, ErrMaxQueue) {
+        c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()})
+        return
+    }
+    c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+}
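A sketch of the error-to-status mapping that handleErrorResponse centralizes for the generate, embeddings, and chat handlers. The sentinel below is a stand-in for the scheduler's ErrMaxQueue so the snippet runs outside the server package:

package main

import (
    "context"
    "errors"
    "fmt"
    "net/http"
)

// errMaxQueue stands in for the scheduler's ErrMaxQueue sentinel.
var errMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")

// statusFor mirrors handleErrorResponse: canceled requests map to the
// non-standard 499 code, a full queue maps to 503, everything else to 500.
func statusFor(err error) int {
    switch {
    case errors.Is(err, context.Canceled):
        return 499
    case errors.Is(err, errMaxQueue):
        return http.StatusServiceUnavailable
    default:
        return http.StatusInternalServerError
    }
}

func main() {
    fmt.Println(statusFor(context.Canceled))   // 499
    fmt.Println(statusFor(errMaxQueue))        // 503
    fmt.Println(statusFor(errors.New("boom"))) // 500
}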
server/routes_test.go

@@ -17,7 +17,7 @@ import (
     "github.com/stretchr/testify/assert"
 
     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/parser"
+    "github.com/ollama/ollama/types/model"
     "github.com/ollama/ollama/version"
 )
 
@@ -55,13 +55,13 @@ func Test_Routes(t *testing.T) {
     createTestModel := func(t *testing.T, name string) {
         fname := createTestFile(t, "ollama-model")
 
-        modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
-        commands, err := parser.Parse(modelfile)
+        r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
+        modelfile, err := model.ParseFile(r)
         assert.Nil(t, err)
         fn := func(resp api.ProgressResponse) {
             t.Logf("Status: %s", resp.Status)
         }
-        err = CreateModel(context.TODO(), name, "", "", commands, fn)
+        err = CreateModel(context.TODO(), name, "", "", modelfile, fn)
         assert.Nil(t, err)
     }
 
server/sched.go

@@ -5,10 +5,8 @@ import (
     "errors"
     "fmt"
     "log/slog"
-    "os"
     "reflect"
     "sort"
-    "strconv"
     "strings"
     "sync"
     "time"
@@ -17,6 +15,7 @@ import (
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
+    "github.com/ollama/ollama/server/envconfig"
     "golang.org/x/exp/slices"
 )
 
@@ -43,35 +42,14 @@ type Scheduler struct {
     getGpuFn func() gpu.GpuInfoList
 }
 
-// TODO set this to zero after a release or two, to enable multiple models by default
-var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
-var maxQueuedRequests = 10 // TODO configurable
-var numParallel = 1
+var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
-    maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
-    if maxRunners != "" {
-        m, err := strconv.Atoi(maxRunners)
-        if err != nil {
-            slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
-        } else {
-            loadedMax = m
-        }
-    }
-    if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-        p, err := strconv.Atoi(onp)
-        if err != nil || p <= 0 {
-            slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
-        } else {
-            numParallel = p
-        }
-    }
-
     sched := &Scheduler{
-        pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
-        finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
-        expiredCh:     make(chan *runnerRef, maxQueuedRequests),
-        unloadedCh:    make(chan interface{}, maxQueuedRequests),
+        pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+        finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+        expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
+        unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
         loaded:        make(map[string]*runnerRef),
         newServerFn:   llm.NewLlamaServer,
         getGpuFn:      gpu.GetGPUInfo,
@@ -82,6 +60,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+    // allocate a large enough kv cache for all parallel requests
+    opts.NumCtx = opts.NumCtx * envconfig.NumParallel
+
     req := &LlmRequest{
         ctx:   c,
         model: model,
@@ -90,12 +71,11 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
         successCh: make(chan *runnerRef),
         errCh:     make(chan error, 1),
     }
-    // context split across parallel threads
-    opts.NumCtx = opts.NumCtx * numParallel
+
     select {
     case s.pendingReqCh <- req:
     default:
-        req.errCh <- fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
+        req.errCh <- ErrMaxQueue
     }
     return req.successCh, req.errCh
 }
@@ -134,11 +114,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
                 pending.useLoadedRunner(runner, s.finishedReqCh)
                 break
             }
-        } else if loadedMax > 0 && loadedCount >= loadedMax {
+        } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
             slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
-            runnerToExpire = s.findRunnerToUnload(pending)
+            runnerToExpire = s.findRunnerToUnload()
         } else {
-            // Either no models are loaded or below loadedMax
+            // Either no models are loaded or below envconfig.MaxRunners
             // Get a refreshed GPU list
             gpus := s.getGpuFn()
 
@@ -149,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
                 break
             }
 
-            // If we're CPU only mode, just limit by loadedMax above
+            // If we're CPU only mode, just limit by envconfig.MaxRunners above
             // TODO handle system memory exhaustion
             if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
                 slog.Debug("cpu mode with existing models, loading")
@@ -177,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
                 s.loadFn(pending, ggml, gpus)
                 break
             }
-            runnerToExpire = s.findRunnerToUnload(pending)
+            runnerToExpire = s.findRunnerToUnload()
         }
 
         if runnerToExpire == nil {
@@ -277,9 +257,9 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
                 continue
             }
 
+            s.loadedMu.Lock()
             slog.Debug("got lock to unload", "model", runner.model)
             runner.unload()
-            s.loadedMu.Lock()
             delete(s.loaded, runner.model)
             s.loadedMu.Unlock()
             slog.Debug("runner released", "model", runner.model)
@@ -524,7 +504,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 }
 
 // findRunnerToUnload finds a runner to unload to make room for a new model
-func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef {
+func (s *Scheduler) findRunnerToUnload() *runnerRef {
     s.loadedMu.Lock()
     runnerList := make([]*runnerRef, 0, len(s.loaded))
     for _, r := range s.loaded {
|
|||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
"github.com/ollama/ollama/gpu"
|
"github.com/ollama/ollama/gpu"
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/llm"
|
||||||
|
"github.com/ollama/ollama/server/envconfig"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
@@ -27,38 +28,14 @@ func init() {
|
|||||||
func TestInitScheduler(t *testing.T) {
|
func TestInitScheduler(t *testing.T) {
|
||||||
ctx, done := context.WithCancel(context.Background())
|
ctx, done := context.WithCancel(context.Background())
|
||||||
defer done()
|
defer done()
|
||||||
initialMax := loadedMax
|
|
||||||
initialParallel := numParallel
|
|
||||||
s := InitScheduler(ctx)
|
s := InitScheduler(ctx)
|
||||||
require.Equal(t, initialMax, loadedMax)
|
|
||||||
s.loadedMu.Lock()
|
s.loadedMu.Lock()
|
||||||
require.NotNil(t, s.loaded)
|
require.NotNil(t, s.loaded)
|
||||||
s.loadedMu.Unlock()
|
s.loadedMu.Unlock()
|
||||||
|
|
||||||
os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
|
|
||||||
s = InitScheduler(ctx)
|
|
||||||
require.Equal(t, initialMax, loadedMax)
|
|
||||||
s.loadedMu.Lock()
|
|
||||||
require.NotNil(t, s.loaded)
|
|
||||||
s.loadedMu.Unlock()
|
|
||||||
|
|
||||||
os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
|
|
||||||
s = InitScheduler(ctx)
|
|
||||||
require.Equal(t, 0, loadedMax)
|
|
||||||
s.loadedMu.Lock()
|
|
||||||
require.NotNil(t, s.loaded)
|
|
||||||
s.loadedMu.Unlock()
|
|
||||||
|
|
||||||
os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
|
|
||||||
_ = InitScheduler(ctx)
|
|
||||||
require.Equal(t, initialParallel, numParallel)
|
|
||||||
os.Setenv("OLLAMA_NUM_PARALLEL", "10")
|
|
||||||
_ = InitScheduler(ctx)
|
|
||||||
require.Equal(t, 10, numParallel)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestLoad(t *testing.T) {
|
func TestLoad(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
s := InitScheduler(ctx)
|
s := InitScheduler(ctx)
|
||||||
var ggml *llm.GGML // value not used in tests
|
var ggml *llm.GGML // value not used in tests
|
||||||
@@ -174,7 +151,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRequests(t *testing.T) {
|
func TestRequests(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
|
||||||
// Same model, same request
|
// Same model, same request
|
||||||
@@ -249,7 +226,7 @@ func TestRequests(t *testing.T) {
|
|||||||
t.Errorf("timeout")
|
t.Errorf("timeout")
|
||||||
}
|
}
|
||||||
|
|
||||||
loadedMax = 1
|
envconfig.MaxRunners = 1
|
||||||
s.newServerFn = scenario3a.newServer
|
s.newServerFn = scenario3a.newServer
|
||||||
slog.Info("scenario3a")
|
slog.Info("scenario3a")
|
||||||
s.pendingReqCh <- scenario3a.req
|
s.pendingReqCh <- scenario3a.req
|
||||||
@@ -268,7 +245,7 @@ func TestRequests(t *testing.T) {
|
|||||||
require.Len(t, s.loaded, 1)
|
require.Len(t, s.loaded, 1)
|
||||||
s.loadedMu.Unlock()
|
s.loadedMu.Unlock()
|
||||||
|
|
||||||
loadedMax = 0
|
envconfig.MaxRunners = 0
|
||||||
s.newServerFn = scenario3b.newServer
|
s.newServerFn = scenario3b.newServer
|
||||||
slog.Info("scenario3b")
|
slog.Info("scenario3b")
|
||||||
s.pendingReqCh <- scenario3b.req
|
s.pendingReqCh <- scenario3b.req
|
||||||
@@ -329,7 +306,7 @@ func TestRequests(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestGetRunner(t *testing.T) {
|
func TestGetRunner(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
|
||||||
// Same model, same request
|
// Same model, same request
|
||||||
@@ -339,7 +316,7 @@ func TestGetRunner(t *testing.T) {
|
|||||||
scenario1b.req.sessionDuration = 0
|
scenario1b.req.sessionDuration = 0
|
||||||
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
|
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
|
||||||
scenario1c.req.sessionDuration = 0
|
scenario1c.req.sessionDuration = 0
|
||||||
maxQueuedRequests = 1
|
envconfig.MaxQueuedRequests = 1
|
||||||
s := InitScheduler(ctx)
|
s := InitScheduler(ctx)
|
||||||
s.getGpuFn = func() gpu.GpuInfoList {
|
s.getGpuFn = func() gpu.GpuInfoList {
|
||||||
g := gpu.GpuInfo{Library: "metal"}
|
g := gpu.GpuInfo{Library: "metal"}
|
||||||
@@ -391,7 +368,7 @@ func TestGetRunner(t *testing.T) {
|
|||||||
|
|
||||||
// TODO - add one scenario that triggers the bogus finished event with positive ref count
|
// TODO - add one scenario that triggers the bogus finished event with positive ref count
|
||||||
func TestPrematureExpired(t *testing.T) {
|
func TestPrematureExpired(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
|
||||||
// Same model, same request
|
// Same model, same request
|
||||||
@@ -436,7 +413,7 @@ func TestPrematureExpired(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestUseLoadedRunner(t *testing.T) {
|
func TestUseLoadedRunner(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
req := &LlmRequest{
|
req := &LlmRequest{
|
||||||
ctx: ctx,
|
ctx: ctx,
|
||||||
opts: api.DefaultOptions(),
|
opts: api.DefaultOptions(),
|
||||||
@@ -461,7 +438,7 @@ func TestUseLoadedRunner(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestUpdateFreeSpace(t *testing.T) {
|
func TestUpdateFreeSpace(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
gpus := gpu.GpuInfoList{
|
gpus := gpu.GpuInfoList{
|
||||||
{
|
{
|
||||||
@@ -494,12 +471,9 @@ func TestUpdateFreeSpace(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestFindRunnerToUnload(t *testing.T) {
|
func TestFindRunnerToUnload(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
req := &LlmRequest{
|
|
||||||
ctx: ctx,
|
|
||||||
opts: api.DefaultOptions(),
|
|
||||||
}
|
|
||||||
r1 := &runnerRef{refCount: 1, sessionDuration: 1}
|
r1 := &runnerRef{refCount: 1, sessionDuration: 1}
|
||||||
r2 := &runnerRef{sessionDuration: 2}
|
r2 := &runnerRef{sessionDuration: 2}
|
||||||
|
|
||||||
@@ -509,16 +483,16 @@ func TestFindRunnerToUnload(t *testing.T) {
|
|||||||
s.loaded["b"] = r2
|
s.loaded["b"] = r2
|
||||||
s.loadedMu.Unlock()
|
s.loadedMu.Unlock()
|
||||||
|
|
||||||
resp := s.findRunnerToUnload(req)
|
resp := s.findRunnerToUnload()
|
||||||
require.Equal(t, r2, resp)
|
require.Equal(t, r2, resp)
|
||||||
r2.refCount = 1
|
r2.refCount = 1
|
||||||
resp = s.findRunnerToUnload(req)
|
resp = s.findRunnerToUnload()
|
||||||
require.Equal(t, r1, resp)
|
require.Equal(t, r1, resp)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNeedsReload(t *testing.T) {
|
func TestNeedsReload(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
|
||||||
llm := &mockLlm{}
|
llm := &mockLlm{}
|
||||||
@@ -562,7 +536,7 @@ func TestNeedsReload(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestUnloadAllRunners(t *testing.T) {
|
func TestUnloadAllRunners(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
|
||||||
llm1 := &mockLlm{}
|
llm1 := &mockLlm{}
|
||||||
|
|||||||
types/model (Modelfile parser, moved from the parser package)

@@ -1,4 +1,4 @@
-package parser
+package model
 
 import (
     "bufio"
@@ -10,11 +10,41 @@ import (
     "strings"
 )
 
+type File struct {
+    Commands []Command
+}
+
+func (f File) String() string {
+    var sb strings.Builder
+    for _, cmd := range f.Commands {
+        fmt.Fprintln(&sb, cmd.String())
+    }
+
+    return sb.String()
+}
+
 type Command struct {
     Name string
     Args string
 }
 
+func (c Command) String() string {
+    var sb strings.Builder
+    switch c.Name {
+    case "model":
+        fmt.Fprintf(&sb, "FROM %s", c.Args)
+    case "license", "template", "system", "adapter":
+        fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
+    case "message":
+        role, message, _ := strings.Cut(c.Args, ": ")
+        fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
+    default:
+        fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
+    }
+
+    return sb.String()
+}
+
 type state int
 
 const (
@@ -32,38 +62,14 @@ var (
     errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
 )
 
-func Format(cmds []Command) string {
-    var sb strings.Builder
-    for _, cmd := range cmds {
-        name := cmd.Name
-        args := cmd.Args
-
-        switch cmd.Name {
-        case "model":
-            name = "from"
-            args = cmd.Args
-        case "license", "template", "system", "adapter":
-            args = quote(args)
-        case "message":
-            role, message, _ := strings.Cut(cmd.Args, ": ")
-            args = role + " " + quote(message)
-        default:
-            name = "parameter"
-            args = cmd.Name + " " + quote(cmd.Args)
-        }
-
-        fmt.Fprintln(&sb, strings.ToUpper(name), args)
-    }
-
-    return sb.String()
-}
-
-func Parse(r io.Reader) (cmds []Command, err error) {
+func ParseFile(r io.Reader) (*File, error) {
     var cmd Command
     var curr state
     var b bytes.Buffer
     var role string
 
+    var f File
+
     br := bufio.NewReader(r)
     for {
         r, _, err := br.ReadRune()
@@ -128,7 +134,7 @@ func Parse(r io.Reader) (cmds []Command, err error) {
             }
 
             cmd.Args = s
-            cmds = append(cmds, cmd)
+            f.Commands = append(f.Commands, cmd)
         }
 
         b.Reset()
@@ -157,14 +163,14 @@ func Parse(r io.Reader) (cmds []Command, err error) {
         }
 
         cmd.Args = s
-        cmds = append(cmds, cmd)
+        f.Commands = append(f.Commands, cmd)
     default:
         return nil, io.ErrUnexpectedEOF
     }
 
-    for _, cmd := range cmds {
+    for _, cmd := range f.Commands {
         if cmd.Name == "model" {
-            return cmds, nil
+            return &f, nil
         }
    }

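A small round-trip sketch of the new API: ParseFile builds a File from Modelfile text and String() renders it back as Modelfile syntax, which is what GetModelInfo now uses instead of parser.Format. It assumes the github.com/ollama/ollama/types/model import path used by the tests in this commit; the example Modelfile content is illustrative.

package main

import (
    "fmt"
    "strings"

    "github.com/ollama/ollama/types/model"
)

func main() {
    in := "FROM llama2\nPARAMETER temperature 0.7\nSYSTEM You are a helpful assistant.\n"

    // Parse the Modelfile into commands, then render it back out.
    f, err := model.ParseFile(strings.NewReader(in))
    if err != nil {
        panic(err)
    }
    fmt.Print(f.String())
}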
types/model parser tests (moved from the parser package)

@@ -1,4 +1,4 @@
-package parser
+package model
 
 import (
     "bytes"
@@ -10,7 +10,7 @@ import (
     "github.com/stretchr/testify/assert"
 )
 
-func TestParser(t *testing.T) {
+func TestParseFileFile(t *testing.T) {
     input := `
FROM model1
ADAPTER adapter1
@@ -22,8 +22,8 @@ TEMPLATE template1
 
     reader := strings.NewReader(input)
 
-    commands, err := Parse(reader)
-    assert.Nil(t, err)
+    modelfile, err := ParseFile(reader)
+    assert.NoError(t, err)
 
     expectedCommands := []Command{
         {Name: "model", Args: "model1"},
@@ -34,10 +34,10 @@ TEMPLATE template1
         {Name: "template", Args: "template1"},
     }
 
-    assert.Equal(t, expectedCommands, commands)
+    assert.Equal(t, expectedCommands, modelfile.Commands)
 }
 
-func TestParserFrom(t *testing.T) {
+func TestParseFileFrom(t *testing.T) {
     var cases = []struct {
         input    string
         expected []Command
@@ -85,14 +85,16 @@ func TestParserFrom(t *testing.T) {
 
     for _, c := range cases {
         t.Run("", func(t *testing.T) {
-            commands, err := Parse(strings.NewReader(c.input))
+            modelfile, err := ParseFile(strings.NewReader(c.input))
             assert.ErrorIs(t, err, c.err)
-            assert.Equal(t, c.expected, commands)
+            if modelfile != nil {
+                assert.Equal(t, c.expected, modelfile.Commands)
+            }
         })
     }
 }
 
-func TestParserParametersMissingValue(t *testing.T) {
+func TestParseFileParametersMissingValue(t *testing.T) {
     input := `
FROM foo
PARAMETER param1
@@ -100,21 +102,21 @@ PARAMETER param1
 
     reader := strings.NewReader(input)
 
-    _, err := Parse(reader)
+    _, err := ParseFile(reader)
     assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
 }
 
-func TestParserBadCommand(t *testing.T) {
+func TestParseFileBadCommand(t *testing.T) {
     input := `
FROM foo
BADCOMMAND param1 value1
`
-    _, err := Parse(strings.NewReader(input))
+    _, err := ParseFile(strings.NewReader(input))
     assert.ErrorIs(t, err, errInvalidCommand)
 
 }
 
-func TestParserMessages(t *testing.T) {
+func TestParseFileMessages(t *testing.T) {
     var cases = []struct {
         input    string
         expected []Command
@@ -123,34 +125,34 @@ func TestParserMessages(t *testing.T) {
         {
             `
FROM foo
-MESSAGE system You are a Parser. Always Parse things.
+MESSAGE system You are a file parser. Always parse things.
`,
             []Command{
                 {Name: "model", Args: "foo"},
-                {Name: "message", Args: "system: You are a Parser. Always Parse things."},
+                {Name: "message", Args: "system: You are a file parser. Always parse things."},
             },
             nil,
         },
         {
             `
FROM foo
-MESSAGE system You are a Parser. Always Parse things.`,
+MESSAGE system You are a file parser. Always parse things.`,
             []Command{
                 {Name: "model", Args: "foo"},
-                {Name: "message", Args: "system: You are a Parser. Always Parse things."},
+                {Name: "message", Args: "system: You are a file parser. Always parse things."},
             },
             nil,
         },
         {
             `
FROM foo
-MESSAGE system You are a Parser. Always Parse things.
+MESSAGE system You are a file parser. Always parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
             []Command{
                 {Name: "model", Args: "foo"},
-                {Name: "message", Args: "system: You are a Parser. Always Parse things."},
+                {Name: "message", Args: "system: You are a file parser. Always parse things."},
                 {Name: "message", Args: "user: Hey there!"},
                 {Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
             },
@@ -160,12 +162,12 @@ MESSAGE assistant Hello, I want to parse all the things!
             `
FROM foo
MESSAGE system """
-You are a multiline Parser. Always Parse things.
+You are a multiline file parser. Always parse things.
"""
`,
             []Command{
                 {Name: "model", Args: "foo"},
-                {Name: "message", Args: "system: \nYou are a multiline Parser. Always Parse things.\n"},
+                {Name: "message", Args: "system: \nYou are a multiline file parser. Always parse things.\n"},
             },
             nil,
         },
@@ -196,14 +198,16 @@ MESSAGE system`,
 
     for _, c := range cases {
         t.Run("", func(t *testing.T) {
-            commands, err := Parse(strings.NewReader(c.input))
+            modelfile, err := ParseFile(strings.NewReader(c.input))
             assert.ErrorIs(t, err, c.err)
-            assert.Equal(t, c.expected, commands)
+            if modelfile != nil {
+                assert.Equal(t, c.expected, modelfile.Commands)
+            }
         })
     }
 }
 
-func TestParserQuoted(t *testing.T) {
+func TestParseFileQuoted(t *testing.T) {
     var cases = []struct {
         multiline string
         expected  []Command
@@ -348,14 +352,16 @@ TEMPLATE """
 
     for _, c := range cases {
         t.Run("", func(t *testing.T) {
-            commands, err := Parse(strings.NewReader(c.multiline))
+            modelfile, err := ParseFile(strings.NewReader(c.multiline))
|
||||||
assert.ErrorIs(t, err, c.err)
|
assert.ErrorIs(t, err, c.err)
|
||||||
assert.Equal(t, c.expected, commands)
|
if modelfile != nil {
|
||||||
|
assert.Equal(t, c.expected, modelfile.Commands)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParserParameters(t *testing.T) {
|
func TestParseFileParameters(t *testing.T) {
|
||||||
var cases = map[string]struct {
|
var cases = map[string]struct {
|
||||||
name, value string
|
name, value string
|
||||||
}{
|
}{
|
||||||
@@ -404,18 +410,18 @@ func TestParserParameters(t *testing.T) {
|
|||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
fmt.Fprintln(&b, "FROM foo")
|
fmt.Fprintln(&b, "FROM foo")
|
||||||
fmt.Fprintln(&b, "PARAMETER", k)
|
fmt.Fprintln(&b, "PARAMETER", k)
|
||||||
commands, err := Parse(&b)
|
modelfile, err := ParseFile(&b)
|
||||||
assert.Nil(t, err)
|
assert.NoError(t, err)
|
||||||
|
|
||||||
assert.Equal(t, []Command{
|
assert.Equal(t, []Command{
|
||||||
{Name: "model", Args: "foo"},
|
{Name: "model", Args: "foo"},
|
||||||
{Name: v.name, Args: v.value},
|
{Name: v.name, Args: v.value},
|
||||||
}, commands)
|
}, modelfile.Commands)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParserComments(t *testing.T) {
|
func TestParseFileComments(t *testing.T) {
|
||||||
var cases = []struct {
|
var cases = []struct {
|
||||||
input string
|
input string
|
||||||
expected []Command
|
expected []Command
|
||||||
@@ -433,14 +439,14 @@ FROM foo
|
|||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
t.Run("", func(t *testing.T) {
|
t.Run("", func(t *testing.T) {
|
||||||
commands, err := Parse(strings.NewReader(c.input))
|
modelfile, err := ParseFile(strings.NewReader(c.input))
|
||||||
assert.Nil(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, c.expected, commands)
|
assert.Equal(t, c.expected, modelfile.Commands)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseFormatParse(t *testing.T) {
|
func TestParseFileFormatParseFile(t *testing.T) {
|
||||||
var cases = []string{
|
var cases = []string{
|
||||||
`
|
`
|
||||||
FROM foo
|
FROM foo
|
||||||
@@ -449,7 +455,7 @@ LICENSE MIT
|
|||||||
PARAMETER param1 value1
|
PARAMETER param1 value1
|
||||||
PARAMETER param2 value2
|
PARAMETER param2 value2
|
||||||
TEMPLATE template1
|
TEMPLATE template1
|
||||||
MESSAGE system You are a Parser. Always Parse things.
|
MESSAGE system You are a file parser. Always parse things.
|
||||||
MESSAGE user Hey there!
|
MESSAGE user Hey there!
|
||||||
MESSAGE assistant Hello, I want to parse all the things!
|
MESSAGE assistant Hello, I want to parse all the things!
|
||||||
`,
|
`,
|
||||||
@@ -488,13 +494,13 @@ MESSAGE assistant Hello, I want to parse all the things!
|
|||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
t.Run("", func(t *testing.T) {
|
t.Run("", func(t *testing.T) {
|
||||||
commands, err := Parse(strings.NewReader(c))
|
modelfile, err := ParseFile(strings.NewReader(c))
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
|
|
||||||
commands2, err := Parse(strings.NewReader(Format(commands)))
|
modelfile2, err := ParseFile(strings.NewReader(modelfile.String()))
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
|
|
||||||
assert.Equal(t, commands, commands2)
|
assert.Equal(t, modelfile, modelfile2)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
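The tests above capture the API change in this commit: Parse used to return the command slice directly, while ParseFile returns a parsed representation whose Commands field holds the directives and whose String() method re-serializes it (see TestParseFileFormatParseFile). A minimal usage sketch follows, written as an in-package test so no import path has to be assumed; only ParseFile, Commands, Name, Args, and String() are taken from the diff, and the input text is illustrative.

package model

import (
	"fmt"
	"strings"
	"testing"
)

// Sketch only: exercises the renamed ParseFile entry point the same way the
// tests above do. ParseFile, Commands, Name, Args, and String() come from
// this diff; the input below is an arbitrary example.
func TestParseFileUsageSketch(t *testing.T) {
	input := "FROM foo\nPARAMETER param1 value1\n"

	modelfile, err := ParseFile(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Each directive becomes a Command with a Name and Args.
	for _, cmd := range modelfile.Commands {
		fmt.Println(cmd.Name, cmd.Args)
	}

	// Round-tripping through String() should parse back to the same
	// commands, mirroring TestParseFileFormatParseFile.
	if _, err := ParseFile(strings.NewReader(modelfile.String())); err != nil {
		t.Fatal(err)
	}
}

The nil check added to the table-driven tests above exists because ParseFile can return a nil result alongside an error, so Commands is only compared when parsing succeeded.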
@@ -161,7 +161,7 @@ func ParseNameBare(s string) Name {
 	}
 
 	scheme, host, ok := strings.Cut(s, "://")
-	if ! ok {
+	if !ok {
 		host = scheme
 	}
 	n.Host = host
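The hunk above is a gofmt-style cleanup inside ParseNameBare's scheme-stripping step. As a standalone illustration of that strings.Cut idiom (not code from the repository), the third return value reports whether the separator was found; when it is absent, the whole input is carried forward as the host portion:

package main

import (
	"fmt"
	"strings"
)

// stripScheme mirrors the strings.Cut pattern shown in ParseNameBare above:
// drop an optional "scheme://" prefix and keep the remainder.
func stripScheme(s string) string {
	scheme, host, ok := strings.Cut(s, "://")
	if !ok {
		// No "://" present: Cut returns the whole input as the first value.
		host = scheme
	}
	return host
}

func main() {
	fmt.Println(stripScheme("https://registry.example.com/library/llama2")) // registry.example.com/library/llama2
	fmt.Println(stripScheme("library/llama2"))                              // library/llama2
}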