Merge branch 'ollama:main' into main

This commit is contained in:
likelovewant
2024-11-22 21:15:15 +08:00
committed by GitHub
27 changed files with 391 additions and 412 deletions

View File

@@ -326,11 +326,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple, easy-to-use GUI with a sample custom LLM for Driver's Education)
- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
@@ -338,6 +343,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
@@ -347,15 +354,31 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
- [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
- [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
- [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
- [Promptery](https://github.com/promptery/promptery) (Desktop client for Ollama)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assistant personalized by what you have seen on your screen, heard, and said in meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt)
- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application available for Mac/Windows/Linux)
- [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
- [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
### Cloud
- [Google Cloud](https://cloud.google.com/run/docs/tutorials/gpu-gemma2-with-ollama)
- [Fly.io](https://fly.io/docs/python/do-more/add-ollama/)
- [Koyeb](https://www.koyeb.com/deploy/ollama)
### Terminal
@@ -379,12 +402,18 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [tlm](https://github.com/yusufcanb/tlm)
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
- [gollama](https://github.com/sammcj/gollama)
- [ParLlama](https://github.com/paulrobello/parllama)
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
- [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
- [x-cmd ollama](https://x-cmd.com/mod/ollama)
- [bb7](https://github.com/drunkwcodes/bb7)
- [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage)
- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
### Apple Vision Pro
- [Enchanted](https://github.com/AugustDev/enchanted)
### Database
@@ -406,9 +435,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
- [crewAI](https://github.com/crewAIInc/crewAI)
- [Spring AI](https://github.com/spring-projects/spring-ai) with [reference](https://docs.spring.io/spring-ai/reference/api/chat/ollama-chat.html) and [example](https://github.com/tzolov/ollama-tools)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LLPhant](https://github.com/theodo-group/LLPhant?tab=readme-ov-file#ollama)
- [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
@@ -433,12 +464,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
- [LlamaScript](https://github.com/Project-Llama/llamascript)
- [llm-axe](https://github.com/emirsahin1/llm-axe) (Python Toolkit for Building LLM Powered Apps)
- [Gollm](https://docs.gollm.co/examples/ollama-example)
- [Gollama for Golang](https://github.com/jonathanhecl/gollama)
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library made to simplify the development of small generative AI applications with Ollama.
- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
- [Ollama for Swift](https://github.com/mattt/ollama-swift)
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A TypeScript/JavaScript library providing access to different LLMs through a unified API)
@@ -455,6 +491,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
- [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
- [Continue](https://github.com/continuedev/continue)
- [Vibe](https://github.com/thewh1teagle/vibe) (Transcribe and analyze meetings with Ollama)
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
- [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
- [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
@@ -479,6 +516,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in Python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install the Ollama client & models on any OS for apps that depend on the Ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy a ready-to-use Ollama service on AWS, together with its Open WebUI front end)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
@@ -490,4 +529,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -64,7 +64,7 @@ func initStore() {
slog.Debug(fmt.Sprintf("unexpected error searching for store: %s", err))
}
slog.Debug("initializing new store")
store.ID = uuid.New().String()
store.ID = uuid.NewString()
writeStore(getStorePath())
}

View File

@@ -39,7 +39,7 @@ func (t *winTray) UpdateAvailable(ver string) error {
if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {

View File

@@ -10,6 +10,6 @@ const (
quitMenuTitle = "Quit Ollama"
updateAvailableMenuTitle = "An update is available"
updateMenutTitle = "Restart to update"
updateMenuTitle = "Restart to update"
diagLogsMenuTitle = "View logs"
)

View File

@@ -319,8 +319,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = append(opts.Messages, newMessage)
}
fmt.Println("Set system message.")
sb.Reset()
sb.Reset()
continue
default:

View File

@@ -830,10 +830,30 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m
### Parameters
- `name`: name of the model to create
- `model`: name of the model to create
- `modelfile` (optional): contents of the Modelfile
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
- `path` (optional): path to the Modelfile
- `quantize` (optional): quantize a non-quantized (e.g. float16) model
#### Quantization types
| Type | Recommended |
| --- | :-: |
| q2_K | |
| q3_K_L | |
| q3_K_M | |
| q3_K_S | |
| q4_0 | |
| q4_1 | |
| q4_K_M | * |
| q4_K_S | |
| q5_0 | |
| q5_1 | |
| q5_K_M | |
| q5_K_S | |
| q6_K | |
| q8_0 | * |
### Examples
@@ -845,14 +865,14 @@ Create a new model from a `Modelfile`.
```shell
curl http://localhost:11434/api/create -d '{
"name": "mario",
"model": "mario",
"modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
}'
```
##### Response
A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.
A stream of JSON objects is returned:
```json
{"status":"reading model metadata"}
@@ -868,13 +888,43 @@ A stream of JSON objects. Notice that the final JSON object shows a `"status": "
{"status":"success"}
```
#### Quantize a model
Quantize a non-quantized model.
##### Request
```shell
curl http://localhost:11434/api/create -d '{
"model": "llama3.1:quantized",
"modelfile": "FROM llama3.1:8b-instruct-fp16",
"quantize": "q4_K_M"
}'
```
##### Response
A stream of JSON objects is returned:
```
{"status":"quantizing F16 model to Q4_K_M"}
{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
{"status":"writing manifest"}
{"status":"success"}
```
### Check if a Blob Exists
```shell
HEAD /api/blobs/:digest
```
Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.
Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com.
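For illustration, the check is a plain HEAD request against the blob's digest; it returns 200 OK if the blob exists and 404 Not Found if it does not (the digest below is only a placeholder):

```shell
curl -I http://localhost:11434/api/blobs/sha256:0000000000000000000000000000000000000000000000000000000000000000
```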
#### Query Parameters
@@ -979,7 +1029,7 @@ Show information about a model including details, modelfile, template, parameter
### Parameters
- `name`: name of the model to show
- `model`: name of the model to show
- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
### Examples
@@ -988,7 +1038,7 @@ Show information about a model including details, modelfile, template, parameter
```shell
curl http://localhost:11434/api/show -d '{
"name": "llama3.2"
"model": "llama3.2"
}'
```
@@ -1068,7 +1118,7 @@ Delete a model and its data.
### Parameters
- `name`: model name to delete
- `model`: model name to delete
### Examples
@@ -1076,7 +1126,7 @@ Delete a model and its data.
```shell
curl -X DELETE http://localhost:11434/api/delete -d '{
"name": "llama3:13b"
"model": "llama3:13b"
}'
```
@@ -1094,7 +1144,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
### Parameters
- `name`: name of the model to pull
- `model`: name of the model to pull
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -1104,7 +1154,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell
curl http://localhost:11434/api/pull -d '{
"name": "llama3.2"
"model": "llama3.2"
}'
```
@@ -1166,7 +1216,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
### Parameters
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
- `model`: name of the model to push in the form of `<namespace>/<model>:<tag>`
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -1176,7 +1226,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
```shell
curl http://localhost:11434/api/push -d '{
"name": "mattw/pygmalion:latest"
"model": "mattw/pygmalion:latest"
}'
```

View File

@@ -81,7 +81,7 @@ If you have a GGUF based model or adapter it is possible to import it into Ollam
* converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
* downloading a model or adapter from a place such as HuggingFace
To import a GGUF model, create a `Modelfile` containg:
To import a GGUF model, create a `Modelfile` containing:
```dockerfile
FROM /path/to/file.gguf

View File

@@ -104,6 +104,12 @@ If you are experiencing problems getting Ollama to correctly discover or use you
- `OLLAMA_DEBUG=1`: additional information will be reported during GPU discovery
- Check dmesg for any errors from the amdgpu or kfd drivers: `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
## Multiple AMD GPUs
If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
## Windows Terminal Errors
Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in long strings of control characters like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`. To resolve this problem, please update to Win 10 22H1 or newer.

View File

@@ -1,83 +0,0 @@
# Running Ollama on Fly.io GPU Instances
Ollama runs with little to no configuration on [Fly.io GPU instances](https://fly.io/docs/gpus/gpu-quickstart/). If you don't have access to GPUs yet, you'll need to [apply for access](https://fly.io/gpu/) on the waitlist. Once you're accepted, you'll get an email with instructions on how to get started.
Create a new app with `fly apps create`:
```bash
fly apps create
```
Then create a `fly.toml` file in a new folder that looks like this:
```toml
app = "sparkling-violet-709"
primary_region = "ord"
vm.size = "a100-40gb" # see https://fly.io/docs/gpus/gpu-quickstart/ for more info
[build]
image = "ollama/ollama"
[http_service]
internal_port = 11434
force_https = false
auto_stop_machines = true
auto_start_machines = true
min_machines_running = 0
processes = ["app"]
[mounts]
source = "models"
destination = "/root/.ollama"
initial_size = "100gb"
```
Then create a [new private IPv6 address](https://fly.io/docs/reference/private-networking/#flycast-private-load-balancing) for your app:
```bash
fly ips allocate-v6 --private
```
Then deploy your app:
```bash
fly deploy
```
And finally you can access it interactively with a new Fly.io Machine:
```
fly machine run -e OLLAMA_HOST=http://your-app-name.flycast --shell ollama/ollama
```
```bash
$ ollama run openchat:7b-v3.5-fp16
>>> How do I bake chocolate chip cookies?
To bake chocolate chip cookies, follow these steps:
1. Preheat the oven to 375°F (190°C) and line a baking sheet with parchment paper or silicone baking mat.
2. In a large bowl, mix together 1 cup of unsalted butter (softened), 3/4 cup granulated sugar, and 3/4
cup packed brown sugar until light and fluffy.
3. Add 2 large eggs, one at a time, to the butter mixture, beating well after each addition. Stir in 1
teaspoon of pure vanilla extract.
4. In a separate bowl, whisk together 2 cups all-purpose flour, 1/2 teaspoon baking soda, and 1/2 teaspoon
salt. Gradually add the dry ingredients to the wet ingredients, stirring until just combined.
5. Fold in 2 cups of chocolate chips (or chunks) into the dough.
6. Drop rounded tablespoons of dough onto the prepared baking sheet, spacing them about 2 inches apart.
7. Bake for 10-12 minutes, or until the edges are golden brown. The centers should still be slightly soft.
8. Allow the cookies to cool on the baking sheet for a few minutes before transferring them to a wire rack
to cool completely.
Enjoy your homemade chocolate chip cookies!
```
When you set it up like this, it will automatically turn off when you're done using it. Then when you access it again, it will automatically turn back on. This is a great way to save money on GPU instances when you're not using them. If you want a persistent wake-on-use connection to your Ollama instance, you can set up a [connection to your Fly network using WireGuard](https://fly.io/docs/reference/private-networking/#discovering-apps-through-dns-on-a-wireguard-connection). Then you can access your Ollama instance at `http://your-app-name.flycast`.
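As a quick sketch (assuming the WireGuard connection above is already up, and reusing the example app name from the `fly.toml`), the instance then behaves like any other Ollama host:

```bash
# Point the Ollama CLI (or any HTTP client) at the app's Flycast address
export OLLAMA_HOST=http://sparkling-violet-709.flycast
ollama run openchat:7b-v3.5-fp16 "Why is the sky blue?"
```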
And that's it!

View File

@@ -1,77 +0,0 @@
# Using LangChain with Ollama using JavaScript
In this tutorial, we are going to use JavaScript with LangChain and Ollama to learn about something just a touch more recent. In August 2023, there was a series of wildfires on Maui. There is no way an LLM trained before that time can know about this, since its training data would not include anything that recent. So we can find the [Wikipedia article about the fires](https://en.wikipedia.org/wiki/2023_Hawaii_wildfires) and ask questions about the contents.
To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:
```bash
npm install @langchain/community
```
Now we can start building out our JavaScript:
```javascript
import { Ollama } from "@langchain/community/llms/ollama";
const ollama = new Ollama({
baseUrl: "http://localhost:11434",
model: "llama3.2",
});
const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer);
```
That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash
npm install cheerio
```
```javascript
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
const loader = new CheerioWebBaseLoader("https://en.wikipedia.org/wiki/2023_Hawaii_wildfires");
const data = await loader.load();
```
That will load the document. Although this page is smaller than the Odyssey, it is certainly bigger than the context size for most LLMs. So we are going to need to split it into smaller pieces, and then select just the pieces relevant to our question. This is a great use for a vector datastore. In this example, we will use the **MemoryVectorStore** that is part of **LangChain**. But there is one more thing we need to get the content into the datastore. We have to run an embeddings process that converts the tokens in the text into a series of vectors. And for that, we are going to use **Tensorflow**. There is a lot of stuff going on in this one. First, install the **Tensorflow** components that we need.
```bash
npm install @tensorflow/tfjs-core@3.6.0 @tensorflow/tfjs-converter@3.6.0 @tensorflow-models/universal-sentence-encoder@1.3.3 @tensorflow/tfjs-node@4.10.0
```
If you just install those components without the version numbers, it will install the latest versions, but there are conflicts within **Tensorflow**, so you need to install the compatible versions.
```javascript
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory";
import "@tensorflow/tfjs-node";
import { TensorFlowEmbeddings } from "langchain/embeddings/tensorflow";
// Split the text into 500 character chunks. And overlap each chunk by 20 characters
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 20
});
const splitDocs = await textSplitter.splitDocuments(data);
// Then use the TensorFlow Embedding to store these chunks in the datastore
const vectorStore = await MemoryVectorStore.fromDocuments(splitDocs, new TensorFlowEmbeddings());
```
To connect the datastore to a question asked of an LLM, we need to use the concept at the heart of **LangChain**: the chain. Chains are a way to connect a number of activities together to accomplish a particular task. There are a number of chain types available, but for this tutorial we are using the **RetrievalQAChain**.
```javascript
import { RetrievalQAChain } from "langchain/chains";
const retriever = vectorStore.asRetriever();
const chain = RetrievalQAChain.fromLLM(ollama, retriever);
const result = await chain.call({query: "When was Hawaii's request for a major disaster declaration approved?"});
console.log(result.text)
```
So we created a retriever, which is a way to return the chunks that match a query from a datastore. And then connect the retriever and the model via a chain. Finally, we send a query to the chain, which results in an answer using our document as a source. The answer it returned was correct, August 10, 2023.
And that is a simple introduction to what you can do with **LangChain** and **Ollama.**

View File

@@ -1,85 +0,0 @@
# Using LangChain with Ollama in Python
Let's imagine we are studying the classics, such as **the Odyssey** by **Homer**. We might have a question about Neleus and his family. If you ask llama2 for that info, you may get something like:
> I apologize, but I'm a large language model, I cannot provide information on individuals or families that do not exist in reality. Neleus is not a real person or character, and therefore does not have a family or any other personal details. My apologies for any confusion. Is there anything else I can help you with?
This sounds like a typical censored response, but even llama2-uncensored gives a mediocre answer:
> Neleus was a legendary king of Pylos and the father of Nestor, one of the Argonauts. His mother was Clymene, a sea nymph, while his father was Neptune, the god of the sea.
So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.
Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:
`pip install langchain_community`
Then we can create a model and ask the question:
```python
from langchain_community.llms import Ollama
ollama = Ollama(
base_url='http://localhost:11434',
model="llama3"
)
print(ollama.invoke("why is the sky blue"))
```
Notice that we are defining the model and the base URL for Ollama.
Now let's load a document to ask questions against. I'll load up the Odyssey by Homer, which you can find at Project Gutenberg. We will need **WebBaseLoader** which is part of **LangChain** and loads text from any webpage. On my machine, I also needed to install **bs4** to get that to work, so run `pip install bs4`.
```python
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://www.gutenberg.org/files/1727/1727-h/1727-h.htm")
data = loader.load()
```
This file is pretty big; the preface alone is 3000 tokens, which means the full document won't fit into the context for the model. So we need to split it up into smaller pieces.
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)
```
It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database: `pip install chromadb`
We also need to pull an embedding model: `ollama pull nomic-embed-text`
```python
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")
vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
```
Now let's ask a question from the document. **Who was Neleus, and who is in his family?** Neleus is a character in the Odyssey, and the answer can be found in our text.
```python
question="Who is Neleus and who is in Neleus' family?"
docs = vectorstore.similarity_search(question)
len(docs)
```
This will output the number of chunks of data that matched the search.
The next thing is to send the question and the relevant parts of the docs to the model to see if we can get a good answer. But we are stitching two parts of the process together, and that is called a chain. This means we need to define a chain:
```python
from langchain.chains import RetrievalQA
qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
res = qachain.invoke({"query": question})
print(res['result'])
```
The answer received from this chain was:
> Neleus is a character in Homer's "Odyssey" and is mentioned in the context of Penelope's suitors. Neleus is the father of Chloris, who is married to Neleus and bears him several children, including Nestor, Chromius, Periclymenus, and Pero. Amphinomus, the son of Nisus, is also mentioned as a suitor of Penelope and is known for his good natural disposition and agreeable conversation.
It's not a perfect answer, as it implies Neleus married his daughter when actually Chloris "was the youngest daughter to Amphion son of Iasus and king of Minyan Orchomenus, and was Queen in Pylos".
I updated the chunk_overlap for the text splitter to 20 and tried again and got a much better answer:
> Neleus is a character in Homer's epic poem "The Odyssey." He is the husband of Chloris, who is the youngest daughter of Amphion son of Iasus and king of Minyan Orchomenus. Neleus has several children with Chloris, including Nestor, Chromius, Periclymenus, and Pero.
And that is a much better answer.
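For reference, that tweak is a one-line change to the splitter created earlier:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same 500-character chunks, but overlapping by 20 characters so context
# isn't lost at chunk boundaries
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
all_splits = text_splitter.split_documents(data)
```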

View File

@@ -1,15 +0,0 @@
# Running Ollama on NVIDIA Jetson Devices
Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions.
The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
- Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Start an interactive session: `ollama run mistral`
And that's it!
# Running Ollama in Docker
When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
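A minimal sketch of that approach follows; the exact commands and package name are assumptions, so check the jetson-containers README for the currently recommended invocation:

```bash
# Clone the jetson-containers tooling and install its helper scripts,
# then launch an Ollama container matched to the installed JetPack/L4T version
git clone https://github.com/dusty-nv/jetson-containers
bash jetson-containers/install.sh
jetson-containers run $(autotag ollama)
```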

go.mod
View File

@@ -7,7 +7,7 @@ require (
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.10.0
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/uuid v1.1.2
github.com/google/uuid v1.6.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0

go.sum
View File

@@ -113,8 +113,9 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=

View File

@@ -10,7 +10,38 @@ import (
"github.com/ollama/ollama/api"
)
func TestLongInputContext(t *testing.T) {
// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
// we asked for and there is nothing extra that we could spill over into
t.Setenv("OLLAMA_NUM_PARALLEL", "1")
// Longer needed for small footprint GPUs
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "llama2",
Prompt: "Oh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_ctx": 128,
},
}
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {
// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
// we asked for and there is nothing extra that we could spill over into
t.Setenv("OLLAMA_NUM_PARALLEL", "1")
// Longer needed for small footprint GPUs
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()

View File

@@ -55,7 +55,7 @@ go build -tags avx,cuda .
### ROCm
Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
Install [ROCm](https://rocm.docs.amd.com/en/latest/).
```shell
make ggml_hipblas.so
@@ -77,7 +77,7 @@ go build -tags avx,cuda .
### ROCm
Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/).
Install [ROCm](https://rocm.docs.amd.com/en/latest/).
```shell
make ggml_hipblas.dll

View File

@@ -157,9 +157,7 @@ type Context struct {
numThreads int
}
func (c *Context) KvCacheClear() {
C.llama_kv_cache_clear(c.c)
}
var ErrKvCacheFull = errors.New("could not find a kv cache slot")
func (c *Context) Decode(batch *Batch) error {
// Positive return values do not mean a fatal error, but rather a warning.
@@ -173,7 +171,7 @@ func (c *Context) Decode(batch *Batch) error {
}
if code > 0 {
return fmt.Errorf("could not find a KV slot for the batch - try reducing the size of the batch or increase the context. code: %d", code)
return ErrKvCacheFull
}
return nil
@@ -195,6 +193,14 @@ func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
}
func (c *Context) KvCacheClear() {
C.llama_kv_cache_clear(c.c)
}
func (c *Context) KvCacheDefrag() {
C.llama_kv_cache_defrag(c.c)
}
// Get the embeddings for a sequence id
func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
@@ -384,6 +390,8 @@ func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ...
if logits {
unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1
} else {
unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 0
}
b.c.n_tokens += 1

View File

@@ -203,7 +203,11 @@ func countCommonPrefix(a []input, b []input) int {
// the newest half into that space (saving numKeep inputs at the beginning).
//
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error {
if numKeep >= c.numCtx {
return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
}
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
@@ -211,18 +215,22 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) {
discard := targetFree - currentFree
if discard <= 0 {
return
return nil
}
slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs),
slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
"keep", numKeep, "discard", discard)
// TODO (jessegross): KV cache removal can fail for certain types of models
c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard)
if !c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) {
return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v)", slot.Id, numKeep, discard)
}
c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
for i := numKeep + discard; i < len(slot.Inputs); i++ {
slot.Inputs[i-discard] = slot.Inputs[i]
}
slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
return nil
}

View File

@@ -45,6 +45,9 @@ type Sequence struct {
// prompt inputs left to evaluate
inputs []input
// inputs that have been added to a batch but not yet submitted to Decode
pendingInputs []input
// tokens that have been generated but not returned yet (e.g. for stop sequences)
pendingResponses []string
@@ -119,7 +122,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
if len(inputs) > s.cache.numCtx {
slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
newInputs := inputs[:params.numKeep]
newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
inputs = newInputs
}
var sc *llama.SamplingContext
@@ -324,7 +330,11 @@ func (s *Server) run(ctx context.Context) {
case <-ctx.Done():
return
default:
s.processBatch(tokenBatch, embedBatch)
err := s.processBatch(tokenBatch, embedBatch)
if err != nil {
panic(err)
}
tokenBatch.Clear()
embedBatch.Clear()
}
@@ -338,7 +348,7 @@ func (s *Server) run(ctx context.Context) {
// these should instead be handled by the handlers
// it should only be responsible for accepting tokens or embeddings and
// processing batches as fast as possible
func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) {
func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) error {
s.mu.Lock()
for s.allNil() {
s.cond.Wait() // Wait until an item is added
@@ -357,28 +367,19 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
continue
}
// If an error occurred during the processing of a previous batch then we may have emptied the inputs
// without adding a new one. In this case, end the sequence rather than infinite looping.
if len(seq.inputs) == 0 {
slog.Error("removing sequence due to no input tokens", "index", seqIdx, "cache id", seq.cache.Id)
s.removeSequence(seqIdx, "error")
continue
}
// if past the num predict limit
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
s.removeSequence(seqIdx, "limit")
continue
}
var numInputsProcessed int
shifted := false
for i, input := range seq.inputs {
if len(seq.cache.Inputs)+1 > s.cache.numCtx {
if !shifted {
s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
shifted = true
if len(seq.cache.Inputs)+len(seq.pendingInputs)+1 > s.cache.numCtx {
if len(seq.pendingInputs) == 0 {
err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
if err != nil {
return err
}
} else {
break
}
@@ -407,27 +408,30 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
crossAttention = seq.crossAttention
batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id)
seq.cache.Inputs = append(seq.cache.Inputs, input)
numInputsProcessed++
}
if numInputsProcessed > 0 {
seq.inputs = seq.inputs[numInputsProcessed:]
batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
seq.pendingInputs = append(seq.pendingInputs, input)
seq.iBatch = batch.NumTokens() - 1
}
seq.inputs = seq.inputs[len(seq.pendingInputs):]
}
if batch == nil || batch.NumTokens() == 0 {
return
return nil
}
s.lc.SetCrossAttention(crossAttention)
err := s.lc.Decode(batch)
if err != nil {
slog.Error("failed to decode batch", "error", err)
return
if errors.Is(err, llama.ErrKvCacheFull) {
slog.Debug("defragmenting kv cache")
s.cache.lc.KvCacheDefrag()
err = s.lc.Decode(batch)
}
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
}
}
if crossAttention {
@@ -442,6 +446,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
continue
}
// After calling Decode, pending inputs are now in the cache
if len(seq.pendingInputs) > 0 {
seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
seq.pendingInputs = []input{}
}
// don't sample prompt processing
if len(seq.inputs) != 0 {
continue
@@ -454,7 +464,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
// if done processing the prompt, generate an embedding and return
if seq.embeddingOnly {
embed := s.lc.GetEmbeddingsSeq(i)
embed := s.lc.GetEmbeddingsSeq(seq.cache.Id)
if embed == nil {
embed = s.lc.GetEmbeddingsIth(seq.iBatch)
}
@@ -524,6 +534,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
s.removeSequence(i, "connection")
}
}
return nil
}
// TODO (jmorganca): use structs from the api package to avoid duplication

View File

@@ -838,13 +838,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
if strings.Contains(err.Error(), "unexpected EOF") || strings.Contains(err.Error(), "forcibly closed") {
s.Close()
msg := ""
var msg string
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
} else {
msg = err.Error()
}
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
return fmt.Errorf("an error was encountered while running the model: %s", msg)
}
return fmt.Errorf("error reading llm response: %v", err)

View File

@@ -27,6 +27,7 @@ var errorPrefixes = []string{
"\"ERR\"",
"error loading model",
"GGML_ASSERT",
"Deepseek2 does not support K-shift",
}
func (w *StatusWriter) Write(b []byte) (int, error) {

View File

@@ -5,7 +5,6 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$V
# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
--build-arg=GOFLAGS \

View File

@@ -4,8 +4,8 @@
set -eu
red="$( (/usr/bin/tput bold; /usr/bin/tput setaf 1; :) 2>&-)"
plain="$( (/usr/bin/tput sgr0; :) 2>&-)"
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
status() { echo ">>> $*" >&2; }
error() { echo "${red}ERROR:${plain} $*"; exit 1; }

View File

@@ -13,6 +13,7 @@ import (
"io"
"log"
"log/slog"
"net"
"net/http"
"net/url"
"os"
@@ -1071,6 +1072,21 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
return nil, errUnauthorized
}
// testMakeRequestDialContext specifies the dial function for the http client in
// makeRequest. It can be used to resolve hosts in model names to local
// addresses for testing. For example, the model name ("example.com/my/model")
// can be directed to push/pull from "127.0.0.1:1234".
//
// This is not safe to set across goroutines. It should be set in
// the main test goroutine, and not by tests marked to run in parallel with
// t.Parallel().
//
// It should be cleared after use, otherwise it will affect other tests.
//
// Ideally we would have some way to set this up the stack, but the code is not
// structured in a way that makes this easy, so this will have to do for now.
var testMakeRequestDialContext func(ctx context.Context, network, addr string) (net.Conn, error)
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *registryOptions) (*http.Response, error) {
if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
requestURL.Scheme = "http"
@@ -1105,6 +1121,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
}
resp, err := (&http.Client{
Transport: &http.Transport{
DialContext: testMakeRequestDialContext,
},
CheckRedirect: regOpts.CheckRedirect,
}).Do(req)
if err != nil {

View File

@@ -507,7 +507,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
if err != nil {
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embedding: %v", err)})
return
}
@@ -540,7 +540,8 @@ func (s *Server) PullHandler(c *gin.Context) {
return
}
if err := checkNameExists(name); err != nil {
name, err = getExistingName(name)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
@@ -621,19 +622,20 @@ func (s *Server) PushHandler(c *gin.Context) {
streamResponse(c, ch)
}
func checkNameExists(name model.Name) error {
names, err := Manifests(true)
// getExistingName returns the original, on disk name if the input name is a
// case-insensitive match, otherwise it returns the input name.
func getExistingName(n model.Name) (model.Name, error) {
var zero model.Name
existing, err := Manifests(true)
if err != nil {
return err
return zero, err
}
for n := range names {
if strings.EqualFold(n.Filepath(), name.Filepath()) && n != name {
return errors.New("a model with that name already exists")
for e := range existing {
if n.EqualFold(e) {
return e, nil
}
}
return nil
return n, nil
}
func (s *Server) CreateHandler(c *gin.Context) {
@@ -652,7 +654,8 @@ func (s *Server) CreateHandler(c *gin.Context) {
return
}
if err := checkNameExists(name); err != nil {
name, err := getExistingName(name)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
@@ -958,14 +961,19 @@ func (s *Server) CopyHandler(c *gin.Context) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
return
}
src, err := getExistingName(src)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
dst := model.ParseName(r.Destination)
if !dst.IsValid() {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Destination)})
return
}
if err := checkNameExists(dst); err != nil {
dst, err = getExistingName(dst)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}

View File

@@ -7,13 +7,18 @@ import (
"encoding/json"
"fmt"
"io"
"io/fs"
"math"
"math/rand/v2"
"net"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"sort"
"strings"
"testing"
"unicode"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
@@ -473,83 +478,129 @@ func Test_Routes(t *testing.T) {
}
}
func TestCase(t *testing.T) {
func casingShuffle(s string) string {
rr := []rune(s)
for i := range rr {
if rand.N(2) == 0 {
rr[i] = unicode.ToUpper(rr[i])
} else {
rr[i] = unicode.ToLower(rr[i])
}
}
return string(rr)
}
func TestManifestCaseSensitivity(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
cases := []string{
"mistral",
"llama3:latest",
"library/phi3:q4_0",
"registry.ollama.ai/library/gemma:q5_K_M",
// TODO: host:port currently fails on windows (#4107)
// "localhost:5000/alice/bob:latest",
r := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
io.WriteString(w, `{}`) //nolint:errcheck
}))
defer r.Close()
nameUsed := make(map[string]bool)
name := func() string {
const fqmn = "example/namespace/model:tag"
for {
v := casingShuffle(fqmn)
if nameUsed[v] {
continue
}
nameUsed[v] = true
return v
}
}
wantStableName := name()
// checkManifestList tests that there is strictly one manifest in the
// models directory, and that the manifest is for the model under test.
checkManifestList := func() {
t.Helper()
mandir := filepath.Join(os.Getenv("OLLAMA_MODELS"), "manifests/")
var entries []string
t.Logf("dir entries:")
fsys := os.DirFS(mandir)
err := fs.WalkDir(fsys, ".", func(path string, info fs.DirEntry, err error) error {
if err != nil {
return err
}
t.Logf(" %s", fs.FormatDirEntry(info))
if info.IsDir() {
return nil
}
path = strings.TrimPrefix(path, mandir)
entries = append(entries, path)
return nil
})
if err != nil {
t.Fatalf("failed to walk directory: %v", err)
}
if len(entries) != 1 {
t.Errorf("len(got) = %d, want 1", len(entries))
return // do not use Fatal so following steps run
}
g := entries[0] // raw path
g = filepath.ToSlash(g)
w := model.ParseName(wantStableName).Filepath()
w = filepath.ToSlash(w)
if g != w {
t.Errorf("\ngot: %s\nwant: %s", g, w)
}
}
checkOK := func(w *httptest.ResponseRecorder) {
t.Helper()
if w.Code != http.StatusOK {
t.Errorf("code = %d, want 200", w.Code)
t.Logf("body: %s", w.Body.String())
}
}
var s Server
for _, tt := range cases {
t.Run(tt, func(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: tt,
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200 got %d", w.Code)
}
expect, err := json.Marshal(map[string]string{"error": "a model with that name already exists"})
if err != nil {
t.Fatal(err)
}
t.Run("create", func(t *testing.T) {
w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: strings.ToUpper(tt),
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status 500 got %d", w.Code)
}
if !bytes.Equal(w.Body.Bytes(), expect) {
t.Fatalf("expected error %s got %s", expect, w.Body.String())
}
})
t.Run("pull", func(t *testing.T) {
w := createRequest(t, s.PullHandler, api.PullRequest{
Name: strings.ToUpper(tt),
Stream: &stream,
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status 500 got %d", w.Code)
}
if !bytes.Equal(w.Body.Bytes(), expect) {
t.Fatalf("expected error %s got %s", expect, w.Body.String())
}
})
t.Run("copy", func(t *testing.T) {
w := createRequest(t, s.CopyHandler, api.CopyRequest{
Source: tt,
Destination: strings.ToUpper(tt),
})
if w.Code != http.StatusBadRequest {
t.Fatalf("expected status 500 got %d", w.Code)
}
if !bytes.Equal(w.Body.Bytes(), expect) {
t.Fatalf("expected error %s got %s", expect, w.Body.String())
}
})
})
testMakeRequestDialContext = func(ctx context.Context, _, _ string) (net.Conn, error) {
var d net.Dialer
return d.DialContext(ctx, "tcp", r.Listener.Addr().String())
}
t.Cleanup(func() { testMakeRequestDialContext = nil })
t.Logf("creating")
checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
// Start with the stable name, and later use a case-shuffled
// version.
Name: wantStableName,
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream,
}))
checkManifestList()
t.Logf("creating (again)")
checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
Name: name(),
Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
Stream: &stream,
}))
checkManifestList()
t.Logf("pulling")
checkOK(createRequest(t, s.PullHandler, api.PullRequest{
Name: name(),
Stream: &stream,
Insecure: true,
}))
checkManifestList()
t.Logf("copying")
checkOK(createRequest(t, s.CopyHandler, api.CopyRequest{
Source: name(),
Destination: name(),
}))
checkManifestList()
}
func TestShow(t *testing.T) {

View File

@@ -298,6 +298,13 @@ func (n Name) LogValue() slog.Value {
return slog.StringValue(n.String())
}
func (n Name) EqualFold(o Name) bool {
return strings.EqualFold(n.Host, o.Host) &&
strings.EqualFold(n.Namespace, o.Namespace) &&
strings.EqualFold(n.Model, o.Model) &&
strings.EqualFold(n.Tag, o.Tag)
}
func isValidLen(kind partKind, s string) bool {
switch kind {
case kindHost: