Merge branch 'ollama:main' into main

likelovewant committed 2024-05-06 22:32:02 +08:00 (committed by GitHub)
49 changed files with 851 additions and 504 deletions

@@ -370,12 +370,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
 - [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
-- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
 - [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
 - [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
+- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
+- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -384,4 +385,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 ### Supported backends
 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.


@@ -5,12 +5,14 @@ import (
"log/slog" "log/slog"
"os" "os"
"path/filepath" "path/filepath"
"github.com/ollama/ollama/server/envconfig"
) )
func InitLogging() { func InitLogging() {
level := slog.LevelInfo level := slog.LevelInfo
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { if envconfig.Debug {
level = slog.LevelDebug level = slog.LevelDebug
} }


@@ -31,16 +31,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd "/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed "/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
} }
// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts) // make the upgrade as quiet as possible (no GUI, no prompts)
// TODO - temporarily disable since we're pinning in debug mode for the preview
// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
installArgs = append(installArgs, installArgs = append(installArgs,
"/SP", // Skip the "This will install... Do you wish to continue" prompt "/SP", // Skip the "This will install... Do you wish to continue" prompt
"/SUPPRESSMSGBOXES", "/SUPPRESSMSGBOXES",
"/SILENT", "/SILENT",
"/VERYSILENT", "/VERYSILENT",
) )
// }
// Safeguard in case we have requests in flight that need to drain... // Safeguard in case we have requests in flight that need to drain...
slog.Info("Waiting for server to shutdown") slog.Info("Waiting for server to shutdown")


@@ -1,71 +1,71 @@
 //go:build windows

 package wintray

 import (
 	"fmt"
 	"log/slog"
 	"unsafe"

 	"golang.org/x/sys/windows"
 )

 const (
 	updatAvailableMenuID = 1
 	updateMenuID         = updatAvailableMenuID + 1
 	separatorMenuID      = updateMenuID + 1
 	diagLogsMenuID       = separatorMenuID + 1
 	diagSeparatorMenuID  = diagLogsMenuID + 1
 	quitMenuID           = diagSeparatorMenuID + 1
 )

 func (t *winTray) initMenus() error {
 	if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
 		return fmt.Errorf("unable to create menu entries %w\n", err)
 	}
 	if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
 		return fmt.Errorf("unable to create menu entries %w", err)
 	}
 	if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
 		return fmt.Errorf("unable to create menu entries %w\n", err)
 	}
 	return nil
 }

 func (t *winTray) UpdateAvailable(ver string) error {
 	if !t.updateNotified {
 		slog.Debug("updating menu and sending notification for new update")
 		if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
 		if err != nil {
 			return fmt.Errorf("unable to write icon data to temp file: %w", err)
 		}
 		if err := wt.setIcon(iconFilePath); err != nil {
 			return fmt.Errorf("unable to set icon: %w", err)
 		}
 		t.updateNotified = true
 		t.pendingUpdate = true
 		// Now pop up the notification
 		t.muNID.Lock()
 		defer t.muNID.Unlock()
 		copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
 		copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
 		t.nid.Flags |= NIF_INFO
 		t.nid.Timeout = 10
 		t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
 		err = t.nid.modify()
 		if err != nil {
 			return err
 		}
 	}
 	return nil
 }


@@ -34,7 +34,6 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/auth" "github.com/ollama/ollama/auth"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress" "github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server" "github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/errtypes"
@@ -57,13 +56,13 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
p := progress.NewProgress(os.Stderr) p := progress.NewProgress(os.Stderr)
defer p.Stop() defer p.Stop()
modelfile, err := os.Open(filename) f, err := os.Open(filename)
if err != nil { if err != nil {
return err return err
} }
defer modelfile.Close() defer f.Close()
commands, err := parser.Parse(modelfile) modelfile, err := model.ParseFile(f)
if err != nil { if err != nil {
return err return err
} }
@@ -77,10 +76,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
spinner := progress.NewSpinner(status) spinner := progress.NewSpinner(status)
p.Add(status, spinner) p.Add(status, spinner)
for i := range commands { for i := range modelfile.Commands {
switch commands[i].Name { switch modelfile.Commands[i].Name {
case "model", "adapter": case "model", "adapter":
path := commands[i].Args path := modelfile.Commands[i].Args
if path == "~" { if path == "~" {
path = home path = home
} else if strings.HasPrefix(path, "~/") { } else if strings.HasPrefix(path, "~/") {
@@ -92,7 +91,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
} }
fi, err := os.Stat(path) fi, err := os.Stat(path)
if errors.Is(err, os.ErrNotExist) && commands[i].Name == "model" { if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
continue continue
} else if err != nil { } else if err != nil {
return err return err
@@ -115,7 +114,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return err return err
} }
commands[i].Args = "@"+digest modelfile.Commands[i].Args = "@" + digest
} }
} }
@@ -145,7 +144,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
quantization, _ := cmd.Flags().GetString("quantization") quantization, _ := cmd.Flags().GetString("quantization")
request := api.CreateRequest{Name: args[0], Modelfile: parser.Format(commands), Quantization: quantization} request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
if err := client.Create(cmd.Context(), &request, fn); err != nil { if err := client.Create(cmd.Context(), &request, fn); err != nil {
return err return err
} }


@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
 		var err error
 		t, offset, err = m.readTensors(f, offset, params)
 		if err != nil {
-			slog.Error("%v", err)
+			slog.Error(err.Error())
 			return nil, err
 		}
 		tensors = append(tensors, t...)
@@ -122,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 		ggufName, err := m.GetLayerName(k)
 		if err != nil {
-			slog.Error("%v", err)
+			slog.Error(err.Error())
 			return nil, 0, err
 		}


@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
 		ggufName, err := tf.GetLayerName(k.(string))
 		if err != nil {
-			slog.Error("%v", err)
+			slog.Error(err.Error())
 			return nil, err
 		}
 		slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))


@@ -17,7 +17,7 @@
 ### Model names

-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations
@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
   "prompt": "Why is the sky blue?"
 }'
 ```
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "response": "The",
   "done": false
@@ -99,7 +99,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "",
   "done": true,
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
   "prompt": "Why is the sky blue?",
   "stream": false
 }'
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "The sky is blue because it is the color of the sky.",
   "done": true,
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
   "prompt": "What color is the sky at different times of the day? Respond using JSON",
   "format": "json",
   "stream": false
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-11-09T21:07:55.186497Z",
   "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
   "done": true,
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
   "prompt": "Why is the sky blue?",
   "stream": false,
   "options": {
@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "The sky is blue because it is the color of the sky.",
   "done": true,
@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2"
+  "model": "llama3"
 }'
 ```
@@ -364,7 +364,7 @@ A single JSON object is returned:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-12-18T19:52:07.071755Z",
   "response": "",
   "done": true
@@ -407,7 +407,7 @@ Send a chat message with a streaming response.
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
   "messages": [
     {
       "role": "user",
@@ -423,7 +423,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "message": {
     "role": "assistant",
@@ -438,7 +438,7 @@ Final response:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "done": true,
   "total_duration": 4883583458,
@@ -456,7 +456,7 @@ Final response:
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
   "messages": [
     {
       "role": "user",
@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "registry.ollama.ai/library/llama2:latest",
+  "model": "registry.ollama.ai/library/llama3:latest",
   "created_at": "2023-12-12T14:13:43.416799Z",
   "message": {
     "role": "assistant",
@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
   "messages": [
     {
       "role": "user",
@@ -519,7 +519,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "message": {
     "role": "assistant",
@@ -533,7 +533,7 @@ Final response:
 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "done": true,
   "total_duration": 8113331500,
@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
   "messages": [
     {
       "role": "user",
@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "registry.ollama.ai/library/llama2:latest",
+  "model": "registry.ollama.ai/library/llama3:latest",
   "created_at": "2023-12-12T14:13:43.416799Z",
   "message": {
     "role": "assistant",
@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`.
 ```shell
 curl http://localhost:11434/api/create -d '{
   "name": "mario",
-  "modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
+  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```
@@ -758,7 +758,7 @@ A single JSON object will be returned.
       }
     },
     {
-      "name": "llama2:latest",
+      "name": "llama3:latest",
       "modified_at": "2023-12-07T09:32:18.757212583-08:00",
       "size": 3825819519,
       "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter
 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama2"
+  "name": "llama3"
 }'
 ```
@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.
 ```shell
 curl http://localhost:11434/api/copy -d '{
-  "source": "llama2",
-  "destination": "llama2-backup"
+  "source": "llama3",
+  "destination": "llama3-backup"
 }'
 ```
@@ -854,7 +854,7 @@ Delete a model and its data.
 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-  "name": "llama2:13b"
+  "name": "llama3:13b"
 }'
 ```
@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama2"
+  "name": "llama3"
 }'
 ```


@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
   "prompt": "Why is the sky blue?",
   "options": {
     "num_ctx": 4096
@@ -88,9 +88,9 @@ On windows, Ollama inherits your user and system environment variables.
 3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
 4. Click OK/Apply to save
 5. Run `ollama` from a new terminal window

 ## How can I expose Ollama on my network?
@@ -221,14 +221,20 @@ The `keep_alive` parameter can be set to:
 For example, to preload a model and leave it in memory use:

 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
 ```

 To unload the model and free up memory use:

 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
 ```

 Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.

 If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
+
+## How do I manage the maximum number of requests the server can queue?
+
+If too many requests are sent to the server, it will respond with a 503 error
+indicating the server is overloaded. You can adjust how many requests may be
+queued by setting `OLLAMA_MAX_QUEUE`.
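A minimal sketch of raising that limit when launching the server directly; the value shown is illustrative, not part of this change:

```shell
# Allow up to 1024 queued requests before the server answers 503 (value is illustrative)
OLLAMA_MAX_QUEUE=1024 ollama serve
```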


@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
   - [FROM (Required)](#from-required)
-    - [Build from llama2](#build-from-llama2)
+    - [Build from llama3](#build-from-llama3)
     - [Build from a bin file](#build-from-a-bin-file)
   - [PARAMETER](#parameter)
     - [Valid Parameters and Values](#valid-parameters-and-values)
@@ -48,7 +48,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:

 ```modelfile
-FROM llama2
+FROM llama3
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -67,33 +67,25 @@ To use this:
 More examples are available in the [examples directory](../examples).

-### `Modelfile`s in [ollama.com/library][1]
-
-There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
-
-- Option 1: view a details page from a model's tags page:
-  1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
-  2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
-  3. Scroll down to "Layers"
-     - Note: if the [`FROM` instruction](#from-required) is not present,
-       it means the model was created from a local file
-- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
+To view the Modelfile of a given model, use the `ollama show --modelfile` command.

 ```bash
-> ollama show --modelfile llama2:13b
+> ollama show --modelfile llama3
 # Modelfile generated by "ollama show"
 # To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama2:13b
-FROM /root/.ollama/models/blobs/sha256:123abc
-TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
-{{ end }}{{ .Prompt }} [/INST] """
-SYSTEM """"""
-PARAMETER stop [INST]
-PARAMETER stop [/INST]
-PARAMETER stop <<SYS>>
-PARAMETER stop <</SYS>>
+# FROM llama3:latest
+FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
+TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ .Response }}<|eot_id|>"""
+PARAMETER stop "<|start_header_id|>"
+PARAMETER stop "<|end_header_id|>"
+PARAMETER stop "<|eot_id|>"
+PARAMETER stop "<|reserved_special_token"
 ```

 ## Instructions
@@ -106,10 +98,10 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```

-#### Build from llama2
+#### Build from llama3

 ```modelfile
-FROM llama2
+FROM llama3
 ```

 A list of available base models:


@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
       'content': 'Say this is a test',
     }
   ],
-  model='llama2',
+  model='llama3',
 )
 ```
@@ -43,7 +43,7 @@ const openai = new OpenAI({
 const chatCompletion = await openai.chat.completions.create({
   messages: [{ role: 'user', content: 'Say this is a test' }],
-  model: 'llama2',
+  model: 'llama3',
 })
 ```
@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
 curl http://localhost:11434/v1/chat/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "llama2",
+        "model": "llama3",
         "messages": [
            {
                "role": "system",
@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
 Before using a model, pull it locally `ollama pull`:

 ```shell
-ollama pull llama2
+ollama pull llama3
 ```

 ### Default model names
@@ -121,7 +121,7 @@ ollama pull llama2
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:

 ```
-ollama cp llama2 gpt-3.5-turbo
+ollama cp llama3 gpt-3.5-turbo
 ```

 Afterwards, this new model name can be specified in the `model` field:


@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama";
 const ollama = new Ollama({
   baseUrl: "http://localhost:11434",
-  model: "llama2",
+  model: "llama3",
 });

 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,10 +23,10 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```

-That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.

 ```bash
 npm install cheerio
 ```

 ```javascript


@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 # Ollama Windows Preview

 Welcome to the Ollama Windows preview.
@@ -27,7 +28,7 @@ Logs will often be helpful in diagnosing the problem (see
 Here's a quick example showing API access from `powershell`

 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```

 ## Troubleshooting
@@ -45,3 +46,17 @@ the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
 - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+
+## Standalone CLI
+
+The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
+installer. It installs in your account without requiring Administrator rights.
+We update Ollama regularly to support the latest models, and this installer will
+help you keep up to date.
+
+If you'd like to install or integrate Ollama as a service, a standalone
+`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
+and GPU library dependencies for Nvidia and AMD. This allows for embedding
+Ollama in existing applications, or running it as a system service via `ollama
+serve` with tools such as [NSSM](https://nssm.cc/).
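A rough sketch of the NSSM route described above; the install path and service name are assumptions, not part of this commit:

```powershell
# Register the unzipped standalone CLI as a system service (path and name are illustrative)
nssm install Ollama "C:\ollama\ollama.exe" serve
nssm start Ollama
```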


@@ -2,7 +2,7 @@
 When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:

-`ollama run llama2 < sourcequestions.txt`
+`ollama run llama3 < sourcequestions.txt`

 This concept is used in the following example.


@@ -35,7 +35,7 @@ func main() {
 	ctx := context.Background()
 	req := &api.ChatRequest{
-		Model:    "llama2",
+		Model:    "llama3",
 		Messages: messages,
 	}


@@ -19,7 +19,7 @@ func main() {
 	}
 	defer resp.Body.Close()

 	responseData, err := io.ReadAll(resp.Body)
 	if err != nil {
 		log.Fatal(err)


@@ -40,9 +40,9 @@ while True:
         continue

     # Prompt
     template = """Use the following pieces of context to answer the question at the end.
     If you don't know the answer, just say that you don't know, don't try to make up an answer.
     Use three sentences maximum and keep the answer as concise as possible.
     {context}
     Question: {question}
     Helpful Answer:"""
@@ -51,11 +51,11 @@ while True:
         template=template,
     )
-    llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
     qa_chain = RetrievalQA.from_chain_type(
         llm,
         retriever=vectorstore.as_retriever(),
         chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
     )
     result = qa_chain({"query": query})


@@ -1,12 +1,12 @@
-from langchain.llms import Ollama
-from langchain.document_loaders import WebBaseLoader
+from langchain_community.llms import Ollama
+from langchain_community.document_loaders import WebBaseLoader
 from langchain.chains.summarize import load_summarize_chain

 loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()

-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
 chain = load_summarize_chain(llm, chain_type="stuff")

-result = chain.run(docs)
+result = chain.invoke(docs)
 print(result)


@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
 ## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

    ```bash
-   ollama pull llama2
+   ollama pull llama3
    ```

 2. Install the Python Requirements.
@@ -21,4 +21,3 @@ This example is a basic "hello world" of using LangChain with Ollama.
    ```bash
    python main.py
    ```


@@ -1,6 +1,6 @@
 from langchain.llms import Ollama

 input = input("What is your question?")
-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
 res = llm.predict(input)
 print (res)


@@ -1,4 +1,4 @@
-FROM llama2
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from super mario bros, acting as an assistant.


@@ -2,12 +2,12 @@
 # Example character: Mario

-This example shows how to create a basic character using Llama2 as the base model.
+This example shows how to create a basic character using Llama3 as the base model.

 To run this example:

 1. Download the Modelfile
-2. `ollama pull llama2` to get the base model used in the model file.
+2. `ollama pull llama3` to get the base model used in the model file.
 3. `ollama create NAME -f ./Modelfile`
 4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
 What the model file looks like:

 ```
-FROM llama2
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from Super Mario Bros, acting as an assistant.


@@ -2,16 +2,16 @@ import requests
 import json
 import random

-model = "llama2"
+model = "llama3"
 template = {
     "firstName": "",
     "lastName": "",
     "address": {
         "street": "",
         "city": "",
         "state": "",
         "zipCode": ""
     },
     "phoneNumber": ""
 }


@@ -12,7 +12,7 @@ countries = [
"France", "France",
] ]
country = random.choice(countries) country = random.choice(countries)
model = "llama2" model = "llama3"
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters." prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."


@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
 ## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

    ```bash
-   ollama pull llama2
+   ollama pull llama3
    ```

 2. Install the Python Requirements.


@@ -2,7 +2,7 @@ import json
 import requests

 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama2"  # TODO: update this for whatever model you wish to use
+model = "llama3"  # TODO: update this for whatever model you wish to use

 def chat(messages):


@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
 ## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

    ```bash
-   ollama pull llama2
+   ollama pull llama3
    ```

 2. Install the Python Requirements.


@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
 ## Usage

-1. Add llama2 to have the mentors ask your questions:
+1. Add llama3 to have the mentors ask your questions:

    ```bash
-   ollama pull llama2
+   ollama pull llama3
    ```

 2. Install prerequisites:


@@ -15,7 +15,7 @@ async function characterGenerator() {
 ollama.setModel("stablebeluga2:70b-q4_K_M");
 const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);

-const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
+const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;

 fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
   if (err) throw err;
@@ -23,4 +23,4 @@ async function characterGenerator() {
 });
 }

 characterGenerator();


@@ -1,6 +1,6 @@
 import * as readline from "readline";

-const model = "llama2";
+const model = "llama3";
 type Message = {
 	role: "assistant" | "user" | "system";
 	content: string;
@@ -74,4 +74,4 @@ async function main() {
 }

 main();


@@ -12,6 +12,8 @@ import (
"sync" "sync"
"syscall" "syscall"
"time" "time"
"github.com/ollama/ollama/server/envconfig"
) )
var ( var (
@@ -24,45 +26,8 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-		// On Windows we do not carry the payloads inside the main executable
-		if runtime.GOOS == "windows" && runnersDir == "" {
-			appExe, err := os.Executable()
-			if err != nil {
-				slog.Error("failed to lookup executable path", "error", err)
-				return "", err
-			}
-			cwd, err := os.Getwd()
-			if err != nil {
-				slog.Error("failed to lookup working directory", "error", err)
-				return "", err
-			}
-			var paths []string
-			for _, root := range []string{filepath.Dir(appExe), cwd} {
-				paths = append(paths,
-					filepath.Join(root),
-					filepath.Join(root, "windows-"+runtime.GOARCH),
-					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-				)
-			}
-			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, p := range paths {
-				candidate := filepath.Join(p, "ollama_runners")
-				_, err := os.Stat(candidate)
-				if err == nil {
-					runnersDir = candidate
-					break
-				}
-			}
-			if runnersDir == "" {
-				err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-				slog.Error("incomplete distribution", "error", err)
-				return "", err
-			}
-		}
+		runnersDir := envconfig.RunnersDir

 		if runnersDir != "" {
 			payloadsDir = runnersDir
 			return payloadsDir, nil
@@ -70,7 +35,7 @@ func PayloadsDir() (string, error) {
 		// The remainder only applies on non-windows where we still carry payloads in the main executable
 		cleanupTmpDirs()
-		tmpDir := os.Getenv("OLLAMA_TMPDIR")
+		tmpDir := envconfig.TmpDir
 		if tmpDir == "" {
 			tmpDir, err = os.MkdirTemp("", "ollama")
 			if err != nil {
@@ -133,7 +98,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
+	runnersDir := envconfig.RunnersDir
 	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))


@@ -21,6 +21,7 @@ import (
"unsafe" "unsafe"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/server/envconfig"
) )
type handles struct { type handles struct {
@@ -268,7 +269,7 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
} }
func getVerboseState() C.uint16_t { func getVerboseState() C.uint16_t {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { if envconfig.Debug {
return C.uint16_t(1) return C.uint16_t(1)
} }
return C.uint16_t(0) return C.uint16_t(0)


@@ -0,0 +1,117 @@
//go:build integration

package integration

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/stretchr/testify/require"
)

func TestMaxQueue(t *testing.T) {
	// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless you're on GPU
	// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
	threadCount := 32
	mq := os.Getenv("OLLAMA_MAX_QUEUE")
	if mq != "" {
		var err error
		threadCount, err = strconv.Atoi(mq)
		require.NoError(t, err)
	} else {
		os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
	}
	req := api.GenerateRequest{
		Model:  "orca-mini",
		Prompt: "write a long historical fiction story about christopher columbus. use at least 10 facts from his actual journey",
		Options: map[string]interface{}{
			"seed":        42,
			"temperature": 0.0,
		},
	}
	resp := []string{"explore", "discover", "ocean"}

	// CPU mode takes much longer at the limit with a large queue setting
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	require.NoError(t, PullIfMissing(ctx, client, req.Model))

	// Context for the worker threads so we can shut them down
	// embedCtx, embedCancel := context.WithCancel(ctx)
	embedCtx := ctx

	var genwg sync.WaitGroup
	genwg.Add(1) // register before launching the goroutine so Wait below can't race it
	go func() {
		defer genwg.Done()
		slog.Info("Starting generate request")
		DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
		slog.Info("generate completed")
	}()

	// Give the generate a chance to get started before we start hammering on embed requests
	time.Sleep(5 * time.Millisecond)

	threadCount += 10 // Add a few extra to ensure we push the queue past its limit
	busyCount := 0
	resetByPeerCount := 0
	canceledCount := 0
	successCount := 0
	counterMu := sync.Mutex{}
	var embedwg sync.WaitGroup
	for i := 0; i < threadCount; i++ {
		embedwg.Add(1) // likewise, register before the goroutine starts
		go func(i int) {
			defer embedwg.Done()
			slog.Info("embed started", "id", i)
			embedReq := api.EmbeddingRequest{
				Model:   req.Model,
				Prompt:  req.Prompt,
				Options: req.Options,
			}
			// Fresh client for every request
			client, _ := GetTestEndpoint()
			resp, genErr := client.Embeddings(embedCtx, &embedReq)
			counterMu.Lock()
			defer counterMu.Unlock()
			switch {
			case genErr == nil:
				successCount++
				require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
			case errors.Is(genErr, context.Canceled):
				canceledCount++
			case strings.Contains(genErr.Error(), "busy"):
				busyCount++
			case strings.Contains(genErr.Error(), "connection reset by peer"):
				resetByPeerCount++
			default:
				require.NoError(t, genErr, "%d request failed", i)
			}
			slog.Info("embed finished", "id", i)
		}(i)
	}
	genwg.Wait()
	slog.Info("generate done, waiting for embeds")
	embedwg.Wait()

	require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
	require.True(t, busyCount > 0, "no requests hit busy error but some should have")
	require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")

	slog.Info("embeds completed", "success", successCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
}
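Since the file carries the `integration` build tag, it only compiles when that tag is passed. A likely invocation, assuming the test sits in the repository's `integration` package directory:

```shell
# Run only this test; the long timeout accommodates CPU-only machines (flags are illustrative)
go test -tags integration -run TestMaxQueue -timeout 10m ./integration/
```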


@@ -1186,8 +1186,6 @@ struct llama_server_context
{"model", params.model_alias}, {"model", params.model_alias},
{"tokens_predicted", slot.n_decoded}, {"tokens_predicted", slot.n_decoded},
{"tokens_evaluated", slot.n_prompt_tokens}, {"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)},
{"prompt", slot.prompt},
{"truncated", slot.truncated}, {"truncated", slot.truncated},
{"stopped_eos", slot.stopped_eos}, {"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word}, {"stopped_word", slot.stopped_word},


@@ -3,12 +3,11 @@ package llm
 import (
 	"fmt"
 	"log/slog"
-	"os"
-	"strconv"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -50,15 +49,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
-	}
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
+	}

 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))


@@ -0,0 +1,24 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4..b43f892d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+ }
+ ggml_set_name(embeddings, "embeddings");
+ ggml_set_input(embeddings);
+
+ if (ctx->has_class_embedding) {
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
- ggml_set_name(embeddings, "embeddings");
- ggml_set_input(embeddings);
-
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");


@@ -26,6 +26,7 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/server/envconfig"
) )
type LlamaServer interface { type LlamaServer interface {
@@ -124,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -145,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--embedding", "--embedding",
} }
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { if envconfig.Debug {
params = append(params, "--log-format", "json") params = append(params, "--log-format", "json")
} else { } else {
params = append(params, "--log-disable") params = append(params, "--log-disable")
@@ -155,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}

-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}
@@ -193,16 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}

-	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-	numParallel := 1
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		numParallel, err = strconv.Atoi(onp)
-		if err != nil || numParallel <= 0 {
-			err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-			slog.Error("misconfiguration", "error", err)
-			return nil, err
-		}
+	numParallel := envconfig.NumParallel
+
+	// TODO (jmorganca): multimodal models don't support parallel yet
+	// see https://github.com/ollama/ollama/issues/4165
+	if len(projectors) > 0 {
+		numParallel = 1
+		slog.Warn("multimodal models don't support parallel requests yet")
 	}

 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

 	for i := 0; i < len(servers); i++ {

server/envconfig/config.go (new file)

@@ -0,0 +1,174 @@
package envconfig

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
)

var (
	// Set via OLLAMA_ORIGINS in the environment
	AllowOrigins []string
	// Set via OLLAMA_DEBUG in the environment
	Debug bool
	// Set via OLLAMA_LLM_LIBRARY in the environment
	LLMLibrary string
	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
	MaxRunners int
	// Set via OLLAMA_MAX_QUEUE in the environment
	MaxQueuedRequests int
	// Set via OLLAMA_MAX_VRAM in the environment
	MaxVRAM uint64
	// Set via OLLAMA_NOPRUNE in the environment
	NoPrune bool
	// Set via OLLAMA_NUM_PARALLEL in the environment
	NumParallel int
	// Set via OLLAMA_RUNNERS_DIR in the environment
	RunnersDir string
	// Set via OLLAMA_TMPDIR in the environment
	TmpDir string
)

func AsMap() map[string]string {
	return map[string]string{
		"OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
		"OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
		"OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
		"OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
		"OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
		"OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
		"OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
		"OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
		"OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
		"OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
	}
}

var defaultAllowOrigins = []string{
	"localhost",
	"127.0.0.1",
	"0.0.0.0",
}

// Clean quotes and spaces from the value
func clean(key string) string {
	return strings.Trim(os.Getenv(key), "\"' ")
}

func init() {
	// default values
	NumParallel = 1
	MaxRunners = 1
	MaxQueuedRequests = 512

	LoadConfig()
}

func LoadConfig() {
	if debug := clean("OLLAMA_DEBUG"); debug != "" {
		d, err := strconv.ParseBool(debug)
		if err == nil {
			Debug = d
		} else {
			Debug = true
		}
	}

	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
	if runtime.GOOS == "windows" && RunnersDir == "" {
		// On Windows we do not carry the payloads inside the main executable
		appExe, err := os.Executable()
		if err != nil {
			slog.Error("failed to lookup executable path", "error", err)
		}
		cwd, err := os.Getwd()
		if err != nil {
			slog.Error("failed to lookup working directory", "error", err)
		}
		var paths []string
		for _, root := range []string{filepath.Dir(appExe), cwd} {
			paths = append(paths,
				filepath.Join(root),
				filepath.Join(root, "windows-"+runtime.GOARCH),
				filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
			)
		}
		// Try a few variations to improve developer experience when building from source in the local tree
		for _, p := range paths {
			candidate := filepath.Join(p, "ollama_runners")
			_, err := os.Stat(candidate)
			if err == nil {
				RunnersDir = candidate
				break
			}
		}
		if RunnersDir == "" {
			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
		}
	}

	TmpDir = clean("OLLAMA_TMPDIR")

	userLimit := clean("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseUint(userLimit, 10, 64)
		if err != nil {
			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
		} else {
			MaxVRAM = avail
		}
	}

	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
		val, err := strconv.Atoi(onp)
		if err != nil || val <= 0 {
			slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
		} else {
			NumParallel = val
		}
	}

	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
		NoPrune = true
	}

	if origins := clean("OLLAMA_ORIGINS"); origins != "" {
		AllowOrigins = strings.Split(origins, ",")
	}
	for _, allowOrigin := range defaultAllowOrigins {
		AllowOrigins = append(AllowOrigins,
			fmt.Sprintf("http://%s", allowOrigin),
			fmt.Sprintf("https://%s", allowOrigin),
			fmt.Sprintf("http://%s:*", allowOrigin),
			fmt.Sprintf("https://%s:*", allowOrigin),
		)
	}

	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
	if maxRunners != "" {
		m, err := strconv.Atoi(maxRunners)
		if err != nil {
			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
		} else {
			MaxRunners = m
		}
	}
if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
p, err := strconv.Atoi(onp)
if err != nil || p <= 0 {
slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
} else {
MaxQueuedRequests = p
}
}
}
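Because init() calls LoadConfig, every OLLAMA_* variable is folded into these package globals before any other package's code runs. A minimal sketch of a consumer, assuming only the exported names above (the main function is illustrative, not part of this commit):

package main

import (
	"fmt"

	"github.com/ollama/ollama/server/envconfig"
)

func main() {
	// init() has already run LoadConfig by the time main starts, so the
	// globals reflect whatever OLLAMA_* variables were set in the environment.
	if envconfig.Debug {
		fmt.Println("debug logging enabled via OLLAMA_DEBUG")
	}
	// AsMap is convenient for logging the effective configuration in one place.
	for k, v := range envconfig.AsMap() {
		fmt.Printf("%s=%s\n", k, v)
	}
}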

View File

@@ -0,0 +1,20 @@
package envconfig
import (
"os"
"testing"
"github.com/stretchr/testify/require"
)
func TestConfig(t *testing.T) {
os.Setenv("OLLAMA_DEBUG", "")
LoadConfig()
require.False(t, Debug)
os.Setenv("OLLAMA_DEBUG", "false")
LoadConfig()
require.False(t, Debug)
os.Setenv("OLLAMA_DEBUG", "1")
LoadConfig()
require.True(t, Debug)
}

View File

@@ -29,7 +29,7 @@ import (
"github.com/ollama/ollama/convert" "github.com/ollama/ollama/convert"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser" "github.com/ollama/ollama/server/envconfig"
"github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model" "github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version" "github.com/ollama/ollama/version"
@@ -63,46 +63,74 @@ func (m *Model) IsEmbedding() bool {
return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
}

- func (m *Model) Commands() (cmds []parser.Command) {
- cmds = append(cmds, parser.Command{Name: "model", Args: m.ModelPath})
+ func (m *Model) String() string {
+ var modelfile model.File
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "model",
+ Args: m.ModelPath,
+ })
if m.Template != "" {
- cmds = append(cmds, parser.Command{Name: "template", Args: m.Template})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "template",
+ Args: m.Template,
+ })
}
if m.System != "" {
- cmds = append(cmds, parser.Command{Name: "system", Args: m.System})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "system",
+ Args: m.System,
+ })
}
for _, adapter := range m.AdapterPaths {
- cmds = append(cmds, parser.Command{Name: "adapter", Args: adapter})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "adapter",
+ Args: adapter,
+ })
}
for _, projector := range m.ProjectorPaths {
- cmds = append(cmds, parser.Command{Name: "projector", Args: projector})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "projector",
+ Args: projector,
+ })
}
for k, v := range m.Options {
switch v := v.(type) {
case []any:
for _, s := range v {
- cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", s)})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: k,
+ Args: fmt.Sprintf("%v", s),
+ })
}
default:
- cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", v)})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: k,
+ Args: fmt.Sprintf("%v", v),
+ })
}
}
for _, license := range m.License {
- cmds = append(cmds, parser.Command{Name: "license", Args: license})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "license",
+ Args: license,
+ })
}
for _, msg := range m.Messages {
- cmds = append(cmds, parser.Command{Name: "message", Args: fmt.Sprintf("%s %s", msg.Role, msg.Content)})
+ modelfile.Commands = append(modelfile.Commands, model.Command{
+ Name: "message",
+ Args: fmt.Sprintf("%s %s", msg.Role, msg.Content),
+ })
}

- return cmds
+ return modelfile.String()
}

type Message struct {
@@ -329,7 +357,7 @@ func realpath(mfDir, from string) string {
return abspath
}

- func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
+ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) error {
deleteMap := make(map[string]struct{})
if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
for _, layer := range append(manifest.Layers, manifest.Config) {
@@ -351,7 +379,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
params := make(map[string][]string)
fromParams := make(map[string]any)

- for _, c := range commands {
+ for _, c := range modelfile.Commands {
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)

switch c.Name {
@@ -668,7 +696,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
return err
}

- if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+ if !envconfig.NoPrune {
if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
return err
}
@@ -999,7 +1027,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
// build deleteMap to prune unused layers
deleteMap := make(map[string]struct{})

- if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+ if !envconfig.NoPrune {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err

View File

@@ -6,6 +6,7 @@ import (
"net/url" "net/url"
"os" "os"
"path/filepath" "path/filepath"
"regexp"
"strings" "strings"
) )
@@ -25,9 +26,10 @@ const (
)

var (
ErrInvalidImageFormat = errors.New("invalid image format")
ErrInvalidProtocol = errors.New("invalid protocol scheme")
ErrInsecureProtocol = errors.New("insecure protocol http")
+ ErrInvalidDigestFormat = errors.New("invalid digest format")
)

func ParseModelPath(name string) ModelPath {
@@ -149,6 +151,17 @@ func GetBlobsPath(digest string) (string, error) {
return "", err return "", err
} }
// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"
re := regexp.MustCompile(pattern)
if err != nil {
return "", err
}
if digest != "" && !re.MatchString(digest) {
return "", ErrInvalidDigestFormat
}
digest = strings.ReplaceAll(digest, ":", "-") digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(dir, "blobs", digest) path := filepath.Join(dir, "blobs", digest)
dirPath := filepath.Dir(path) dirPath := filepath.Dir(path)
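The accepted digest grammar is exactly ^sha256[:-][0-9a-fA-F]{64}$: both the manifest-style sha256: form and the on-disk sha256- form pass, and anything else is rejected with ErrInvalidDigestFormat before it can reach filepath.Join. A standalone sketch of the same check, reusing digests from the tests below (illustrative, not part of the commit):

package main

import (
	"fmt"
	"regexp"
)

// Same pattern as GetBlobsPath, compiled once for the demo.
var digestRe = regexp.MustCompile("^sha256[:-][0-9a-fA-F]{64}$")

func main() {
	for _, d := range []string{
		"sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9", // valid, colon form
		"sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9", // valid, dash form
		"sha256-45640291", // too short
		"../sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7a", // traversal attempt
	} {
		fmt.Println(digestRe.MatchString(d), d)
	}
}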

View File

@@ -1,6 +1,73 @@
package server

- import "testing"
+ import (
+ "os"
+ "path/filepath"
+ "testing"

+ "github.com/stretchr/testify/assert"
+ )
func TestGetBlobsPath(t *testing.T) {
// GetBlobsPath expects an actual directory to exist
dir, err := os.MkdirTemp("", "ollama-test")
assert.Nil(t, err)
defer os.RemoveAll(dir)
tests := []struct {
name string
digest string
expected string
err error
}{
{
"empty digest",
"",
filepath.Join(dir, "blobs"),
nil,
},
{
"valid with colon",
"sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
nil,
},
{
"valid with dash",
"sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9",
filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"),
nil,
},
{
"digest too short",
"sha256-45640291",
"",
ErrInvalidDigestFormat,
},
{
"digest too long",
"sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9aaaaaaaaaa",
"",
ErrInvalidDigestFormat,
},
{
"digest invalid chars",
"../sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7a",
"",
ErrInvalidDigestFormat,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("OLLAMA_MODELS", dir)
got, err := GetBlobsPath(tc.digest)
assert.ErrorIs(t, tc.err, err, tc.name)
assert.Equal(t, tc.expected, got, tc.name)
})
}
}
func TestParseModelPath(t *testing.T) {
tests := []struct {

View File

@@ -1,6 +1,7 @@
package server

import (
+ "cmp"
"context"
"encoding/json"
"errors"
@@ -28,7 +29,7 @@ import (
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai"
- "github.com/ollama/ollama/parser"
+ "github.com/ollama/ollama/server/envconfig"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
@@ -146,12 +147,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
select {
case runner = <-rCh:
case err = <-eCh:
- if errors.Is(err, context.Canceled) {
- c.JSON(499, gin.H{"error": "request canceled"})
- return
- }
- c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+ handleErrorResponse(c, err)
return
}
@@ -394,12 +390,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
select {
case runner = <-rCh:
case err = <-eCh:
- if errors.Is(err, context.Canceled) {
- c.JSON(499, gin.H{"error": "request canceled"})
- return
- }
- c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+ handleErrorResponse(c, err)
return
}
@@ -522,28 +513,17 @@ func (s *Server) PushModelHandler(c *gin.Context) {
func (s *Server) CreateModelHandler(c *gin.Context) {
var req api.CreateRequest
- err := c.ShouldBindJSON(&req)
- switch {
- case errors.Is(err, io.EOF):
+ if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
return
- case err != nil:
+ } else if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}

- var model string
- if req.Model != "" {
- model = req.Model
- } else if req.Name != "" {
- model = req.Name
- } else {
- c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
- return
- }
- if err := ParseModelPath(model).Validate(); err != nil {
- c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+ name := model.ParseName(cmp.Or(req.Model, req.Name))
+ if !name.IsValid() {
+ c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "invalid model name"})
return
}
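The rewritten handler uses cmp.Or (Go 1.22) to prefer the newer model field and fall back to the deprecated name field before validating the result with model.ParseName. A small sketch of that selection behavior (illustrative only):

package main

import (
	"cmp"
	"fmt"
)

func main() {
	// cmp.Or returns the first non-zero argument, so a populated Model field
	// wins over the deprecated Name field.
	fmt.Println(cmp.Or("", "legacy-name"))     // legacy-name
	fmt.Println(cmp.Or("new-model", "legacy")) // new-model
	// When both are empty the result is "", ParseName yields an invalid
	// Name, and the handler answers 400.
	fmt.Printf("%q\n", cmp.Or("", ""))
}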
@@ -552,19 +532,19 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
return
}

- var modelfile io.Reader = strings.NewReader(req.Modelfile)
+ var r io.Reader = strings.NewReader(req.Modelfile)
if req.Path != "" && req.Modelfile == "" {
- mf, err := os.Open(req.Path)
+ f, err := os.Open(req.Path)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
return
}
- defer mf.Close()
- modelfile = mf
+ defer f.Close()
+ r = f
}

- commands, err := parser.Parse(modelfile)
+ modelfile, err := model.ParseFile(r)
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
@@ -580,7 +560,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()

- if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
+ if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), req.Quantization, modelfile, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -732,7 +712,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"") fmt.Fprintln(&sb, "# Modelfile generate by \"ollama show\"")
fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:") fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName) fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
fmt.Fprint(&sb, parser.Format(model.Commands())) fmt.Fprint(&sb, model.String())
resp.Modelfile = sb.String() resp.Modelfile = sb.String()
return resp, nil return resp, nil
@@ -880,12 +860,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
c.Status(http.StatusCreated)
}

- var defaultAllowOrigins = []string{
- "localhost",
- "127.0.0.1",
- "0.0.0.0",
- }

func isLocalIP(ip netip.Addr) bool {
if interfaces, err := net.Interfaces(); err == nil {
for _, iface := range interfaces {
@@ -969,19 +943,7 @@ func (s *Server) GenerateRoutes() http.Handler {
config := cors.DefaultConfig()
config.AllowWildcard = true
config.AllowBrowserExtensions = true
+ config.AllowOrigins = envconfig.AllowOrigins
- if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
- config.AllowOrigins = strings.Split(allowedOrigins, ",")
- }
- for _, allowOrigin := range defaultAllowOrigins {
- config.AllowOrigins = append(config.AllowOrigins,
- fmt.Sprintf("http://%s", allowOrigin),
- fmt.Sprintf("https://%s", allowOrigin),
- fmt.Sprintf("http://%s:*", allowOrigin),
- fmt.Sprintf("https://%s:*", allowOrigin),
- )
- }

r := gin.Default()
r.Use(
@@ -1020,10 +982,11 @@ func (s *Server) GenerateRoutes() http.Handler {
func Serve(ln net.Listener) error {
level := slog.LevelInfo
- if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+ if envconfig.Debug {
level = slog.LevelDebug
}

+ slog.Info("server config", "env", envconfig.AsMap())
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
@@ -1047,7 +1010,7 @@ func Serve(ln net.Listener) error {
return err
}

- if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+ if !envconfig.NoPrune {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
@@ -1223,12 +1186,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
select {
case runner = <-rCh:
case err = <-eCh:
- if errors.Is(err, context.Canceled) {
- c.JSON(499, gin.H{"error": "request canceled"})
- return
- }
- c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+ handleErrorResponse(c, err)
return
}
@@ -1349,3 +1307,15 @@ func (s *Server) ChatHandler(c *gin.Context) {
streamResponse(c, ch)
}
func handleErrorResponse(c *gin.Context, err error) {
if errors.Is(err, context.Canceled) {
c.JSON(499, gin.H{"error": "request canceled"})
return
}
if errors.Is(err, ErrMaxQueue) {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
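handleErrorResponse centralizes the error-to-status mapping that the generate, embeddings, and chat handlers previously duplicated; the new ErrMaxQueue branch is what turns a full scheduler queue into a 503 rather than a generic 500. A standalone distillation of the mapping (statusFor is a hypothetical name, not part of the diff):

package main

import (
	"context"
	"errors"
	"fmt"
	"net/http"
)

var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")

// statusFor distills handleErrorResponse's branching into a pure function.
func statusFor(err error) int {
	switch {
	case errors.Is(err, context.Canceled):
		return 499 // non-standard "client closed request" status
	case errors.Is(err, ErrMaxQueue):
		return http.StatusServiceUnavailable // 503: back off and retry
	default:
		return http.StatusInternalServerError
	}
}

func main() {
	fmt.Println(statusFor(context.Canceled))   // 499
	fmt.Println(statusFor(ErrMaxQueue))        // 503
	fmt.Println(statusFor(errors.New("boom"))) // 500
}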

View File

@@ -17,7 +17,7 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/parser" "github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version" "github.com/ollama/ollama/version"
) )
@@ -55,13 +55,13 @@ func Test_Routes(t *testing.T) {
createTestModel := func(t *testing.T, name string) {
fname := createTestFile(t, "ollama-model")

- modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
- commands, err := parser.Parse(modelfile)
+ r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
+ modelfile, err := model.ParseFile(r)
assert.Nil(t, err)
fn := func(resp api.ProgressResponse) {
t.Logf("Status: %s", resp.Status)
}

- err = CreateModel(context.TODO(), name, "", "", commands, fn)
+ err = CreateModel(context.TODO(), name, "", "", modelfile, fn)
assert.Nil(t, err)
}

View File

@@ -5,10 +5,8 @@ import (
"errors" "errors"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"reflect" "reflect"
"sort" "sort"
"strconv"
"strings" "strings"
"sync" "sync"
"time" "time"
@@ -17,6 +15,7 @@ import (
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/server/envconfig"
"golang.org/x/exp/slices" "golang.org/x/exp/slices"
) )
@@ -43,35 +42,14 @@ type Scheduler struct {
getGpuFn func() gpu.GpuInfoList
}

- // TODO set this to zero after a release or two, to enable multiple models by default
- var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
- var maxQueuedRequests = 10 // TODO configurable
- var numParallel = 1
+ var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")

func InitScheduler(ctx context.Context) *Scheduler {
- maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
- if maxRunners != "" {
- m, err := strconv.Atoi(maxRunners)
- if err != nil {
- slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
- } else {
- loadedMax = m
- }
- }
- if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
- p, err := strconv.Atoi(onp)
- if err != nil || p <= 0 {
- slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
- } else {
- numParallel = p
- }
- }
sched := &Scheduler{
- pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
- finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
- expiredCh: make(chan *runnerRef, maxQueuedRequests),
- unloadedCh: make(chan interface{}, maxQueuedRequests),
+ pendingReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+ finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+ expiredCh: make(chan *runnerRef, envconfig.MaxQueuedRequests),
+ unloadedCh: make(chan interface{}, envconfig.MaxQueuedRequests),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo,
@@ -82,6 +60,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+ // allocate a large enough kv cache for all parallel requests
+ opts.NumCtx = opts.NumCtx * envconfig.NumParallel
req := &LlmRequest{
ctx: c,
model: model,
@@ -90,12 +71,11 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
successCh: make(chan *runnerRef),
errCh: make(chan error, 1),
}

- // context split across parallel threads
- opts.NumCtx = opts.NumCtx * numParallel
select {
case s.pendingReqCh <- req:
default:
- req.errCh <- fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
+ req.errCh <- ErrMaxQueue
}
return req.successCh, req.errCh
}
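Moving the multiplication into GetRunner makes the sizing rule explicit: each of the NumParallel slots gets a full context, so the runner allocates one kv cache of NumCtx times NumParallel tokens. With illustrative numbers (not defaults from this diff):

package main

import "fmt"

func main() {
	// Illustrative numbers only: a 2048-token context served to 4 parallel
	// slots needs a single 8192-token kv cache allocated up front.
	numCtx, numParallel := 2048, 4
	fmt.Println(numCtx * numParallel) // 8192
}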
@@ -134,11 +114,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
- } else if loadedMax > 0 && loadedCount >= loadedMax {
+ } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
- runnerToExpire = s.findRunnerToUnload(pending)
+ runnerToExpire = s.findRunnerToUnload()
} else {
- // Either no models are loaded or below loadedMax
+ // Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list
gpus := s.getGpuFn()
@@ -149,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}

- // If we're CPU only mode, just limit by loadedMax above
+ // If we're CPU only mode, just limit by envconfig.MaxRunners above
// TODO handle system memory exhaustion
if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
slog.Debug("cpu mode with existing models, loading")
@@ -177,7 +157,7 @@
s.loadFn(pending, ggml, gpus)
break
}
- runnerToExpire = s.findRunnerToUnload(pending)
+ runnerToExpire = s.findRunnerToUnload()
}

if runnerToExpire == nil {
@@ -277,9 +257,9 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
continue
}

- s.loadedMu.Lock()
slog.Debug("got lock to unload", "model", runner.model)
runner.unload()
+ s.loadedMu.Lock()
delete(s.loaded, runner.model)
s.loadedMu.Unlock()
slog.Debug("runner released", "model", runner.model)
@@ -524,7 +504,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
}

// findRunnerToUnload finds a runner to unload to make room for a new model
- func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef {
+ func (s *Scheduler) findRunnerToUnload() *runnerRef {
s.loadedMu.Lock()
runnerList := make([]*runnerRef, 0, len(s.loaded))
for _, r := range s.loaded {

View File

@@ -15,6 +15,7 @@ import (
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/server/envconfig"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
@@ -27,38 +28,14 @@ func init() {
func TestInitScheduler(t *testing.T) {
ctx, done := context.WithCancel(context.Background())
defer done()
- initialMax := loadedMax
- initialParallel := numParallel
s := InitScheduler(ctx)
- require.Equal(t, initialMax, loadedMax)
s.loadedMu.Lock()
require.NotNil(t, s.loaded)
s.loadedMu.Unlock()

- os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
- s = InitScheduler(ctx)
- require.Equal(t, initialMax, loadedMax)
- s.loadedMu.Lock()
- require.NotNil(t, s.loaded)
- s.loadedMu.Unlock()

- os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
- s = InitScheduler(ctx)
- require.Equal(t, 0, loadedMax)
- s.loadedMu.Lock()
- require.NotNil(t, s.loaded)
- s.loadedMu.Unlock()

- os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
- _ = InitScheduler(ctx)
- require.Equal(t, initialParallel, numParallel)

- os.Setenv("OLLAMA_NUM_PARALLEL", "10")
- _ = InitScheduler(ctx)
- require.Equal(t, 10, numParallel)
}
func TestLoad(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
defer done()
s := InitScheduler(ctx)
var ggml *llm.GGML // value not used in tests
@@ -174,7 +151,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
}

func TestRequests(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer done()

// Same model, same request
@@ -249,7 +226,7 @@ func TestRequests(t *testing.T) {
t.Errorf("timeout") t.Errorf("timeout")
} }
loadedMax = 1 envconfig.MaxRunners = 1
s.newServerFn = scenario3a.newServer s.newServerFn = scenario3a.newServer
slog.Info("scenario3a") slog.Info("scenario3a")
s.pendingReqCh <- scenario3a.req s.pendingReqCh <- scenario3a.req
@@ -268,7 +245,7 @@ func TestRequests(t *testing.T) {
require.Len(t, s.loaded, 1) require.Len(t, s.loaded, 1)
s.loadedMu.Unlock() s.loadedMu.Unlock()
loadedMax = 0 envconfig.MaxRunners = 0
s.newServerFn = scenario3b.newServer s.newServerFn = scenario3b.newServer
slog.Info("scenario3b") slog.Info("scenario3b")
s.pendingReqCh <- scenario3b.req s.pendingReqCh <- scenario3b.req
@@ -329,7 +306,7 @@ func TestRequests(t *testing.T) {
}

func TestGetRunner(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()

// Same model, same request
@@ -339,7 +316,7 @@ func TestGetRunner(t *testing.T) {
scenario1b.req.sessionDuration = 0
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
scenario1c.req.sessionDuration = 0
- maxQueuedRequests = 1
+ envconfig.MaxQueuedRequests = 1
s := InitScheduler(ctx)
s.getGpuFn = func() gpu.GpuInfoList {
g := gpu.GpuInfo{Library: "metal"}
@@ -391,7 +368,7 @@ func TestGetRunner(t *testing.T) {
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer done()

// Same model, same request
@@ -436,7 +413,7 @@ func TestPrematureExpired(t *testing.T) {
}

func TestUseLoadedRunner(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
req := &LlmRequest{
ctx: ctx,
opts: api.DefaultOptions(),
@@ -461,7 +438,7 @@ func TestUseLoadedRunner(t *testing.T) {
}

func TestUpdateFreeSpace(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
gpus := gpu.GpuInfoList{
{
@@ -494,12 +471,9 @@ func TestUpdateFreeSpace(t *testing.T) {
}

func TestFindRunnerToUnload(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
- req := &LlmRequest{
- ctx: ctx,
- opts: api.DefaultOptions(),
- }

r1 := &runnerRef{refCount: 1, sessionDuration: 1}
r2 := &runnerRef{sessionDuration: 2}
@@ -509,16 +483,16 @@
s.loaded["b"] = r2
s.loadedMu.Unlock()

- resp := s.findRunnerToUnload(req)
+ resp := s.findRunnerToUnload()
require.Equal(t, r2, resp)
r2.refCount = 1
- resp = s.findRunnerToUnload(req)
+ resp = s.findRunnerToUnload()
require.Equal(t, r1, resp)
}
func TestNeedsReload(t *testing.T) { func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond) ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done() defer done()
llm := &mockLlm{} llm := &mockLlm{}
@@ -562,7 +536,7 @@ func TestNeedsReload(t *testing.T) {
} }
func TestUnloadAllRunners(t *testing.T) {
- ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
+ ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()

llm1 := &mockLlm{}

View File

@@ -1,4 +1,4 @@
- package parser
+ package model

import (
"bufio"
@@ -10,11 +10,41 @@ import (
"strings" "strings"
) )
type File struct {
Commands []Command
}
func (f File) String() string {
var sb strings.Builder
for _, cmd := range f.Commands {
fmt.Fprintln(&sb, cmd.String())
}
return sb.String()
}
type Command struct {
Name string
Args string
}
func (c Command) String() string {
var sb strings.Builder
switch c.Name {
case "model":
fmt.Fprintf(&sb, "FROM %s", c.Args)
case "license", "template", "system", "adapter":
fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
case "message":
role, message, _ := strings.Cut(c.Args, ": ")
fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
default:
fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
}
return sb.String()
}
type state int

const (
@@ -32,38 +62,14 @@ var (
errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
)
- func Format(cmds []Command) string {
- var sb strings.Builder
- for _, cmd := range cmds {
- name := cmd.Name
- args := cmd.Args
- switch cmd.Name {
- case "model":
- name = "from"
- args = cmd.Args
- case "license", "template", "system", "adapter":
- args = quote(args)
- case "message":
- role, message, _ := strings.Cut(cmd.Args, ": ")
- args = role + " " + quote(message)
- default:
- name = "parameter"
- args = cmd.Name + " " + quote(cmd.Args)
- }
- fmt.Fprintln(&sb, strings.ToUpper(name), args)
- }
- return sb.String()
- }

- func Parse(r io.Reader) (cmds []Command, err error) {
+ func ParseFile(r io.Reader) (*File, error) {
var cmd Command
var curr state
var b bytes.Buffer
var role string
+ var f File

br := bufio.NewReader(r)
for {
r, _, err := br.ReadRune()
@@ -128,7 +134,7 @@ func Parse(r io.Reader) (cmds []Command, err error) {
}

cmd.Args = s
- cmds = append(cmds, cmd)
+ f.Commands = append(f.Commands, cmd)

b.Reset()
@@ -157,14 +163,14 @@ func Parse(r io.Reader) (cmds []Command, err error) {
}

cmd.Args = s
- cmds = append(cmds, cmd)
+ f.Commands = append(f.Commands, cmd)
default:
return nil, io.ErrUnexpectedEOF
}

- for _, cmd := range cmds {
+ for _, cmd := range f.Commands {
if cmd.Name == "model" {
- return cmds, nil
+ return &f, nil
}
}
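With ParseFile returning a *File and File.String() rendering it back to Modelfile syntax, parsing becomes round-trippable, which TestParseFileFormatParseFile below exercises. A minimal sketch, assuming the types/model import path laid out in this commit:

package main

import (
	"fmt"
	"strings"

	"github.com/ollama/ollama/types/model"
)

func main() {
	in := "FROM foo\nPARAMETER temperature 0.7\nMESSAGE user Hey there!\n"
	f, err := model.ParseFile(strings.NewReader(in))
	if err != nil {
		panic(err)
	}
	// String() renders each Command back to Modelfile syntax, so feeding the
	// output to ParseFile again yields an equal File.
	fmt.Print(f.String())
}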

View File

@@ -1,4 +1,4 @@
- package parser
+ package model

import (
"bytes"
@@ -10,7 +10,7 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
) )
- func TestParser(t *testing.T) {
+ func TestParseFileFile(t *testing.T) {
input := `
FROM model1
ADAPTER adapter1
@@ -22,8 +22,8 @@ TEMPLATE template1
reader := strings.NewReader(input)

- commands, err := Parse(reader)
- assert.Nil(t, err)
+ modelfile, err := ParseFile(reader)
+ assert.NoError(t, err)

expectedCommands := []Command{
{Name: "model", Args: "model1"},
@@ -34,10 +34,10 @@ TEMPLATE template1
{Name: "template", Args: "template1"},
}

- assert.Equal(t, expectedCommands, commands)
+ assert.Equal(t, expectedCommands, modelfile.Commands)
}

- func TestParserFrom(t *testing.T) {
+ func TestParseFileFrom(t *testing.T) {
var cases = []struct {
input string
expected []Command
@@ -85,14 +85,16 @@ func TestParserFrom(t *testing.T) {
for _, c := range cases {
t.Run("", func(t *testing.T) {
- commands, err := Parse(strings.NewReader(c.input))
+ modelfile, err := ParseFile(strings.NewReader(c.input))
assert.ErrorIs(t, err, c.err)
- assert.Equal(t, c.expected, commands)
+ if modelfile != nil {
+ assert.Equal(t, c.expected, modelfile.Commands)
+ }
})
}
}

- func TestParserParametersMissingValue(t *testing.T) {
+ func TestParseFileParametersMissingValue(t *testing.T) {
input := `
FROM foo
PARAMETER param1
@@ -100,21 +102,21 @@ PARAMETER param1
reader := strings.NewReader(input)

- _, err := Parse(reader)
+ _, err := ParseFile(reader)
assert.ErrorIs(t, err, io.ErrUnexpectedEOF)
}

- func TestParserBadCommand(t *testing.T) {
+ func TestParseFileBadCommand(t *testing.T) {
input := `
FROM foo
BADCOMMAND param1 value1
`
- _, err := Parse(strings.NewReader(input))
+ _, err := ParseFile(strings.NewReader(input))
assert.ErrorIs(t, err, errInvalidCommand)
}

- func TestParserMessages(t *testing.T) {
+ func TestParseFileMessages(t *testing.T) {
var cases = []struct {
input string
expected []Command
@@ -123,34 +125,34 @@ func TestParserMessages(t *testing.T) {
{
`
FROM foo
- MESSAGE system You are a Parser. Always Parse things.
+ MESSAGE system You are a file parser. Always parse things.
`,
[]Command{
{Name: "model", Args: "foo"},
- {Name: "message", Args: "system: You are a Parser. Always Parse things."},
+ {Name: "message", Args: "system: You are a file parser. Always parse things."},
},
nil,
},
{
`
FROM foo
- MESSAGE system You are a Parser. Always Parse things.`,
+ MESSAGE system You are a file parser. Always parse things.`,
[]Command{
{Name: "model", Args: "foo"},
- {Name: "message", Args: "system: You are a Parser. Always Parse things."},
+ {Name: "message", Args: "system: You are a file parser. Always parse things."},
},
nil,
},
{
`
FROM foo
- MESSAGE system You are a Parser. Always Parse things.
+ MESSAGE system You are a file parser. Always parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
[]Command{
{Name: "model", Args: "foo"},
- {Name: "message", Args: "system: You are a Parser. Always Parse things."},
+ {Name: "message", Args: "system: You are a file parser. Always parse things."},
{Name: "message", Args: "user: Hey there!"},
{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
},
@@ -160,12 +162,12 @@ MESSAGE assistant Hello, I want to parse all the things!
`
FROM foo
MESSAGE system """
- You are a multiline Parser. Always Parse things.
+ You are a multiline file parser. Always parse things.
"""
`,
[]Command{
{Name: "model", Args: "foo"},
- {Name: "message", Args: "system: \nYou are a multiline Parser. Always Parse things.\n"},
+ {Name: "message", Args: "system: \nYou are a multiline file parser. Always parse things.\n"},
},
nil,
},
@@ -196,14 +198,16 @@ MESSAGE system`,
for _, c := range cases {
t.Run("", func(t *testing.T) {
- commands, err := Parse(strings.NewReader(c.input))
+ modelfile, err := ParseFile(strings.NewReader(c.input))
assert.ErrorIs(t, err, c.err)
- assert.Equal(t, c.expected, commands)
+ if modelfile != nil {
+ assert.Equal(t, c.expected, modelfile.Commands)
+ }
})
}
}

- func TestParserQuoted(t *testing.T) {
+ func TestParseFileQuoted(t *testing.T) {
var cases = []struct {
multiline string
expected []Command
@@ -348,14 +352,16 @@ TEMPLATE """
for _, c := range cases {
t.Run("", func(t *testing.T) {
- commands, err := Parse(strings.NewReader(c.multiline))
+ modelfile, err := ParseFile(strings.NewReader(c.multiline))
assert.ErrorIs(t, err, c.err)
- assert.Equal(t, c.expected, commands)
+ if modelfile != nil {
+ assert.Equal(t, c.expected, modelfile.Commands)
+ }
})
}
}

- func TestParserParameters(t *testing.T) {
+ func TestParseFileParameters(t *testing.T) {
var cases = map[string]struct {
name, value string
}{
@@ -404,18 +410,18 @@ func TestParserParameters(t *testing.T) {
var b bytes.Buffer
fmt.Fprintln(&b, "FROM foo")
fmt.Fprintln(&b, "PARAMETER", k)

- commands, err := Parse(&b)
- assert.Nil(t, err)
+ modelfile, err := ParseFile(&b)
+ assert.NoError(t, err)

assert.Equal(t, []Command{
{Name: "model", Args: "foo"},
{Name: v.name, Args: v.value},
- }, commands)
+ }, modelfile.Commands)
})
}
}

- func TestParserComments(t *testing.T) {
+ func TestParseFileComments(t *testing.T) {
var cases = []struct {
input string
expected []Command
@@ -433,14 +439,14 @@ FROM foo
for _, c := range cases {
t.Run("", func(t *testing.T) {
- commands, err := Parse(strings.NewReader(c.input))
- assert.Nil(t, err)
- assert.Equal(t, c.expected, commands)
+ modelfile, err := ParseFile(strings.NewReader(c.input))
+ assert.NoError(t, err)
+ assert.Equal(t, c.expected, modelfile.Commands)
})
}
}

- func TestParseFormatParse(t *testing.T) {
+ func TestParseFileFormatParseFile(t *testing.T) {
var cases = []string{
`
FROM foo
@@ -449,7 +455,7 @@ LICENSE MIT
PARAMETER param1 value1
PARAMETER param2 value2
TEMPLATE template1
- MESSAGE system You are a Parser. Always Parse things.
+ MESSAGE system You are a file parser. Always parse things.
MESSAGE user Hey there!
MESSAGE assistant Hello, I want to parse all the things!
`,
@@ -488,13 +494,13 @@ MESSAGE assistant Hello, I want to parse all the things!
for _, c := range cases {
t.Run("", func(t *testing.T) {
- commands, err := Parse(strings.NewReader(c))
+ modelfile, err := ParseFile(strings.NewReader(c))
assert.NoError(t, err)

- commands2, err := Parse(strings.NewReader(Format(commands)))
+ modelfile2, err := ParseFile(strings.NewReader(modelfile.String()))
assert.NoError(t, err)

- assert.Equal(t, commands, commands2)
+ assert.Equal(t, modelfile, modelfile2)
})
}

View File

@@ -161,7 +161,7 @@ func ParseNameBare(s string) Name {
}

scheme, host, ok := strings.Cut(s, "://")
- if ! ok {
+ if !ok {
host = scheme
}

n.Host = host