diff --git a/README.md b/README.md index 6157e7ed..b5813d20 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,8 @@ Here are some example models that can be downloaded: | LLaVA | 7B | 4.5GB | `ollama run llava` | | Solar | 10.7B | 6.1GB | `ollama run solar` | -> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. +> [!NOTE] +> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. ## Customize a model @@ -314,6 +315,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) - [AI Studio](https://github.com/MindWorkAI/AI-Studio) - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client) +- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows) ### Terminal diff --git a/docs/api.md b/docs/api.md index 4381c376..2d4fe28f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -40,6 +40,7 @@ Generate a response for a given prompt with a provided model. This is a streamin - `model`: (required) the [model name](#model-names) - `prompt`: the prompt to generate a response for +- `suffix`: the text after the model response - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`) Advanced parameters (optional): @@ -57,7 +58,8 @@ Advanced parameters (optional): Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below. -> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace. +> [!IMPORTANT] +> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace. ### Examples @@ -148,8 +150,44 @@ If `stream` is set to `false`, the response will be a single JSON object: } ``` +#### Request (with suffix) + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "codellama:code", + "prompt": "def compute_gcd(a, b):", + "suffix": " return result", + "options": { + "temperature": 0 + }, + "stream": false +}' +``` + +##### Response + +```json +{ + "model": "codellama:code", + "created_at": "2024-07-22T20:47:51.147561Z", + "response": "\n if a == 0:\n return b\n else:\n return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n result = (a * b) / compute_gcd(a, b)\n", + "done": true, + "done_reason": "stop", + "context": [...], + "total_duration": 1162761250, + "load_duration": 6683708, + "prompt_eval_count": 17, + "prompt_eval_duration": 201222000, + "eval_count": 63, + "eval_duration": 953997000 +} +``` + #### Request (JSON mode) +> [!IMPORTANT] > When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON. ##### Request @@ -380,12 +418,14 @@ Generate the next message in a chat with a provided model. This is a streaming e - `model`: (required) the [model name](#model-names) - `messages`: the messages of the chat, this can be used to keep a chat memory +- `tools`: tools for the model to use if supported. 
Requires `stream` to be set to `false` The `message` object has the following fields: -- `role`: the role of the message, either `system`, `user` or `assistant` +- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool` - `content`: the content of the message - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) +- `tool_calls` (optional): a list of tools the model wants to use Advanced parameters (optional): @@ -622,6 +662,79 @@ curl http://localhost:11434/api/chat -d '{ } ``` +#### Chat request (with tools) + +##### Request + +``` +curl http://localhost:11434/api/chat -d '{ + "model": "mistral", + "messages": [ + { + "role": "user", + "content": "What is the weather today in Paris?" + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location", "format"] + } + } + } + ] +}' +``` + +##### Response + +```json +{ + "model": "mistral:7b-instruct-v0.3-q4_K_M", + "created_at": "2024-07-22T20:33:28.123648Z", + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_current_weather", + "arguments": { + "format": "celsius", + "location": "Paris, FR" + } + } + } + ] + }, + "done_reason": "stop", + "done": true, + "total_duration": 885095291, + "load_duration": 3753500, + "prompt_eval_count": 122, + "prompt_eval_duration": 328493000, + "eval_count": 33, + "eval_duration": 552222000 +} +``` + ## Create a Model ```shell @@ -1173,4 +1286,4 @@ curl http://localhost:11434/api/embeddings -d '{ 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 ] } -``` \ No newline at end of file +``` diff --git a/docs/modelfile.md b/docs/modelfile.md index 21ee1826..c3645b06 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -1,6 +1,7 @@ # Ollama Model File -> Note: `Modelfile` syntax is in development +> [!NOTE] +> `Modelfile` syntax is in development A model file is the blueprint to create and share models with Ollama. diff --git a/docs/openai.md b/docs/openai.md index 248ba74a..fee30f71 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -78,8 +78,8 @@ curl http://localhost:11434/v1/chat/completions \ - [x] Streaming - [x] JSON mode - [x] Reproducible outputs +- [x] Tools (streaming support coming soon) - [ ] Vision -- [ ] Function calling - [ ] Logprobs #### Supported request fields @@ -97,9 +97,9 @@ curl http://localhost:11434/v1/chat/completions \ - [x] `temperature` - [x] `top_p` - [x] `max_tokens` -- [ ] `logit_bias` -- [ ] `tools` +- [x] `tools` - [ ] `tool_choice` +- [ ] `logit_bias` - [ ] `user` - [ ] `n` diff --git a/docs/template.md b/docs/template.md new file mode 100644 index 00000000..f6ce06ba --- /dev/null +++ b/docs/template.md @@ -0,0 +1,173 @@ +# Template + +Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models. 
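+
+Because the engine is Go's standard [`text/template`](https://pkg.go.dev/text/template) package, the same layouts can be rendered outside of Ollama as well. The following is a minimal, self-contained Go sketch (not part of Ollama's code; the `Message` struct is illustrative only) that renders a simple chat layout like the one shown in the next section:
+
+```go
+package main
+
+import (
+	"os"
+	"text/template"
+)
+
+// Message is an illustrative stand-in for a chat message with a role and content.
+type Message struct {
+	Role    string
+	Content string
+}
+
+func main() {
+	// The same layout used in the "Basic Template Structure" example below.
+	tmpl := template.Must(template.New("chat").Parse(
+		"{{- range .Messages }}\n{{ .Role }}: {{ .Content }}\n{{- end }}\n"))
+
+	data := struct{ Messages []Message }{
+		Messages: []Message{
+			{Role: "system", Content: "You are a helpful assistant."},
+			{Role: "user", Content: "Why is the sky blue?"},
+		},
+	}
+
+	// Writes each message on its own line, e.g. "user: Why is the sky blue?".
+	if err := tmpl.Execute(os.Stdout, data); err != nil {
+		panic(err)
+	}
+}
+```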
+ +## Basic Template Structure + +A basic Go template consists of three main parts: + +* **Layout**: The overall structure of the template. +* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered. +* **Functions**: Custom functions or logic that can be used to manipulate the template's content. + +Here's an example of a simple chat template: + +```gotmpl +{{- range .Messages }} +{{ .Role }}: {{ .Content }} +{{- end }} +``` + +In this example, we have: + +* A basic messages structure (layout) +* Three variables: `Messages`, `Role`, and `Content` (variables) +* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item + +## Adding templates to your model + +By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models. + +Omitting a template in these models puts the responsibility of correctly templating input onto the user. Adding a template allows users to easily get the best results from the model. + +To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. + +```dockerfile +FROM llama3 + +TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|> +{{- end }} +{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|> + +{{ .Content }}<|eot_id|> +{{- end }}<|start_header_id|>assistant<|end_header_id|> + +""" +``` + +## Variables + +`System` (string): system prompt + +`Prompt` (string): user prompt + +`Response` (string): assistant response + +`Suffix` (string): text inserted after the assistant's response + +`Messages` (list): list of messages + +`Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool` + +`Messages[].Content` (string): message content + +`Messages[].ToolCalls` (list): list of tools the model wants to call + +`Messages[].ToolCalls[].Function` (object): function to call + +`Messages[].ToolCalls[].Function.Name` (string): function name + +`Messages[].ToolCalls[].Function.Arguments` (map): mapping of argument name to argument value + +`Tools` (list): list of tools the model can access + +`Tools[].Type` (string): schema type. `type` is always `function` + +`Tools[].Function` (object): function definition + +`Tools[].Function.Name` (string): function name + +`Tools[].Function.Description` (string): function description + +`Tools[].Function.Parameters` (object): function parameters + +`Tools[].Function.Parameters.Type` (string): schema type. 
`type` is always `object` + +`Tools[].Function.Parameters.Required` (list): list of required properties + +`Tools[].Function.Parameters.Properties` (map): mapping of property name to property definition + +`Tools[].Function.Parameters.Properties[].Type` (string): property type + +`Tools[].Function.Parameters.Properties[].Description` (string): property description + +`Tools[].Function.Parameters.Properties[].Enum` (list): list of valid values + +## Tips and Best Practices + +Keep the following tips and best practices in mind when working with Go templates: + +* **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.` +* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root +* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace + +## Examples + +### Example Messages + +#### ChatML + +ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. + +```gotmpl +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }} +{{- range .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ else }} +{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +``` + +### Example Tools + +Tools support can be added to a model by adding a `{{ .Tools }}` node to the template. This feature is useful for models trained to call external tools and can a powerful tool for retrieving real-time data or performing complex tasks. + +#### Mistral + +Mistral v0.3 and Mixtral 8x22B supports tool calling. + +```gotmpl +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }} +{{- if and (le (len (slice $.Messages $index)) 2) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS] +{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }} + +{{ end }}{{ .Content }}[/INST] +{{- else if eq .Role "assistant" }} +{{- if .Content }} {{ .Content }} +{{- else if .ToolCalls }}[TOOL_CALLS] [ +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}} +{{- end }}] +{{- end }} +{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS] +{{- end }} +{{- end }} +``` + +### Example Fill-in-Middle + +Fill-in-middle support can be added to a model by adding a `{{ .Suffix }}` node to the template. This feature is useful for models that are trained to generate text in the middle of user input, such as code completion models. + +#### CodeLlama + +CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://ollama.com/library/codellama:13b-code) code completion models support fill-in-middle. + +```gotmpl +
+<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
+```
+
+> [!NOTE]
+> CodeLlama 34B and 70B code completion models, as well as all instruct and Python fine-tuned models, do not support fill-in-middle.
+
+#### Codestral
+
+Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
+
+```gotmpl
+[SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
+```
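+
+Either template is driven by the new `suffix` field on the `/api/generate` endpoint. As a usage sketch (standard library only, adapted from the curl example in [the API documentation](./api.md); the model tag is just one of the tags linked above and error handling is kept minimal), a fill-in-middle request could look like:
+
+```go
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"net/http"
+)
+
+func main() {
+	// The prompt becomes the prefix and the suffix the trailing code; the
+	// model generates the middle section between them.
+	body, _ := json.Marshal(map[string]any{
+		"model":  "codellama:7b-code",
+		"prompt": "def compute_gcd(a, b):",
+		"suffix": "    return result",
+		"stream": false,
+	})
+
+	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	var out struct {
+		Response string `json:"response"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
+		panic(err)
+	}
+
+	fmt.Println(out.Response)
+}
+```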
diff --git a/integration/embed_test.go b/integration/embed_test.go
index aeafa57b..61b36fa2 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -4,12 +4,45 @@ package integration
import (
"context"
+ "math"
"testing"
"time"
"github.com/ollama/ollama/api"
)
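+// floatsEqual32 reports whether two float32 values are equal within an absolute tolerance of 1e-4.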
+func floatsEqual32(a, b float32) bool {
+ return math.Abs(float64(a-b)) <= 1e-4
+}
+
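+// floatsEqual64 reports whether two float64 values are equal within an absolute tolerance of 1e-4.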
+func floatsEqual64(a, b float64) bool {
+ return math.Abs(a-b) <= 1e-4
+}
+
+func TestAllMiniLMEmbeddings(t *testing.T) {
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+ defer cancel()
+
+ req := api.EmbeddingRequest{
+ Model: "all-minilm",
+ Prompt: "why is the sky blue?",
+ }
+
+ res, err := embeddingTestHelper(ctx, t, req)
+
+ if err != nil {
+ t.Fatalf("error: %v", err)
+ }
+
+ if len(res.Embedding) != 384 {
+ t.Fatalf("expected 384 floats, got %d", len(res.Embedding))
+ }
+
+ if !floatsEqual64(res.Embedding[0], 0.06642947345972061) {
+ t.Fatalf("expected 0.06642947345972061, got %.16f", res.Embedding[0])
+ }
+}
+
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
@@ -33,8 +66,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
- if res.Embeddings[0][0] != 0.010071031 {
- t.Fatalf("expected 0.010071031, got %f", res.Embeddings[0][0])
+ if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
+ t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
}
}
@@ -61,12 +94,12 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
- if res.Embeddings[0][0] != 0.010071031 || res.Embeddings[1][0] != -0.009802706 {
- t.Fatalf("expected 0.010071031 and -0.009802706, got %f and %f", res.Embeddings[0][0], res.Embeddings[1][0])
+ if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
+ t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
}
}
-func TestAllMiniLmEmbedTruncate(t *testing.T) {
+func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
@@ -135,6 +168,22 @@ func TestAllMiniLmEmbedTruncate(t *testing.T) {
}
}
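+// embeddingTestHelper connects to the test server, pulls the requested model
+// if it is missing, and returns the response to a single embedding request.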
+func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+ client, _, cleanup := InitServerConnection(ctx, t)
+ defer cleanup()
+ if err := PullIfMissing(ctx, client, req.Model); err != nil {
+ t.Fatalf("failed to pull model %s: %v", req.Model, err)
+ }
+
+ response, err := client.Embeddings(ctx, &req)
+
+ if err != nil {
+ return nil, err
+ }
+
+ return response, nil
+}
+
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
diff --git a/server/download.go b/server/download.go
index d93cd3b4..8b5b577f 100644
--- a/server/download.go
+++ b/server/download.go
@@ -8,6 +8,7 @@ import (
"io"
"log/slog"
"math"
+ "math/rand/v2"
"net/http"
"net/url"
"os"
@@ -141,6 +142,32 @@ func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *regis
b.err = b.run(ctx, requestURL, opts)
}
+func newBackoff(maxBackoff time.Duration) func(ctx context.Context) error {
+ var n int
+ return func(ctx context.Context) error {
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+
+ n++
+
+ // n^2 backoff timer is a little smoother than the
+ // common choice of 2^n.
+ d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
+ // Randomize the delay to between 0.5x and 1.5x of the computed value,
+ // in order to prevent accidental "thundering herd" problems.
+ d = time.Duration(float64(d) * (rand.Float64() + 0.5))
+ t := time.NewTimer(d)
+ defer t.Stop()
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-t.C:
+ return nil
+ }
+ }
+}
+
func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
defer blobDownloadManager.Delete(b.Digest)
ctx, b.CancelFunc = context.WithCancel(ctx)
@@ -153,6 +180,52 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
_ = file.Truncate(b.Total)
+ directURL, err := func() (*url.URL, error) {
+ ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+ defer cancel()
+
+ backoff := newBackoff(10 * time.Second)
+ for {
+ // shallow clone opts to be used in the closure
+ // without affecting the outer opts.
+ newOpts := new(registryOptions)
+ *newOpts = *opts
+
+ newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
+ if len(via) > 10 {
+ return errors.New("maximum redirects exceeded (10) for directURL")
+ }
+
+ // if the hostname is the same, allow the redirect
+ if req.URL.Hostname() == requestURL.Hostname() {
+ return nil
+ }
+
+ // stop at the first redirect that is not
+ // the same hostname as the original
+ // request.
+ return http.ErrUseLastResponse
+ }
+
+ resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, nil, nil, newOpts)
+ if err != nil {
+ slog.Warn("failed to get direct URL; backing off and retrying", "err", err)
+ if err := backoff(ctx); err != nil {
+ return nil, err
+ }
+ continue
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode != http.StatusTemporaryRedirect {
+ return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+ }
+ return resp.Location()
+ }
+ }()
+ if err != nil {
+ return err
+ }
+
g, inner := errgroup.WithContext(ctx)
g.SetLimit(numDownloadParts)
for i := range b.Parts {
@@ -165,7 +238,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
var err error
for try := 0; try < maxRetries; try++ {
w := io.NewOffsetWriter(file, part.StartsAt())
- err = b.downloadChunk(inner, requestURL, w, part, opts)
+ err = b.downloadChunk(inner, directURL, w, part, opts)
switch {
case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
// return immediately if the context is canceled or the device is out of space
diff --git a/server/images.go b/server/images.go
index 574dec19..836dbcc2 100644
--- a/server/images.go
+++ b/server/images.go
@@ -54,6 +54,8 @@ type registryOptions struct {
Username string
Password string
Token string
+
+ CheckRedirect func(req *http.Request, via []*http.Request) error
}
type Model struct {
@@ -1131,7 +1133,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
req.ContentLength = contentLength
}
- resp, err := http.DefaultClient.Do(req)
+ resp, err := (&http.Client{
+ CheckRedirect: regOpts.CheckRedirect,
+ }).Do(req)
if err != nil {
return nil, err
}