From 4151ef8cf7d2f2c2dc6bd5fab77b5a45a388be29 Mon Sep 17 00:00:00 2001 From: ycomiti <94963509+ycomiti@users.noreply.github.com> Date: Tue, 22 Jul 2025 20:17:31 +0200 Subject: [PATCH 01/17] Update linux.md (#11462) --- docs/linux.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/linux.md b/docs/linux.md index 72a5ff01..0c19ef0b 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -16,7 +16,7 @@ curl -fsSL https://ollama.com/install.sh | sh Download and extract the package: ```shell -curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz +curl -LO https://ollama.com/download/ollama-linux-amd64.tgz sudo tar -C /usr -xzf ollama-linux-amd64.tgz ``` From 3bac5cba60b08afb1164611dac3b710583f3b241 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Tue, 22 Jul 2025 13:40:47 -0700 Subject: [PATCH 02/17] Fix GetModelInfo (#11496) --------- Co-authored-by: Richard Lyons --- cmd/interactive.go | 21 ++++++++++++--------- server/routes.go | 7 +++++-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cmd/interactive.go b/cmd/interactive.go index a285b365..08ab4947 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -385,18 +385,21 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { case "modelfile": fmt.Println(resp.Modelfile) case "parameters": + fmt.Println("Model defined parameters:") if resp.Parameters == "" { - fmt.Println("No parameters were specified for this model.") + fmt.Println(" No additional parameters were specified for this model.") } else { - if len(opts.Options) > 0 { - fmt.Println("User defined parameters:") - for k, v := range opts.Options { - fmt.Printf("%-*s %v\n", 30, k, v) - } - fmt.Println() + for _, l := range strings.Split(resp.Parameters, "\n") { + fmt.Printf(" %s\n", l) } - fmt.Println("Model defined parameters:") - fmt.Println(resp.Parameters) + } + fmt.Println() + if len(opts.Options) > 0 { + fmt.Println("User defined parameters:") + for k, v := range opts.Options { + fmt.Printf(" %-*s %v\n", 30, k, v) + } + fmt.Println() } case "system": switch { diff --git a/server/routes.go b/server/routes.go index 603cd42a..40348e73 100644 --- a/server/routes.go +++ b/server/routes.go @@ -842,8 +842,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { } resp.Parameters = strings.Join(params, "\n") - for k, v := range req.Options { - if _, ok := req.Options[k]; ok { + if len(req.Options) > 0 { + if m.Options == nil { + m.Options = make(map[string]any) + } + for k, v := range req.Options { m.Options[k] = v } } From 6c733bf0a65f59410f091719c429d59cd5488072 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 23 Jul 2025 13:23:32 -0700 Subject: [PATCH 03/17] s#x/exp/maps#maps# (#11506) --- convert/convert_test.go | 11 +++-------- convert/reader_safetensors.go | 5 ++--- convert/tokenizer.go | 8 ++------ go.mod | 2 +- template/template.go | 6 ++---- 5 files changed, 10 insertions(+), 22 deletions(-) diff --git a/convert/convert_test.go b/convert/convert_test.go index 105fbb3d..95cccd56 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -11,14 +11,13 @@ import ( "io" "io/fs" "log/slog" + "maps" "os" "path/filepath" "slices" "strings" "testing" - "golang.org/x/exp/maps" - "github.com/ollama/ollama/fs/ggml" ) @@ -137,9 +136,7 @@ func TestConvertModel(t *testing.T) { t.Fatal(err) } - keys := maps.Keys(expect) - slices.Sort(keys) - for _, k := range keys { + for _, k := range slices.Sorted(maps.Keys(expect)) { if v, ok := actual[k]; !ok { t.Errorf("missing %s", k) 
} else if v != expect[k] { @@ -343,9 +340,7 @@ func TestConvertAdapter(t *testing.T) { actual := generateResultsJSON(t, r, m.KV(), m.Tensors()) - keys := maps.Keys(c.Expected) - slices.Sort(keys) - for _, k := range keys { + for _, k := range slices.Sorted(maps.Keys(c.Expected)) { if v, ok := actual[k]; !ok { t.Errorf("missing %s", k) } else if v != c.Expected[k] { diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index f5858532..f182a656 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,12 +8,12 @@ import ( "fmt" "io" "io/fs" + "maps" "slices" "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" - "golang.org/x/exp/maps" ) type safetensorMetadata struct { @@ -46,8 +46,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T return nil, err } - keys := maps.Keys(headers) - slices.Sort(keys) + keys := slices.Sorted(maps.Keys(headers)) names := make(map[string]struct{}, len(keys)) diff --git a/convert/tokenizer.go b/convert/tokenizer.go index bedcd4f8..41d0310a 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -8,11 +8,10 @@ import ( "fmt" "io/fs" "log/slog" + "maps" "os" "slices" "strings" - - "golang.org/x/exp/maps" ) const ( @@ -260,11 +259,8 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { tokens[token.ID] = token } - keys := maps.Keys(tokens) - slices.Sort(keys) - v := Vocabulary{Model: "gpt2"} - for _, k := range keys { + for _, k := range slices.Sorted(maps.Keys(tokens)) { token := tokens[k] v.Tokens = append(v.Tokens, token.Content) v.Scores = append(v.Scores, float32(token.ID)) diff --git a/go.mod b/go.mod index ec3f61bb..46e7f433 100644 --- a/go.mod +++ b/go.mod @@ -71,7 +71,7 @@ require ( github.com/ugorji/go/codec v1.2.12 // indirect golang.org/x/arch v0.8.0 // indirect golang.org/x/crypto v0.36.0 - golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa + golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect golang.org/x/net v0.38.0 // indirect golang.org/x/sys v0.31.0 golang.org/x/term v0.30.0 diff --git a/template/template.go b/template/template.go index 242708f1..d28ace41 100644 --- a/template/template.go +++ b/template/template.go @@ -6,6 +6,7 @@ import ( "encoding/json" "errors" "io" + "maps" "math" "slices" "strings" @@ -14,7 +15,6 @@ import ( "text/template/parse" "github.com/agnivade/levenshtein" - "golang.org/x/exp/maps" "github.com/ollama/ollama/api" ) @@ -157,9 +157,7 @@ func (t *Template) Vars() []string { set[strings.ToLower(n)] = struct{}{} } - vars = maps.Keys(set) - slices.Sort(vars) - return vars + return slices.Sorted(maps.Keys(set)) } type Values struct { From 1e6eab5c334503d95a1d45b143736ae6b1ce5dec Mon Sep 17 00:00:00 2001 From: minxinyi Date: Thu, 24 Jul 2025 05:25:39 +0800 Subject: [PATCH 04/17] server: use slices.Equal to simplify code (#11502) --- server/routes_test.go | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/server/routes_test.go b/server/routes_test.go index 7c44bc95..87b52663 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "reflect" + "slices" "sort" "strings" "testing" @@ -82,19 +83,6 @@ func createTestFile(t *testing.T, name string) (string, string) { return f.Name(), digest } -// equalStringSlices checks if two slices of strings are equal. 
-func equalStringSlices(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} - type panicTransport struct{} func (t *panicTransport) RoundTrip(r *http.Request) (*http.Response, error) { @@ -447,7 +435,7 @@ func TestRoutes(t *testing.T) { "stop \"foo\"", "top_p 0.9", } - if !equalStringSlices(params, expectedParams) { + if !slices.Equal(params, expectedParams) { t.Errorf("expected parameters %v, got %v", expectedParams, params) } paramCount, ok := showResp.ModelInfo["general.parameter_count"].(float64) From 4f8a0166ccc540346dd160796dacdaceac1fde73 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 23 Jul 2025 21:21:29 -0700 Subject: [PATCH 05/17] tools: loosen tool argument parsing (#11509) --- tools/tools.go | 125 +++++++++++----------------- tools/tools_test.go | 197 +++++++------------------------------------- 2 files changed, 78 insertions(+), 244 deletions(-) diff --git a/tools/tools.go b/tools/tools.go index c149885f..f473ab6a 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -120,16 +120,14 @@ func (p *Parser) parseToolCall() *api.ToolCall { return nil } - // only look for arguments after the tool name if the tool has parameters - // TODO (jmorganca): while probably uncommon, this doesn't support - // parsing arguments before the tool name, which may be needed in the future - args := map[string]any{} - if len(tool.Function.Parameters.Properties) > 0 { - var i int - if args, i = findArguments(*tool, p.buffer[end:]); args == nil { - return nil + var args map[string]any + if found, i := findArguments(p.buffer); found == nil { + return nil + } else { + args = found + if i > end { + end = i } - end += i } tc := &api.ToolCall{ @@ -217,93 +215,70 @@ func findTool(tools []api.Tool, buf []byte) (*api.Tool, int) { // objects for functions that have all-optional parameters // e.g. 
`{"name": "get_conditions", "arguments": {}}` will work but // `{"name": "get_conditions"}` will not currently work -func findArguments(tool api.Tool, buffer []byte) (map[string]any, int) { +func findArguments(buffer []byte) (map[string]any, int) { if len(buffer) == 0 { return nil, 0 } var braces int var start int = -1 - var end int - var object []byte - // find any outer json object for i, c := range buffer { if c == '{' { - braces++ - if start == -1 { + if braces == 0 { start = i } - } + braces++ + } else if c == '}' && braces > 0 { + braces-- + if braces == 0 && start != -1 { + object := buffer[start : i+1] - if c == '}' { - if start != -1 { - braces-- - if braces == 0 { - end = i + 1 - object = buffer[start:end] - break + var data map[string]any + if err := json.Unmarshal(object, &data); err != nil { + start = -1 + continue } - } - } - } - if braces > 0 { - return nil, 0 - } - - var data map[string]any - if err := json.Unmarshal(object, &data); err != nil { - return nil, 0 - } - - var find func(obj any) map[string]any - find = func(obj any) map[string]any { - switch obj := obj.(type) { - case map[string]any: - valid := true - // check if all keys in the object exist in the tool's parameters - for key := range obj { - if _, exists := tool.Function.Parameters.Properties[key]; !exists { - valid = false - break - } - } - - // check for required parameters - // TODO (jmorganca): this should error instead of silently failing - if valid { - for _, required := range tool.Function.Parameters.Required { - if _, exists := obj[required]; !exists { - valid = false - break + var findObject func(obj map[string]any) (map[string]any, bool) + findObject = func(obj map[string]any) (map[string]any, bool) { + if _, hasName := obj["name"]; hasName { + if args, ok := obj["arguments"].(map[string]any); ok { + return args, true + } + if args, ok := obj["parameters"].(map[string]any); ok { + return args, true + } + return nil, true } - } - } - if valid { - return obj - } + for _, v := range obj { + switch child := v.(type) { + case map[string]any: + if result, found := findObject(child); found { + return result, true + } + case []any: + for _, item := range child { + if childObj, ok := item.(map[string]any); ok { + if result, found := findObject(childObj); found { + return result, true + } + } + } + } + } - for _, value := range obj { - if result := find(value); result != nil { - return result + return nil, false } - } - case []any: - for _, item := range obj { - if result := find(item); result != nil { - return result + + if args, found := findObject(data); found { + return args, i } + + return data, i } } - - return nil - } - - result := find(data) - if result != nil { - return result, end } return nil, 0 diff --git a/tools/tools_test.go b/tools/tools_test.go index 092ae323..a0f7b6b0 100644 --- a/tools/tools_test.go +++ b/tools/tools_test.go @@ -227,13 +227,6 @@ func TestParser(t *testing.T) { }, }, }, - { - name: "invalid arguments", - inputs: []string{`{"name": "get_conditions", "arguments": {"city": "San Francisco"}}`}, - content: "", - tmpl: qwen, - calls: nil, - }, { name: "empty args", inputs: []string{`{"name": "get_conditions", "arguments": {}}`}, @@ -249,13 +242,6 @@ func TestParser(t *testing.T) { }, }, }, - { - name: "missing required args", - inputs: []string{`{"name": "get_temperature", "arguments": {}}`}, - content: "", - tmpl: qwen, - calls: nil, - }, { name: "text before tool call", inputs: []string{`Let me check the weather. 
{"name": "get_temperature", "arguments": {"city": "New York"}}`}, @@ -273,21 +259,6 @@ func TestParser(t *testing.T) { }, }, }, - { - name: "qwen no args tool call", - inputs: []string{`Let me say hello to the user. I'll use the say_hello tool {"name": "say_hello"}`}, - content: "Let me say hello to the user. I'll use the say_hello tool ", - tmpl: qwen, - calls: []api.ToolCall{ - { - Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", - Arguments: api.ToolCallFunctionArguments{}, - }, - }, - }, - }, { name: "qwen no args with text", inputs: []string{"Let me say hello to the user. I'll use the say_hello tool. "}, @@ -521,52 +492,6 @@ func TestParser(t *testing.T) { content: "for { fmt.Println(\"hello\") }", tmpl: json, }, - { - name: "json no args tool call", - inputs: []string{ - "{\"name\": \"say_hello\"}", - }, - content: "", - tmpl: json, - calls: []api.ToolCall{ - { - Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", - Arguments: api.ToolCallFunctionArguments{}, - }, - }, - }, - }, - { - name: "json no args no tool call", - inputs: []string{ - "I'll use the say_hello tool to say hello to the user.", - }, - content: "I'll use the say_hello tool to say hello to the user.", - tmpl: json, - calls: nil, - }, - - // TODO (jmorganca): this is a false positive, we should - // not be parsing this as a tool call - { - name: "json no args false positive", - inputs: []string{ - `{say_hello!!!}`, - }, - content: "", - tmpl: json, - calls: []api.ToolCall{ - { - Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", - Arguments: api.ToolCallFunctionArguments{}, - }, - }, - }, - }, { name: "list multiple", inputs: []string{ @@ -684,26 +609,6 @@ func TestParser(t *testing.T) { tmpl: list, calls: nil, }, - { - name: "list with no arguments", - inputs: []string{ - "[", - "{", - "\"name\": \"say_hello\"", - "}", - }, - content: "", - tmpl: list, - calls: []api.ToolCall{ - { - Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", - Arguments: api.ToolCallFunctionArguments{}, - }, - }, - }, - }, { name: "tool name with collision", inputs: []string{ @@ -711,7 +616,7 @@ func TestParser(t *testing.T) { "{", "\"name\": \"say_hello", "_world\",", - "}", + "\"arguments\": {}}", "}", }, content: "", @@ -733,13 +638,13 @@ func TestParser(t *testing.T) { "{", "\"name\": \"say_hello", "_world\",", - "}", + "\"arguments\": {}}", "", "", "{", "\"name\": \"say_hello", "\",", - "}", + "\"arguments\": {}}", "", }, content: "", @@ -773,7 +678,7 @@ func TestParser(t *testing.T) { { name: "tool name with collision non streaming multiple", inputs: []string{ - `{"name": "say_hello"}{"name": "say_hello_world"}`, + `{"name": "say_hello", "arguments": {}}{"name": "say_hello_world", "arguments": {}}`, }, content: "", tmpl: qwen, @@ -797,7 +702,7 @@ func TestParser(t *testing.T) { { name: "tool name with collision non streaming shorter", inputs: []string{ - `{"name": "say_hello"}`, + `{"name": "say_hello", "arguments": {}}`, }, content: "", tmpl: qwen, @@ -814,7 +719,7 @@ func TestParser(t *testing.T) { { name: "tool name with collision non streaming longer", inputs: []string{ - `{"name": "say_hello_world"}`, + `{"name": "say_hello_world", "arguments": {}}`, }, content: "", tmpl: qwen, @@ -871,6 +776,26 @@ func TestParser(t *testing.T) { }, }, }, + { + name: "args before name", + inputs: []string{ + `{"arguments": {"a": "5", "b": "10"}, "name": "add"}`, + }, + content: "", + tmpl: qwen, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "add", + 
Arguments: api.ToolCallFunctionArguments{ + "a": "5", + "b": "10", + }, + }, + }, + }, + }, } for _, tt := range tests { @@ -1167,75 +1092,25 @@ func TestFindTag(t *testing.T) { } func TestFindArguments(t *testing.T) { - tool := api.Tool{ - Type: "function", - Function: api.ToolFunction{ - Name: "get_temperature", - Description: "Retrieve the temperature for a given location", - Parameters: struct { - Type string `json:"type"` - Defs any `json:"$defs,omitempty"` - Items any `json:"items,omitempty"` - Required []string `json:"required"` - Properties map[string]struct { - Type api.PropertyType `json:"type"` - Items any `json:"items,omitempty"` - Description string `json:"description"` - Enum []any `json:"enum,omitempty"` - } `json:"properties"` - }{ - Type: "object", - Properties: map[string]struct { - Type api.PropertyType `json:"type"` - Items any `json:"items,omitempty"` - Description string `json:"description"` - Enum []any `json:"enum,omitempty"` - }{ - "format": { - Type: api.PropertyType{"string"}, - Description: "The format to return the temperature in", - Enum: []any{"fahrenheit", "celsius"}, - }, - "location": { - Type: api.PropertyType{"string"}, - Description: "The location to get the temperature for", - }, - }, - }, - }, - } - - tool2 := api.Tool{ - Type: "function", - Function: api.ToolFunction{ - Name: "say_hello", - Description: "Say hello to the user", - }, - } - tests := []struct { name string buffer []byte want map[string]any - tool api.Tool }{ { name: "empty string", buffer: []byte{}, want: nil, - tool: tool, }, { name: "whitespace only", buffer: []byte(" \n\t "), want: nil, - tool: tool, }, { name: "unbalanced braces - missing closing", buffer: []byte(`{"format": "fahrenheit", "location": "San Francisco"`), want: nil, - tool: tool, }, { name: "unbalanced braces - extra closing", @@ -1243,13 +1118,11 @@ func TestFindArguments(t *testing.T) { want: map[string]any{ "format": "fahrenheit", }, - tool: tool, }, { name: "invalid JSON", buffer: []byte(`{format: fahrenheit, location: "San Francisco"}`), want: nil, - tool: tool, }, { name: "valid json", @@ -1258,7 +1131,6 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, - tool: tool, }, { name: "valid arguments with special tokens", @@ -1267,16 +1139,14 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, - tool: tool, }, { name: "valid arguments in array", - buffer: []byte(`[{"arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}`), + buffer: []byte(`[{"name": "get_temperature", "arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}`), want: map[string]any{ "format": "fahrenheit", "location": "San Francisco, CA", }, - tool: tool, }, { name: "nested deep", @@ -1285,7 +1155,6 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, - tool: tool, }, { name: "one arg", @@ -1293,7 +1162,6 @@ func TestFindArguments(t *testing.T) { want: map[string]any{ "location": "San Francisco, CA", }, - tool: tool, }, { name: "two args", @@ -1302,13 +1170,6 @@ func TestFindArguments(t *testing.T) { "location": "San Francisco, CA", "format": "fahrenheit", }, - tool: tool, - }, - { - name: "no args", - buffer: []byte(`{"name": "say_hello"}`), - want: nil, - tool: tool2, }, { name: "deepseek", @@ -1316,7 +1177,6 @@ func TestFindArguments(t *testing.T) { want: map[string]any{ "location": "Tokyo", }, - tool: tool, }, { name: "deepseek", @@ -1324,13 +1184,12 @@ func 
TestFindArguments(t *testing.T) { want: map[string]any{ "location": "Tokyo", }, - tool: tool, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, _ := findArguments(tt.tool, tt.buffer) + got, _ := findArguments(tt.buffer) if diff := cmp.Diff(got, tt.want); diff != "" { t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff) From 80b538e312c173d124fdcb91d40285b32e80d0a9 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Wed, 23 Jul 2025 22:16:55 -0700 Subject: [PATCH 06/17] cli: catch upstream errors gracefully (#11512) --- cmd/cmd.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index 7955012c..1d1d116b 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1137,6 +1137,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) { if errors.Is(err, context.Canceled) { return nil, nil } + + // this error should ideally be wrapped properly by the client + if strings.Contains(err.Error(), "upstream error") { + p.StopAndClear() + fmt.Println("An error occurred while processing your message. Please try again.") + fmt.Println() + return nil, nil + } return nil, err } From b72e5adb14338f78937b103f0c8c668d5f4c4006 Mon Sep 17 00:00:00 2001 From: Ruyut Date: Sat, 26 Jul 2025 05:24:06 +0800 Subject: [PATCH 07/17] CONTRIBUTING: fix typo in commit message example (#11528) --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c7028e00..455e7c69 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -65,7 +65,7 @@ continuation of the sentence: Examples: llm/backend/mlx: support the llama architecture - CONTRIBUTING: provide clairity on good commit messages, and bad + CONTRIBUTING: provide clarity on good commit messages, and bad Bad Examples: From 764be7480f19f1749c518b21cead7c3a44c04b1d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 25 Jul 2025 14:50:05 -0700 Subject: [PATCH 08/17] kvcache: Group shift operations into batches Currently, when we need to do a shift on the cache, it is one RoPE operation on the entire size of the cache (per layer). In some cases, this can create a compute graph that is larger than the forward pass since the forward pass is working in batches. Since we don't consider shifting in our memory estimates, it's possible for this to cause a crash if we run out of memory. By limiting the size of the RoPE calls to batch size chunks, we ensure that the shift will never exceed the size of the forward pass, since the forward pass will also contain a RoPE of the same size. This does not have a sigificant impact on performance since RoPE is a math operation that is mostly proportional to the size of its inputs. In theory defrag could have the same issue since it also creates a compute graph outside of the forward pass, however, since it is only copies, it does not require any working space. 
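For illustration only (this is not part of the change below, and the helper
name and numbers are made up for the example): the batching described above
amounts to splitting a sequence's cell range into pieces of at most one
batch each, so no single shift operation is larger than a forward pass.

    package main

    import "fmt"

    // chunks splits the cell range [minCell, maxCell] into pieces of at
    // most maxBatch cells, mirroring the loop bounds described above.
    func chunks(minCell, maxCell, maxBatch int) [][2]int {
        var out [][2]int
        for start := minCell; start <= maxCell; start += maxBatch {
            size := min(maxCell-start+1, maxBatch)
            out = append(out, [2]int{start, size})
        }
        return out
    }

    func main() {
        // A 10-cell range with a batch size of 4 becomes three shifts of
        // sizes 4, 4 and 2 instead of one shift over all 10 cells.
        fmt.Println(chunks(0, 9, 4))
    }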
--- kvcache/causal.go | 79 ++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index b594d0b4..8b101a81 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -25,6 +25,9 @@ type Causal struct { opts CausalOptions + // maxBatch is the largest batch that we might receive + maxBatch int + // config controls mostly backend-specific optimizations config *ml.CacheConfig @@ -147,6 +150,7 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity c.DType = dtype c.cellRanges = make(map[int]cellRange) c.backend = backend + c.maxBatch = maxBatch } func (c *Causal) SetConfig(config ml.CacheConfig) { @@ -639,48 +643,51 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { return ErrNotSupported } - ctx := c.backend.NewContext() - defer ctx.Close() - seqRange := c.cellRanges[seq] - size := seqRange.max - seqRange.min + 1 - offsets := make([]int32, size) - for i := range offsets { - cell := c.cells[seqRange.min+i] + for start := seqRange.min; start <= seqRange.max; start += c.maxBatch { + ctx := c.backend.NewContext() - if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { - offsets[i] = offset + size := min(seqRange.max-start+1, c.maxBatch) + offsets := make([]int32, size) + for i := range offsets { + cell := c.cells[start+i] + + if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { + offsets[i] = offset + } } + + kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) + + for i, key := range c.keys { + if key == nil { + continue + } + + kHeadDim := key.Dim(0) + numKVHeads := key.Dim(1) + rowSize := key.Stride(2) + + key = key.View(ctx, rowSize*start, + kHeadDim, key.Stride(1), + numKVHeads, key.Stride(2), + size, + ) + + roped, err := c.shiftFn(ctx, i, key, kShift) + if err != nil { + ctx.Close() + return err + } + + ctx.Forward(roped.Copy(ctx, key)) + } + + ctx.Compute() + ctx.Close() } - kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) - - for i, key := range c.keys { - if key == nil { - continue - } - - kHeadDim := key.Dim(0) - numKVHeads := key.Dim(1) - rowSize := key.Stride(2) - - key = key.View(ctx, rowSize*seqRange.min, - kHeadDim, key.Stride(1), - numKVHeads, key.Stride(2), - size, - ) - - roped, err := c.shiftFn(ctx, i, key, kShift) - if err != nil { - return err - } - - ctx.Forward(roped.Copy(ctx, key)) - } - - ctx.Compute() - return nil } From bbf66c0b960be42936e861f13dd0284b2aa03b9d Mon Sep 17 00:00:00 2001 From: Mayan EDMS <50279075+mayanedms@users.noreply.github.com> Date: Sun, 27 Jul 2025 18:02:52 -0400 Subject: [PATCH 09/17] readme: add Mayan EDMS to community integrations (#11543) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1ea24e75..d5049d3e 100644 --- a/README.md +++ b/README.md @@ -410,6 +410,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.) - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.) - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.) 
+- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.) ### Cloud From 3515cc377ce2506c95a0ea408fd5d15d306fc6aa Mon Sep 17 00:00:00 2001 From: Yoshi <70424721+yoshihyoda@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:19:13 -0700 Subject: [PATCH 10/17] docs: fix typos and remove trailing whitespaces (#11554) --- docs/api.md | 4 ++-- docs/development.md | 2 +- docs/openai.md | 2 +- docs/troubleshooting.md | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/api.md b/docs/api.md index 41858885..683db357 100644 --- a/docs/api.md +++ b/docs/api.md @@ -500,11 +500,11 @@ The `message` object has the following fields: - `thinking`: (for thinking models) the model's thinking process - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) - `tool_calls` (optional): a list of tools in JSON that the model wants to use -- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result +- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result Advanced parameters (optional): -- `format`: the format to return a response in. Format can be `json` or a JSON schema. +- `format`: the format to return a response in. Format can be `json` or a JSON schema. - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) diff --git a/docs/development.md b/docs/development.md index 24bcba19..9726b5d9 100644 --- a/docs/development.md +++ b/docs/development.md @@ -118,7 +118,7 @@ To run tests, use `go test`: go test ./... ``` -> NOTE: In rare cirumstances, you may need to change a package using the new +> NOTE: In rare circumstances, you may need to change a package using the new > "synctest" package in go1.24. > > If you do not have the "synctest" package enabled, you will not see build or diff --git a/docs/openai.md b/docs/openai.md index d0bac4cd..26930124 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -72,7 +72,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama") # Define the schema for the response class FriendInfo(BaseModel): name: str - age: int + age: int is_available: bool class FriendList(BaseModel): diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 995b33ac..6fdd3e85 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log On **Linux** systems with systemd, the logs can be found with this command: ```shell -journalctl -u ollama --no-pager --follow --pager-end +journalctl -u ollama --no-pager --follow --pager-end ``` When you run Ollama in a **container**, the logs go to stdout/stderr in the container: @@ -23,7 +23,7 @@ docker logs If manually running `ollama serve` in a terminal, the logs will be on that terminal. When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in: -- `explorer %LOCALAPPDATA%\Ollama` to view logs. 
The most recent server logs will be in `server.log` and older logs will be in `server-#.log` +- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log` - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH) - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored @@ -38,7 +38,7 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs. ## LLM libraries -Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` an the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. +Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library. In the server log, you will see a message that looks something like this (varies from release to release): @@ -97,7 +97,7 @@ If none of those resolve the problem, gather additional information and file an On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log. -When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` +When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure. - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems From c116a7523ddc067db2b86aab38172c05ad01c710 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 28 Jul 2025 11:29:25 -0700 Subject: [PATCH 11/17] kvcache: Don't shift empty batches When we context shift, we delete half the context and apply RoPE with an offset to the other half. 
We used to RoPE across the entire context in a single pass with a zero offset for the deleted section. With the change to shifting in batches, we can skip any batches where all of the offsets would be zero. This typically reduces the number of operations by half. --- kvcache/causal.go | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index 8b101a81..496eeaa6 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -646,18 +646,31 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { seqRange := c.cellRanges[seq] for start := seqRange.min; start <= seqRange.max; start += c.maxBatch { - ctx := c.backend.NewContext() - size := min(seqRange.max-start+1, c.maxBatch) offsets := make([]int32, size) + + var batchFirst, batchLast int + + batchFirst = -1 for i := range offsets { cell := c.cells[start+i] if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex { offsets[i] = offset + if batchFirst < 0 { + batchFirst = i + } + batchLast = i } } + if batchFirst < 0 { + continue + } + + offsets = offsets[batchFirst : batchLast+1] + + ctx := c.backend.NewContext() kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) for i, key := range c.keys { @@ -669,10 +682,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { numKVHeads := key.Dim(1) rowSize := key.Stride(2) - key = key.View(ctx, rowSize*start, + key = key.View(ctx, rowSize*(start+batchFirst), kHeadDim, key.Stride(1), numKVHeads, key.Stride(2), - size, + len(offsets), ) roped, err := c.shiftFn(ctx, i, key, kShift) From ea85e27bbd76a342ad390576fc2e717a72ce96de Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Tue, 29 Jul 2025 21:37:06 +0200 Subject: [PATCH 12/17] Increase performance for Gemma3n models on NVGPUs by enabling CUDA Graph execution (#11525) * Enable CUDA Graphs for gemma3n. Similar to https://github.com/ggml-org/llama.cpp/pull/14741, though ollama has a slightly different model graph than llama.cpp which requires different workaround checks. 
* Remove residual check by reshaping differently in gemma3n model This should make the heuristics more robust --- .../0019-metal-add-mean-kernel-14267.patch | 2 +- .../0020-CUDA-add-mean-operation-14313.patch | 2 +- .../0021-Enable-CUDA-Graphs-for-gemma3n.patch | 50 +++++++++++++++++++ .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++-- model/models/gemma3n/model_text.go | 7 ++- 5 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch index a52f0fdf..e65aeb7b 100644 --- a/llama/patches/0019-metal-add-mean-kernel-14267.patch +++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch @@ -16,7 +16,7 @@ ggml-ci 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index ee4f2dcb..f20f5615 100644 +index a9eeebc6..110c9ece 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type { diff --git a/llama/patches/0020-CUDA-add-mean-operation-14313.patch b/llama/patches/0020-CUDA-add-mean-operation-14313.patch index efcb1e8b..2f4e3794 100644 --- a/llama/patches/0020-CUDA-add-mean-operation-14313.patch +++ b/llama/patches/0020-CUDA-add-mean-operation-14313.patch @@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644 static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 4c829153..9e64e5ae 100644 +index d6960174..2b9fabf4 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -35,6 +35,7 @@ diff --git a/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch new file mode 100644 index 00000000..b9dd6cdc --- /dev/null +++ b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch @@ -0,0 +1,50 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Oliver Simons +Date: Tue, 22 Jul 2025 11:02:28 +0200 +Subject: [PATCH] Enable CUDA Graphs for gemma3n. + +Similar to +https://github.com/ggml-org/llama.cpp/pull/14741, +though ollama has a slightly different model graph +than llama.cpp which requires different workaround +checks. +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 2b9fabf4..28ccf4be 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + ++ const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)"; ++ const std::string gemma3n_node_name = "node_"; ++ + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + +@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { +- // disable CUDA graphs for batch size > 1 for now. +- // Changes in batch size or context size can cause changes to the grid size of some kernels. 
++ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n ++ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here ++ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256 ++ && node->ne[2] == 1 ++ && node->ne[3] == 1 ++ && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false ++ && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) { ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + #endif + } + diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 2b9fabf4..28ccf4be 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); + const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)"; + const std::string gemma3n_node_name = "node_"; + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { - // disable CUDA graphs for batch size > 1 for now. - // Changes in batch size or context size can cause changes to the grid size of some kernels. + // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n + // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here + if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256 + && node->ne[2] == 1 + && node->ne[3] == 1 + && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false + && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) { + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go index 715b8a0e..b75a2abb 100644 --- a/model/models/gemma3n/model_text.go +++ b/model/models/gemma3n/model_text.go @@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions coefficients := a.PredictionCoefficient.Forward(ctx, modalities) coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2)) - hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) - predictions := coefficients.Mulmat(ctx, hiddenStates) - predictions = predictions.Add(ctx, hiddenStates) - return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)) + predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx) + return predictions.Add(ctx, hiddenStates) } func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor { From 8afa6e83f2cace42cc1421737f9f9b235e8e33b7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 29 Jul 2025 16:41:25 -0700 Subject: [PATCH 13/17] CI: switch back to x86 macos builder (#11572) --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 40871e64..4acb283b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -23,7 +23,7 @@ jobs: echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT darwin-build: - runs-on: macos-13-xlarge + runs-on: macos-13 environment: release needs: setup-environment strategy: From 25911a6e6bd5a0cf209d871c721aa7bc74f59509 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 30 Jul 2025 08:50:54 -0700 Subject: [PATCH 14/17] mac: disable bf16 on unsupported OS versions (#11585) Support for bf16 was added in MacOS v14+ and attempting to enable on older versions causes runtime failures. 
--- .../0019-metal-add-mean-kernel-14267.patch | 4 +-- .../0022-BF16-macos-version-guard.patch | 27 +++++++++++++++++++ .../ggml/ggml/src/ggml-metal/ggml-metal.m | 6 ++++- 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 llama/patches/0022-BF16-macos-version-guard.patch diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch index e65aeb7b..f20e854b 100644 --- a/llama/patches/0019-metal-add-mean-kernel-14267.patch +++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch @@ -19,7 +19,7 @@ diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a9eeebc6..110c9ece 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type { +@@ -489,6 +489,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, GGML_METAL_KERNEL_TYPE_SUM_ROWS, @@ -27,7 +27,7 @@ index a9eeebc6..110c9ece 100644 GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, GGML_METAL_KERNEL_TYPE_ARGMAX, -@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de +@@ -1436,6 +1437,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); diff --git a/llama/patches/0022-BF16-macos-version-guard.patch b/llama/patches/0022-BF16-macos-version-guard.patch new file mode 100644 index 00000000..68aac0bb --- /dev/null +++ b/llama/patches/0022-BF16-macos-version-guard.patch @@ -0,0 +1,27 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Wed, 30 Jul 2025 08:43:46 -0700 +Subject: [PATCH] BF16 macos version guard + +Only enable BF16 on supported MacOS versions (v14+) +--- + ggml/src/ggml-metal/ggml-metal.m | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index 110c9ece..ab46f6e3 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -89,7 +89,11 @@ + ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; + + #if defined(GGML_METAL_USE_BF16) +- ctx->use_bfloat = ctx->has_bfloat; ++ if (@available(macOS 14.0, *)) { ++ ctx->use_bfloat = ctx->has_bfloat; ++ } else { ++ ctx->use_bfloat = false; ++ } + #else + ctx->use_bfloat = false; + #endif diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 110c9ece..ab46f6e3 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -89,7 +89,11 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; #if defined(GGML_METAL_USE_BF16) - ctx->use_bfloat = ctx->has_bfloat; + if (@available(macOS 14.0, *)) { + ctx->use_bfloat = ctx->has_bfloat; + } else { + ctx->use_bfloat = false; + } #else ctx->use_bfloat = false; #endif From 6dcc5dfb9c0a033e4e8dde627d55580600418fb6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 30 Jul 2025 08:56:01 -0700 Subject: [PATCH 15/17] Revert "CI: switch back to x86 macos builder" (#11588) This reverts commit 9d071e6089319b37acf62bb739e3430dcb2ac0c3. 
--- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4acb283b..40871e64 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -23,7 +23,7 @@ jobs: echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT darwin-build: - runs-on: macos-13 + runs-on: macos-13-xlarge environment: release needs: setup-environment strategy: From ff89ba90bc97e9f58b8378a664b904bbc94e6f26 Mon Sep 17 00:00:00 2001 From: Sajal Kulshreshtha Date: Thu, 31 Jul 2025 00:32:54 +0530 Subject: [PATCH 16/17] fixing broken AMD driver link (#11579) --- discover/amd_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discover/amd_linux.go b/discover/amd_linux.go index 830fa1df..dc9a4e18 100644 --- a/discover/amd_linux.go +++ b/discover/amd_linux.go @@ -58,7 +58,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { driverMajor, driverMinor, err := AMDDriverVersion() if err != nil { // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU - slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err) + slog.Warn("ollama recommends running the https://www.amd.com/en/support/download/linux-drivers.html", "error", err) } // Determine if the user has already pre-selected which GPUs to look at, then ignore the others From 4183bb0574a28b73276efef944107d0c45d79c95 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 30 Jul 2025 14:42:57 -0700 Subject: [PATCH 17/17] kvcache: Enable SWA to retain additional entries Models that use sliding window attention can only resume a sequence from the cache if it falls within the saved windows. This works well if the next message picks up where the old one left off. However, it generally prevents a partial prefix match unless the entire conversation falls within the sliding window. This can be a problem with reasoning models where the traces are supposed to be removed from future messages, forcing the entire history to be re-evaluated. This change allows models to specify that a larger amount of the history be retained in memory, to allow more partial resumption. It still respects the window that the model was trained on for token generation. --- kvcache/causal.go | 117 +++++++++++++++++++++++++-------------- kvcache/causal_test.go | 121 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 196 insertions(+), 42 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index 496eeaa6..56c93600 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -19,9 +19,16 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e // The tensors are of shape embed dim, kv heads, batch size // The mask is of shape history size, batch size type Causal struct { - DType ml.DType - windowSize int32 - chunkSize int32 + DType ml.DType + + // swaWindowSize is the number of tokens that will be included in the mask + // during attention operations. swaMemorySize is the number of tokens that + // will be retained in memory for partial prefix caching. Set to math.MaxInt32 + // for unlimited or if sliding window attention is not being used. 
+ swaWindowSize int32 + swaMemorySize int32 + + chunkSize int32 opts CausalOptions @@ -88,32 +95,41 @@ type cellRange struct { func NewCausalCache(shift shiftFn) *Causal { return &Causal{ - windowSize: math.MaxInt32, - shiftFn: shift, - ctxs: make(map[int]ml.Context), - keys: make(map[int]ml.Tensor), - values: make(map[int]ml.Tensor), + shiftFn: shift, + ctxs: make(map[int]ml.Context), + keys: make(map[int]ml.Tensor), + values: make(map[int]ml.Tensor), } } func NewSWACache(windowSize int32, shift shiftFn) *Causal { return &Causal{ - windowSize: windowSize, - shiftFn: shift, - ctxs: make(map[int]ml.Context), - keys: make(map[int]ml.Tensor), - values: make(map[int]ml.Tensor), + swaWindowSize: windowSize, + shiftFn: shift, + ctxs: make(map[int]ml.Context), + keys: make(map[int]ml.Tensor), + values: make(map[int]ml.Tensor), + } +} + +func NewSWAMemCache(windowSize int32, memorySize int32, shift shiftFn) *Causal { + return &Causal{ + swaWindowSize: windowSize, + swaMemorySize: memorySize, + shiftFn: shift, + ctxs: make(map[int]ml.Context), + keys: make(map[int]ml.Tensor), + values: make(map[int]ml.Tensor), } } func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal { return &Causal{ - windowSize: math.MaxInt32, - chunkSize: chunkSize, - shiftFn: shift, - ctxs: make(map[int]ml.Context), - keys: make(map[int]ml.Tensor), - values: make(map[int]ml.Tensor), + chunkSize: chunkSize, + shiftFn: shift, + ctxs: make(map[int]ml.Context), + keys: make(map[int]ml.Tensor), + values: make(map[int]ml.Tensor), } } @@ -138,11 +154,25 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity c.config.MaskDType = ml.DTypeF32 } + if c.swaWindowSize == 0 { + c.swaWindowSize = math.MaxInt32 + } + if c.swaMemorySize == 0 { + c.swaMemorySize = c.swaWindowSize + } + if int(c.swaMemorySize) > capacity { + c.swaMemorySize = math.MaxInt32 + } + + if c.swaMemorySize < c.swaWindowSize { + panic(fmt.Errorf("sliding window memory (%v) must be at least as large as the window (%v)", c.swaMemorySize, c.swaWindowSize)) + } + var cacheSize int - if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) { + if c.swaMemorySize == math.MaxInt32 { cacheSize = maxSequences * capacity } else { - cacheSize = (maxSequences * int(c.windowSize)) + maxBatch + cacheSize = (maxSequences * int(c.swaMemorySize)) + maxBatch } cacheSize = roundUp(cacheSize, c.config.CachePadding) c.cells = make([]cacheCell, cacheSize) @@ -187,7 +217,6 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e return err } - c.curCellRange = newRange() for i, pos := range batch.Positions { seq := batch.Sequences[i] @@ -198,19 +227,12 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e seqRange = newRange() } - if c.curLoc+i > seqRange.max { - seqRange.max = c.curLoc + i - } - if seqRange.max > c.curCellRange.max { - c.curCellRange.max = seqRange.max - } + seqRange.min = min(seqRange.min, c.curLoc+i) + c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i) + + seqRange.max = max(seqRange.max, c.curLoc+i) + c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i) - if c.curLoc+i < seqRange.min { - seqRange.min = c.curLoc + i - } - if seqRange.min < c.curCellRange.min { - c.curCellRange.min = seqRange.min - } c.cellRanges[seq] = seqRange } } else { @@ -252,7 +274,16 @@ func (c *Causal) findStartLoc() (int, error) { } func (c *Causal) updateSlidingWindow() { - if c.windowSize == math.MaxInt32 { + c.curCellRange = newRange() + + if c.swaMemorySize == math.MaxInt32 { 
+ for _, seq := range c.curSequences { + if seqRange, ok := c.cellRanges[seq]; ok { + c.curCellRange.min = min(c.curCellRange.min, seqRange.min) + c.curCellRange.max = max(c.curCellRange.max, seqRange.max) + } + } + return } @@ -282,12 +313,16 @@ func (c *Causal) updateSlidingWindow() { for i := oldRange.min; i <= oldRange.max; i++ { if slices.Contains(c.cells[i].sequences, seq) { - if c.cells[i].pos < pos-c.windowSize { + if c.cells[i].pos < pos-c.swaMemorySize { c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq }) } else { newRange.min = min(newRange.min, i) newRange.max = max(newRange.max, i) } + if c.cells[i].pos >= pos-c.swaWindowSize { + c.curCellRange.min = min(c.curCellRange.min, i) + c.curCellRange.max = max(c.curCellRange.max, i) + } } } @@ -327,7 +362,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor { if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) || (enabled && c.cells[j].pos > c.curPositions[i]) || c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize || - c.cells[j].pos < c.curPositions[i]-c.windowSize { + c.cells[j].pos < c.curPositions[i]-c.swaWindowSize { mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1)) } } @@ -485,6 +520,8 @@ func (c *Causal) defrag() { c.cellRanges[seq] = seqRange } + + c.updateSlidingWindow() } func (c *Causal) SetLayer(layer int) { @@ -610,7 +647,7 @@ func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) { } func (c *Causal) CanResume(seq int, pos int32) bool { - if c.windowSize == math.MaxInt32 { + if c.swaMemorySize == math.MaxInt32 { return true } @@ -632,8 +669,8 @@ func (c *Causal) CanResume(seq int, pos int32) bool { return false } - lastWindowStart := max(0, last-c.windowSize) - posWindowStart := max(0, pos-c.windowSize) + lastWindowStart := max(0, last-c.swaMemorySize) + posWindowStart := max(0, pos-c.swaWindowSize) return posWindowStart >= lastWindowStart } diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 5b1dbe86..0d8cea79 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -60,6 +60,8 @@ func TestSWA(t *testing.T) { cache.Init(backend, ml.DTypeF16, 1, 16, 16) + x := float32(math.Inf(-1)) + tests := []testCase{ { name: "FirstBatch", @@ -69,7 +71,12 @@ func TestSWA(t *testing.T) { pos: []int32{0, 1, 2, 3}, expected: []float32{1, 2, 3, 4}, expectedShape: []int{1, 1, 4}, - expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0}, + expectedMask: []float32{ + 0, x, x, x, + 0, 0, x, x, + x, 0, 0, x, + x, x, 0, 0, + }, }, { name: "SecondBatch", @@ -79,7 +86,53 @@ func TestSWA(t *testing.T) { pos: []int32{4, 5}, expected: []float32{5, 6, 3, 4}, expectedShape: []int{1, 1, 4}, - expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))}, + expectedMask: []float32{ + 0, x, x, 0, + 0, 0, x, x, + }, + }, + } + + testCache(t, backend, cache, tests) +} + +func TestSWAMem(t *testing.T) { + backend := &testBackend{} + cache := NewSWAMemCache(1, 3, nil) + defer cache.Close() + + cache.Init(backend, ml.DTypeF16, 1, 16, 16) + + x := float32(math.Inf(-1)) + + tests := []testCase{ + { + name: "FirstBatch", + in: []float32{1, 2, 3, 4}, + inShape: []int{1, 1, 4}, + seqs: []int{0, 0, 0, 0}, + pos: []int32{0, 1, 2, 3}, + expected: []float32{1, 2, 3, 4}, + 
expectedShape: []int{1, 1, 4}, + expectedMask: []float32{ + 0, x, x, x, + 0, 0, x, x, + x, 0, 0, x, + x, x, 0, 0, + }, + }, + { + name: "SecondBatch", + in: []float32{5, 6}, + inShape: []int{1, 1, 2}, + seqs: []int{0, 0}, + pos: []int32{4, 5}, + expected: []float32{4, 5, 6}, + expectedShape: []int{1, 1, 3}, + expectedMask: []float32{ + 0, 0, x, + x, 0, 0, + }, }, } @@ -437,6 +490,70 @@ func TestCanResume(t *testing.T) { } } +func TestCanResumeSWAMem(t *testing.T) { + backend := &testBackend{} + windowSize := int32(4) + memSize := int32(5) + cache := NewSWAMemCache(windowSize, memSize, nil) + defer cache.Close() + + cache.Init(backend, ml.DTypeF16, 1, 16, 16) + + context := backend.NewContext() + defer context.Close() + + err := cache.StartForward(context, input.Batch{ + Positions: []int32{0, 1, 2, 3, 4, 5}, + Sequences: []int{0, 0, 0, 0, 0, 0}, + }, false) + if err != nil { + t.Fatalf("StartForward failed: %v", err) + } + + cache.SetLayer(0) + tensor := context.FromFloatSlice([]float32{1, 2, 3, 4, 5, 6}, 1, 1, 6) + cache.Put(context, tensor, tensor) + + // shift window by adding position 6 + err = cache.StartForward(context, input.Batch{ + Positions: []int32{6, 7}, + Sequences: []int{0, 0}, + }, false) + if err != nil { + t.Fatalf("StartForward failed: %v", err) + } + + cache.SetLayer(0) + tensor = context.FromFloatSlice([]float32{7, 8}, 1, 1, 2) + cache.Put(context, tensor, tensor) + + // only the latest position has overlapping windows + if cache.CanResume(0, 0) { + t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)") + } + if cache.CanResume(0, 1) { + t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)") + } + if cache.CanResume(0, 2) { + t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)") + } + if cache.CanResume(0, 3) { + t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)") + } + if cache.CanResume(0, 4) { + t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)") + } + if cache.CanResume(0, 5) { + t.Errorf("after shift: CanResume(0, 5) = true, want false (outside window)") + } + if !cache.CanResume(0, 6) { + t.Errorf("after shift: CanResume(0, 6) = false, want true (inside window)") + } + if !cache.CanResume(0, 7) { + t.Errorf("after shift: CanResume(0, 7) = false, want true (latest position)") + } +} + type testBackend struct { ml.Backend }