From 61a882521657a27b5e1a0834eb6ab69617290ef2 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 18 Mar 2025 10:38:28 -0700 Subject: [PATCH 01/31] convert: return name of unsupported architecture (#9862) When a model's architecture cannot be converted, return the name of the unsupported architecture in the error message. --- convert/convert.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert/convert.go b/convert/convert.go index 7b9fe31f..a31b0d6c 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -201,7 +201,7 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { case "CohereForCausalLM": conv = &commandrModel{} default: - return errors.New("unsupported architecture") + return fmt.Errorf("unsupported architecture %q", p.Architectures[0]) } if err := json.Unmarshal(bts, conv); err != nil { From df94175a0fb0356c9b9e9a62b73d908633c08810 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 18 Mar 2025 16:51:33 -0700 Subject: [PATCH 02/31] ggml: return error on failure to read tensor data (#9872) When converting a ggml model, a failure to read tensor data resulted in a nil error value being returned. The actual error from the read should be returned instead. --- ml/backend/ggml/ggml.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 0c6f1c76..6732470e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -330,7 +330,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { } } - if g.Wait() != nil { + if err := g.Wait(); err != nil { return nil, err } From da0e345200fbb47653d2f9c60fcc60ba7b0a7187 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 18 Mar 2025 18:08:19 -0700 Subject: [PATCH 03/31] ml: use input context for extracting outputs (#9875) --- model/models/gemma2/model.go | 2 +- model/models/gemma3/model.go | 2 +- model/models/llama/model.go | 2 +- model/models/mllama/model.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index 2b8597c4..fbefebe2 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -179,7 +179,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { return nil, err } - outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) if err != nil { return nil, err } diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index 32ad80f4..95f89ad4 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -150,7 +150,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { return nil, err } - outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) if err != nil { return nil, err } diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 19a2ab8c..87eb9b75 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -150,7 +150,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { return nil, err } - outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) if err != nil { return nil, err } diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 
fa4d570c..0aa11f17 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -154,7 +154,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { return nil, err } - outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) if err != nil { return nil, err } From 2ddacd7516aa78d2ad4899681c7a7f9af2eedc65 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Wed, 19 Mar 2025 14:59:57 -0700 Subject: [PATCH 04/31] server/internal/client/ollama: confirm all chunksums were received (#9893) If the chunksums response is missing a chunk, the client should fail the download. This changes the client to check that all bytes are accounted for in the chunksums response. It is possible there are overlaps or gaps in the chunksums response and so the size is not the only thing left to check, but this provides enough coverage for now. We may want to check that chunks are contiguous later. --- server/internal/client/ollama/registry.go | 120 ++++++++---------- .../internal/client/ollama/registry_test.go | 92 ++++++++++++-- 2 files changed, 134 insertions(+), 78 deletions(-) diff --git a/server/internal/client/ollama/registry.go b/server/internal/client/ollama/registry.go index d1d01ba4..fdac71bb 100644 --- a/server/internal/client/ollama/registry.go +++ b/server/internal/client/ollama/registry.go @@ -37,7 +37,6 @@ import ( "golang.org/x/sync/errgroup" "github.com/ollama/ollama/server/internal/cache/blob" - "github.com/ollama/ollama/server/internal/internal/backoff" "github.com/ollama/ollama/server/internal/internal/names" _ "embed" @@ -213,12 +212,6 @@ type Registry struct { // request. If zero, [DefaultChunkingThreshold] is used. ChunkingThreshold int64 - // MaxChunkSize is the maximum size of a chunk to download. If zero, - // the default is [DefaultMaxChunkSize]. - // - // It is only used when a layer is larger than [MaxChunkingThreshold]. - MaxChunkSize int64 - // Mask, if set, is the name used to convert non-fully qualified names // to fully qualified names. If empty, [DefaultMask] is used. Mask string @@ -447,6 +440,11 @@ func (r *Registry) Pull(ctx context.Context, name string) error { if err != nil { return err } + + // TODO(bmizerany): decide if this should be considered valid. Maybe + // server-side we special case '{}' to have some special meaning? Maybe + // "archiving" a tag (which is how we reason about it in the registry + // already, just with a different twist). if len(m.Layers) == 0 { return fmt.Errorf("%w: no layers", ErrManifestInvalid) } @@ -456,11 +454,7 @@ func (r *Registry) Pull(ctx context.Context, name string) error { return err } - exists := func(l *Layer) bool { - info, err := c.Get(l.Digest) - return err == nil && info.Size == l.Size - } - + // TODO(bmizerany): work to remove the need to do this layers := m.Layers if m.Config != nil && m.Config.Digest.IsValid() { layers = append(layers, m.Config) @@ -469,19 +463,16 @@ func (r *Registry) Pull(ctx context.Context, name string) error { // Send initial layer trace events to allow clients to have an // understanding of work to be done before work starts. 
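	// A client receives these per-layer updates by attaching a Trace to its
	// context. A sketch that mirrors the Update callback used in
	// registry_test.go:
	//
	//	ctx := WithTrace(ctx, &Trace{
	//		Update: func(l *Layer, n int64, err error) { /* render progress */ },
	//	})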
t := traceFromContext(ctx) - skip := make([]bool, len(layers)) - for i, l := range layers { + for _, l := range layers { t.update(l, 0, nil) - if exists(l) { - skip[i] = true - t.update(l, l.Size, ErrCached) - } } - g, ctx := errgroup.WithContext(ctx) + var g errgroup.Group g.SetLimit(r.maxStreams()) - for i, l := range layers { - if skip[i] { + for _, l := range layers { + info, err := c.Get(l.Digest) + if err == nil && info.Size == l.Size { + t.update(l, l.Size, ErrCached) continue } @@ -490,63 +481,50 @@ func (r *Registry) Pull(ctx context.Context, name string) error { t.update(l, 0, err) continue } + // TODO(bmizerany): fix this unbounded use of defer defer chunked.Close() var progress atomic.Int64 for cs, err := range r.chunksums(ctx, name, l) { if err != nil { + // Bad chunksums response, update tracing + // clients and then bail. t.update(l, progress.Load(), err) - break + return err } g.Go(func() (err error) { - defer func() { t.update(l, progress.Load(), err) }() - - for _, err := range backoff.Loop(ctx, 3*time.Second) { + defer func() { if err != nil { - return err + err = fmt.Errorf("error downloading %s: %w", cs.Digest.Short(), err) } - err := func() error { - req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil) - if err != nil { - return err - } - req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", cs.Chunk.Start, cs.Chunk.End)) - res, err := sendRequest(r.client(), req) - if err != nil { - return err - } - defer res.Body.Close() + t.update(l, progress.Load(), err) + }() - // Count bytes towards - // progress, as they arrive, so - // that our bytes piggyback - // other chunk updates on - // completion. - // - // This tactic is enough to - // show "smooth" progress given - // the current CLI client. In - // the near future, the server - // should report download rate - // since it knows better than - // a client that is measuring - // rate based on wall-clock - // time-since-last-update. - body := &trackingReader{r: res.Body, n: &progress} - - err = chunked.Put(cs.Chunk, cs.Digest, body) - if err != nil { - return err - } - - return nil - }() - if !canRetry(err) { - return err - } + req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil) + if err != nil { + return err } - return nil + req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", cs.Chunk.Start, cs.Chunk.End)) + res, err := sendRequest(r.client(), req) + if err != nil { + return err + } + defer res.Body.Close() + + // Count bytes towards progress, as they + // arrive, so that our bytes piggyback other + // chunk updates on completion. + // + // This tactic is enough to show "smooth" + // progress given the current CLI client. In + // the near future, the server should report + // download rate since it knows better than a + // client that is measuring rate based on + // wall-clock time-since-last-update. 
+ body := &trackingReader{r: res.Body, n: &progress} + + return chunked.Put(cs.Chunk, cs.Digest, body) }) } } @@ -554,13 +532,10 @@ func (r *Registry) Pull(ctx context.Context, name string) error { return err } - // store the manifest blob md := blob.DigestFromBytes(m.Data) if err := blob.PutBytes(c, md, m.Data); err != nil { return err } - - // commit the manifest with a link return c.Link(m.Name, md) } @@ -782,12 +757,15 @@ func (r *Registry) chunksums(ctx context.Context, name string, l *Layer) iter.Se } blobURL := res.Header.Get("Content-Location") + var size int64 s := bufio.NewScanner(res.Body) s.Split(bufio.ScanWords) for { if !s.Scan() { if s.Err() != nil { yield(chunksum{}, s.Err()) + } else if size != l.Size { + yield(chunksum{}, fmt.Errorf("size mismatch: layer size %d != sum of chunks %d", size, l.Size)) } return } @@ -811,6 +789,12 @@ func (r *Registry) chunksums(ctx context.Context, name string, l *Layer) iter.Se return } + size += chunk.Size() + if size > l.Size { + yield(chunksum{}, fmt.Errorf("chunk size %d exceeds layer size %d", size, l.Size)) + return + } + cs := chunksum{ URL: blobURL, Chunk: chunk, diff --git a/server/internal/client/ollama/registry_test.go b/server/internal/client/ollama/registry_test.go index 30fb58ab..30529543 100644 --- a/server/internal/client/ollama/registry_test.go +++ b/server/internal/client/ollama/registry_test.go @@ -17,6 +17,7 @@ import ( "reflect" "slices" "strings" + "sync" "testing" "time" @@ -56,21 +57,21 @@ func (rr recordRoundTripper) RoundTrip(req *http.Request) (*http.Response, error // newClient constructs a cache with predefined manifests for testing. The manifests are: // -// empty: no data -// zero: no layers -// single: one layer with the contents "exists" -// multiple: two layers with the contents "exists" and "here" -// notfound: a layer that does not exist in the cache -// null: one null layer (e.g. [null]) -// sizemismatch: one valid layer, and one with a size mismatch (file size is less than the reported size) -// invalid: a layer with invalid JSON data +// empty: no data +// zero: no layers +// single: one layer with the contents "exists" +// multiple: two layers with the contents "exists" and "here" +// notfound: a layer that does not exist in the cache +// null: one null layer (e.g. [null]) +// sizemismatch: one valid layer, and one with a size mismatch (file size is less than the reported size) +// invalid: a layer with invalid JSON data // // Tests that want to ensure the client does not communicate with the upstream // registry should pass a nil handler, which will cause a panic if // communication is attempted. // // To simulate a network error, pass a handler that returns a 499 status code. 
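//
// For example, a hypothetical test body might simulate that failure with:
//
//	rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) {
//		w.WriteHeader(499) // simulated network error, per the note above
//	})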
-func newClient(t *testing.T, h http.HandlerFunc) (*Registry, *blob.DiskCache) { +func newClient(t *testing.T, upstreamRegistry http.HandlerFunc) (*Registry, *blob.DiskCache) { t.Helper() c, err := blob.Open(t.TempDir()) @@ -88,7 +89,7 @@ func newClient(t *testing.T, h http.HandlerFunc) (*Registry, *blob.DiskCache) { r := &Registry{ Cache: c, HTTPClient: &http.Client{ - Transport: recordRoundTripper(h), + Transport: recordRoundTripper(upstreamRegistry), }, } @@ -767,3 +768,74 @@ func TestUnlink(t *testing.T) { } }) } + +func TestPullChunksums(t *testing.T) { + check := testutil.Checker(t) + + content := "hello" + var chunksums string + contentDigest := func() blob.Digest { + return blob.DigestFromBytes(content) + } + rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.Contains(r.URL.Path, "/manifests/latest"): + fmt.Fprintf(w, `{"layers":[{"digest":%q,"size":%d}]}`, contentDigest(), len(content)) + case strings.HasSuffix(r.URL.Path, "/chunksums/"+contentDigest().String()): + loc := fmt.Sprintf("http://blob.store/v2/library/test/blobs/%s", contentDigest()) + w.Header().Set("Content-Location", loc) + io.WriteString(w, chunksums) + case strings.Contains(r.URL.Path, "/blobs/"+contentDigest().String()): + http.ServeContent(w, r, contentDigest().String(), time.Time{}, strings.NewReader(content)) + default: + t.Errorf("unexpected request: %v", r) + http.NotFound(w, r) + } + }) + + rc.MaxStreams = 1 // prevent concurrent chunk downloads + rc.ChunkingThreshold = 1 // for all blobs to be chunked + + var mu sync.Mutex + var reads []int64 + ctx := WithTrace(t.Context(), &Trace{ + Update: func(l *Layer, n int64, err error) { + t.Logf("Update: %v %d %v", l, n, err) + mu.Lock() + reads = append(reads, n) + mu.Unlock() + }, + }) + + chunksums = fmt.Sprintf("%s 0-2\n%s 3-4\n", + blob.DigestFromBytes("hel"), + blob.DigestFromBytes("lo"), + ) + err := rc.Pull(ctx, "test") + check(err) + if !slices.Equal(reads, []int64{0, 3, 5}) { + t.Errorf("reads = %v; want %v", reads, []int64{0, 3, 5}) + } + + mw, err := rc.Resolve(t.Context(), "test") + check(err) + mg, err := rc.ResolveLocal("test") + check(err) + if !reflect.DeepEqual(mw, mg) { + t.Errorf("mw = %v; mg = %v", mw, mg) + } + for i := range mg.Layers { + _, err = c.Get(mg.Layers[i].Digest) + if err != nil { + t.Errorf("Get(%v): %v", mg.Layers[i].Digest, err) + } + } + + // missing chunks + content = "llama" + chunksums = fmt.Sprintf("%s 0-1\n", blob.DigestFromBytes("ll")) + err = rc.Pull(ctx, "missingchunks") + if err == nil { + t.Error("expected error because of missing chunks") + } +} From b078dd157cbca4ba31ce49128b0d8e1e4da99b39 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 19 Mar 2025 15:34:41 -0700 Subject: [PATCH 05/31] gemma2: Remove second call to Rows Looks like a merge conflict that broke the model. 
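The retained code applies the final logit soft-cap as Scale(1/cap), Tanh, Scale(cap), and the duplicate Rows(ctx, outputs) selection is dropped; per the subject line this was the second Rows call, so output row selection happens elsewhere in Forward. Below is a minimal scalar sketch of the same soft-cap math, where capValue stands in for m.Options.finalLogitSoftcap; it is illustrative Go only, not the ml.Tensor API used by the model.

package main

import (
	"fmt"
	"math"
)

// softcapLogits mirrors the op sequence kept by this patch:
// Scale(1/capValue) -> Tanh -> Scale(capValue), which smoothly clamps every
// logit into (-capValue, +capValue).
func softcapLogits(logits []float32, capValue float32) []float32 {
	out := make([]float32, len(logits))
	for i, l := range logits {
		out[i] = capValue * float32(math.Tanh(float64(l/capValue)))
	}
	return out
}

func main() {
	// Large logits saturate near +/-capValue; small ones pass through almost unchanged.
	fmt.Println(softcapLogits([]float32{-100, -1, 0, 1, 100}, 30))
}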
--- model/models/gemma2/model.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index fbefebe2..29ffa231 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -211,8 +211,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { // final logit softcap hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap)) hiddenState = hiddenState.Tanh(ctx) - hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap)) - return hiddenState.Rows(ctx, outputs), nil + return hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap)), nil } func init() { From f8c3dbe5b5ee342d97e4c71d684b85b00273c33d Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Thu, 20 Mar 2025 00:15:30 -0700 Subject: [PATCH 06/31] templates: add autotemplate for gemma3 (#9880) This change allows the gemma3 template to be autodetected during `ollama create`. --- server/model.go | 2 +- template/gemma3-instruct.gotmpl | 13 +++++++++++++ template/gemma3-instruct.json | 6 ++++++ template/index.json | 4 ++++ .../system-user-assistant-user | 10 ++++++++++ template/testdata/gemma3-instruct.gotmpl/user | 4 ++++ .../gemma3-instruct.gotmpl/user-assistant-user | 8 ++++++++ 7 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 template/gemma3-instruct.gotmpl create mode 100644 template/gemma3-instruct.json create mode 100644 template/testdata/gemma3-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/gemma3-instruct.gotmpl/user create mode 100644 template/testdata/gemma3-instruct.gotmpl/user-assistant-user diff --git a/server/model.go b/server/model.go index 4c82fb99..d0f503e2 100644 --- a/server/model.go +++ b/server/model.go @@ -82,7 +82,7 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) { for _, layer := range layers { if s := layer.GGML.KV().ChatTemplate(); s != "" { if t, err := template.Named(s); err != nil { - slog.Debug("template detection", "error", err) + slog.Debug("template detection", "error", err, "template", s) } else { layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") if err != nil { diff --git a/template/gemma3-instruct.gotmpl b/template/gemma3-instruct.gotmpl new file mode 100644 index 00000000..eb69d9f5 --- /dev/null +++ b/template/gemma3-instruct.gotmpl @@ -0,0 +1,13 @@ +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 }} +{{- if eq .Role "user" }}user +{{- if and (eq $i 1) $.System }} +{{ $.System }} +{{ end }} +{{ .Content }} +{{ else if eq .Role "assistant" }}model +{{ .Content }} +{{ end }} +{{- if $last }}model +{{ end }} +{{- end }} diff --git a/template/gemma3-instruct.json b/template/gemma3-instruct.json new file mode 100644 index 00000000..b1dac3fe --- /dev/null +++ b/template/gemma3-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "" + ], + "temperature": 0.1 +} diff --git a/template/index.json b/template/index.json index 7a27747c..5714665a 100644 --- a/template/index.json +++ b/template/index.json @@ -87,6 +87,10 @@ "template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = 
message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", "name": "gemma-instruct" }, + { + "template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'model\n'}}\n{%- endif -%}\n", + "name": "gemma3-instruct" + }, { "template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", "name": "llama3-instruct" diff --git a/template/testdata/gemma3-instruct.gotmpl/system-user-assistant-user b/template/testdata/gemma3-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..5453a1db --- /dev/null +++ b/template/testdata/gemma3-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,10 @@ +user +You are a helpful assistant. + +Hello, how are you? +model +I'm doing great. How can I help you today? +user +I'd like to show off how chat templating works! +model + diff --git a/template/testdata/gemma3-instruct.gotmpl/user b/template/testdata/gemma3-instruct.gotmpl/user new file mode 100644 index 00000000..fed1f5fa --- /dev/null +++ b/template/testdata/gemma3-instruct.gotmpl/user @@ -0,0 +1,4 @@ +user +Hello, how are you? +model + diff --git a/template/testdata/gemma3-instruct.gotmpl/user-assistant-user b/template/testdata/gemma3-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..ab15a071 --- /dev/null +++ b/template/testdata/gemma3-instruct.gotmpl/user-assistant-user @@ -0,0 +1,8 @@ +user +Hello, how are you? +model +I'm doing great. How can I help you today? +user +I'd like to show off how chat templating works! 
+model + From 42a14f7f633110ab83343848865d4612cfefb398 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Thu, 20 Mar 2025 11:11:18 -0700 Subject: [PATCH 07/31] sample: add error handling for empty logits (#9740) --- sample/samplers.go | 14 +++---- sample/samplers_test.go | 24 +++++++++++ sample/transforms_test.go | 88 +++++++++++++++++++++++++++++---------- 3 files changed, 97 insertions(+), 29 deletions(-) diff --git a/sample/samplers.go b/sample/samplers.go index 7c12da08..ef803369 100644 --- a/sample/samplers.go +++ b/sample/samplers.go @@ -26,6 +26,10 @@ type Sampler struct { } func (s *Sampler) Sample(logits []float32) (int32, error) { + if len(logits) == 0 { + return -1, errors.New("sample: no logits provided to sample") + } + tokens := make([]token, len(logits)) for i := range logits { tokens[i].id = int32(i) @@ -94,13 +98,6 @@ func (s *Sampler) sample(tokens []token) (token, error) { tokens = topP(tokens, s.topP) tokens = minP(tokens, s.minP) - // TODO: this should fall back to greedy sampling - // or topP, topK values etc should be such that - // there are always tokens to sample from - if len(tokens) == 0 { - return token{}, errors.New("no tokens to sample from") - } - var r float32 if s.rng != nil { r = s.rng.Float32() @@ -123,6 +120,9 @@ func (s *Sampler) sample(tokens []token) (token, error) { return 1 }) + if math.IsNaN(float64(sum)) { + return token{}, errors.New("sample: logits sum to NaN, check model output") + } return tokens[idx], nil } diff --git a/sample/samplers_test.go b/sample/samplers_test.go index 38b9b352..d79dce47 100644 --- a/sample/samplers_test.go +++ b/sample/samplers_test.go @@ -1,6 +1,7 @@ package sample import ( + "math" "math/rand/v2" "testing" ) @@ -29,6 +30,29 @@ func TestWeighted(t *testing.T) { if want != got { t.Errorf("index mismatch: want %d, got %d", want, got) } + + // Test very high p + logits = []float32{1.0, 0.9999999999999999, 0.5, 0.1} + // Use extremely small topP to filter out all tokens + sampler = NewSampler(1.0, 0, 1e-10, 0, 0, nil) + got, err = sampler.Sample(logits) + if err != nil { + t.Error(err) + return + } + // Should get the token with the highest logit + want = int32(0) + if want != got { + t.Errorf("index mismatch: want %d, got %d", want, got) + } + + logits = []float32{float32(math.NaN()), float32(math.NaN()), float32(math.NaN())} + sampler = NewSampler(1, 0, 0.95, 0.05, 0, nil) + got, err = sampler.Sample(logits) + if err == nil { + t.Errorf("expected error, got %d", got) + return + } } func BenchmarkSample(b *testing.B) { diff --git a/sample/transforms_test.go b/sample/transforms_test.go index 7faf30a5..5307c5f8 100644 --- a/sample/transforms_test.go +++ b/sample/transforms_test.go @@ -168,27 +168,53 @@ func TestTopP(t *testing.T) { softmax(tokens) tokens = topK(tokens, 20) - // Then apply topP - tokens = topP(tokens, 0.95) + // Test with very high p value + got := topP(tokens, 1.0) - // Should keep tokens until cumsum > 0.95 - if len(tokens) > 3 { + // Should keep all tokens since p is 1 + if len(got) != len(input) { + t.Errorf("topP(1.0): should keep all tokens, got %d, want %d", len(got), len(input)) + } + + // Test with normal p value + got = topP(tokens, 0.95) + + if len(got) > 3 { t.Errorf("topP(0.95): kept too many tokens: got %d", len(tokens)) - t.Logf("got: %v", tokens) + t.Logf("got: %v", got) } // Test edge case - ensure at least one token remains - input = []float32{-1e6, -1e6, -1e6} // One dominant token + input = []float32{-1e6, -1e6, -1e7} tokens = toTokens(input) + tokens = topK(tokens, 20) softmax(tokens) 
- tokens = topP(tokens, 0.0) // Very small p - if len(tokens) < 1 { + got = topP(tokens, 0.0) + if len(got) < 1 { t.Error("topP should keep at least one token") } + + // Test with zero p value + got = topP(tokens, 0.0) + + // Should keep only the highest probability token + if len(got) != 1 { + t.Errorf("topP(0.0): should keep only one token, got %d", len(got)) + t.Logf("got: %v", got) + } + + tokens = toTokens(input) + tokens = topK(tokens, 20) + softmax(tokens) + got = topP(tokens, 1e-10) + if len(got) == 0 { + t.Errorf("topP(1e-10): should keep at least one token, got %d", len(got)) + t.Logf("got: %v", got) + } } func TestMinP(t *testing.T) { - input := []float32{-3, -2, -1, 0, 1, 2, 4, 3} + input := []float32{-2, 0, -1, -3, 2, 1, 4, 3} tokens := toTokens(input) // First apply temperature and softmax @@ -225,30 +251,48 @@ func TestMinP(t *testing.T) { t.Logf("got: %v", tokens) } + // Test with single token + tokens = toTokens(input[:1]) + tokens = topK(tokens, 20) + softmax(tokens) + tokens = minP(tokens, 0.1) + + // Should keep only the highest probability token + if len(tokens) != 1 { + t.Errorf("minP(0.1): should return single token, got %d", len(tokens)) + t.Logf("got: %v", tokens) + } + input = []float32{1e-10, 1e-10, 1e-10} tokens = toTokens(input) softmax(tokens) tokens = minP(tokens, 1.0) if len(tokens) < 1 { t.Error("minP should keep at least one token even with extreme probabilities") - } -} + got := minP(tokens, 1.0) -func TestSortLogits(t *testing.T) { - input := []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367} - tokens := toTokens(input) + if len(got) != 1 { + t.Errorf("minP(1.0): should keep all tokens, got %d, want %d", len(got), len(tokens)) + } - tokens = topK(tokens, 20) + // Test with normal p value + got = minP(tokens, 0.2) - for i := 1; i < len(tokens); i++ { - if tokens[i].value > tokens[i-1].value { - t.Errorf("sortLogits: tokens not sorted in descending order at index %d: %f > %f", - i, tokens[i].value, tokens[i-1].value) + // Should keep tokens with prob >= 0.2 * max_prob + if len(got) > 3 { + t.Errorf("minP(0.2): kept too many tokens: got %d", len(got)) + t.Logf("got: %v", got) + } + + // Test with zero p value + got = minP(tokens, 0.0) + + // Should keep only the highest probability token + if len(got) != len(tokens) { + t.Errorf("minP(0.0): should keep only one token, got %d", len(got)) + t.Logf("got: %v", got) } } - - want := []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839} - compareLogits(t, "sortLogits", want, tokens) } func BenchmarkTransforms(b *testing.B) { From ffbfe833da387f9b6806fe887b85992c11d26eaa Mon Sep 17 00:00:00 2001 From: rylativity <41017744+rylativity@users.noreply.github.com> Date: Thu, 20 Mar 2025 16:11:17 -0400 Subject: [PATCH 08/31] parser: remove role validation from Modelfile parser (#9874) * updates parser/parser.go to allow arbitrary roles in Modelfile MESSAGE blocks --- parser/parser.go | 18 +++++++----------- parser/parser_test.go | 10 +++++----- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 6832351f..0a32d571 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "io" + "log/slog" "net/http" "os" "os/user" @@ -300,9 +301,8 @@ const ( ) var ( - errMissingFrom = errors.New("no FROM line") - errInvalidMessageRole = 
errors.New("message role must be one of \"system\", \"user\", or \"assistant\"") - errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"") + errMissingFrom = errors.New("no FROM line") + errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"") ) type ParserError struct { @@ -379,14 +379,10 @@ func ParseFile(r io.Reader) (*Modelfile, error) { case stateParameter: cmd.Name = b.String() case stateMessage: - if !isValidMessageRole(b.String()) { - return nil, &ParserError{ - LineNumber: currLine, - Msg: errInvalidMessageRole.Error(), - } - } - role = b.String() + if !isKnownMessageRole(b.String()) { + slog.Warn("received non-standard role", "role", role) + } case stateComment, stateNil: // pass case stateValue: @@ -556,7 +552,7 @@ func isNewline(r rune) bool { return r == '\r' || r == '\n' } -func isValidMessageRole(role string) bool { +func isKnownMessageRole(role string) bool { return role == "system" || role == "user" || role == "assistant" } diff --git a/parser/parser_test.go b/parser/parser_test.go index 097c058f..c4f8f4aa 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -256,13 +256,13 @@ You are a multiline file parser. Always parse things. { ` FROM foo -MESSAGE badguy I'm a bad guy! +MESSAGE somerandomrole I'm ok with you adding any role message now! `, - nil, - &ParserError{ - LineNumber: 3, - Msg: errInvalidMessageRole.Error(), + []Command{ + {Name: "model", Args: "foo"}, + {Name: "message", Args: "somerandomrole: I'm ok with you adding any role message now!"}, }, + nil, }, { ` From 0c220935bd9e23339af1c8f943f5418cbe1b705b Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 19 Mar 2025 14:28:15 -0700 Subject: [PATCH 09/31] input: Rename Options to Batch Options is no longer very descriptive of this struct. --- kvcache/cache.go | 2 +- kvcache/causal.go | 12 ++++++------ kvcache/causal_test.go | 2 +- kvcache/encoder.go | 6 +++--- kvcache/wrapper.go | 8 ++++---- model/input/input.go | 25 +++++++++++++++++++------ model/model.go | 14 +++++++------- model/model_test.go | 2 +- model/models/gemma2/model.go | 8 ++++---- model/models/gemma3/model.go | 10 +++++----- model/models/gemma3/model_text.go | 4 ++-- model/models/llama/model.go | 8 ++++---- model/models/mllama/model.go | 12 ++++++------ runner/ollamarunner/runner.go | 20 ++++++++++---------- 14 files changed, 73 insertions(+), 60 deletions(-) diff --git a/kvcache/cache.go b/kvcache/cache.go index d3548905..aa0a2056 100644 --- a/kvcache/cache.go +++ b/kvcache/cache.go @@ -52,7 +52,7 @@ type Cache interface { // StartForward is called before the start of the model's forward pass. // For each token in the coming batch, there must be a corresponding // entry in positions and seqs. 
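	// ("positions" are each token's offset within its own sequence, and "seqs"
	// identify which sequence each token belongs to; both now arrive on the
	// input.Batch value passed in.)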
- StartForward(ctx ml.Context, opts input.Options) error + StartForward(ctx ml.Context, batch input.Batch) error // CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq CopyPrefix(srcSeq, dstSeq int, len int32) diff --git a/kvcache/causal.go b/kvcache/causal.go index edf6666d..79fa24e8 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -140,10 +140,10 @@ func (c *Causal) Close() { } } -func (c *Causal) StartForward(ctx ml.Context, opts input.Options) error { - c.curBatchSize = len(opts.Positions) - c.curSequences = opts.Sequences - c.curPositions = opts.Positions +func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error { + c.curBatchSize = len(batch.Positions) + c.curSequences = batch.Sequences + c.curPositions = batch.Positions c.opts.Except = nil var err error @@ -157,8 +157,8 @@ func (c *Causal) StartForward(ctx ml.Context, opts input.Options) error { } c.curCellRange = newRange() - for i, pos := range opts.Positions { - seq := opts.Sequences[i] + for i, pos := range batch.Positions { + seq := batch.Sequences[i] c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}} diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 56d85ceb..0f2385db 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -270,7 +270,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase) context := backend.NewContext() defer context.Close() - err := cache.StartForward(context, input.Options{Positions: test.pos, Sequences: test.seqs}) + err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs}) if err != nil { panic(err) } diff --git a/kvcache/encoder.go b/kvcache/encoder.go index 6a9df2ab..94c5d99c 100644 --- a/kvcache/encoder.go +++ b/kvcache/encoder.go @@ -79,10 +79,10 @@ func (c *EncoderCache) Close() { } } -func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error { +func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error { // We work with the most recent image - if len(opts.Multimodal) > 0 { - c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index] + if len(batch.Multimodal) > 0 { + c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index] } return nil diff --git a/kvcache/wrapper.go b/kvcache/wrapper.go index aaccd166..c85807a0 100644 --- a/kvcache/wrapper.go +++ b/kvcache/wrapper.go @@ -41,14 +41,14 @@ func (c *WrapperCache) Close() { } } -func (c *WrapperCache) StartForward(ctx ml.Context, opts input.Options) error { +func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch) error { for i, cache := range c.caches { - err := cache.StartForward(ctx, opts) + err := cache.StartForward(ctx, batch) if err != nil { // unwind on error - Remove with endIndex set to math.MaxInt32 does not fail for j := i - 1; j >= 0; j-- { - for k := range opts.Positions { - _ = c.caches[j].Remove(opts.Sequences[k], opts.Positions[k], math.MaxInt32) + for k := range batch.Positions { + _ = c.caches[j].Remove(batch.Sequences[k], batch.Positions[k], math.MaxInt32) } } return err diff --git a/model/input/input.go b/model/input/input.go index 30bdcf06..ce43efb5 100644 --- a/model/input/input.go +++ b/model/input/input.go @@ -33,11 +33,24 @@ type MultimodalIndex struct { Multimodal any } -// Options contains the inputs for a model forward pass -type Options struct { - Inputs []int32 +// Batch contains the inputs for a model forward pass +type Batch struct { + // Inputs is the input tokens, including placeholders for 
multimodal inputs. + Inputs []int32 + + // Multimodal is a set of multimodal embeddings previously created by + // EncodeMultimodal, along with an index into Inputs. Unused for text-only + // models or for batches without multimodal elements. Multimodal []MultimodalIndex - Positions []int32 - Sequences []int - Outputs []int32 + + // Positions is the position for each Input, relative to its sequence. Equal + // in length to Inputs. + Positions []int32 + + // Sequences is the sequence for each Input. Equal in length to Inputs. + Sequences []int + + // Outputs are the set of indicies into Inputs for which output data should + // be returned. + Outputs []int32 } diff --git a/model/model.go b/model/model.go index 53e47add..94156ae2 100644 --- a/model/model.go +++ b/model/model.go @@ -26,7 +26,7 @@ var ErrNoVisionModel = errors.New("this model is missing data required for image // Model implements a specific model architecture, defining the forward pass and any model-specific configuration type Model interface { - Forward(ml.Context, input.Options) (ml.Tensor, error) + Forward(ml.Context, input.Batch) (ml.Tensor, error) Backend() ml.Backend Config() config @@ -280,24 +280,24 @@ func canNil(t reflect.Type) bool { t.Kind() == reflect.Slice } -func Forward(ctx ml.Context, m Model, opts input.Options) (ml.Tensor, error) { - if len(opts.Positions) != len(opts.Sequences) { - return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(opts.Positions), len(opts.Sequences)) +func Forward(ctx ml.Context, m Model, batch input.Batch) (ml.Tensor, error) { + if len(batch.Positions) != len(batch.Sequences) { + return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(batch.Positions), len(batch.Sequences)) } - if len(opts.Positions) < 1 { + if len(batch.Positions) < 1 { return nil, errors.New("batch size cannot be less than 1") } cache := m.Config().Cache if cache != nil { - err := cache.StartForward(ctx, opts) + err := cache.StartForward(ctx, batch) if err != nil { return nil, err } } - t, err := m.Forward(ctx, opts) + t, err := m.Forward(ctx, batch) if err != nil { return nil, err } diff --git a/model/model_test.go b/model/model_test.go index 354dd1d8..0b1ea08e 100644 --- a/model/model_test.go +++ b/model/model_test.go @@ -163,7 +163,7 @@ func TestGetTextProcessor(t *testing.T) { type notTextProcessorModel struct{} -func (notTextProcessorModel) Forward(ml.Context, input.Options) (ml.Tensor, error) { +func (notTextProcessorModel) Forward(ml.Context, input.Batch) (ml.Tensor, error) { panic("unimplemented") } diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index 29ffa231..2b347d72 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -168,18 +168,18 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten return hiddenState.Add(ctx, residual) } -func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { - inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs)) +func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) if err != nil { return nil, err } - positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions)) + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err } - outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := 
ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) if err != nil { return nil, err } diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index 95f89ad4..900bf31e 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -139,23 +139,23 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { return result, nil } -func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { - inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs)) +func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) if err != nil { return nil, err } - positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions)) + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err } - outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) if err != nil { return nil, err } - return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil + return m.TextModel.Forward(ctx, inputs, positions, outputs, batch, m.Cache), nil } func init() { diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index 567f65a5..7d8b6577 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -171,13 +171,13 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs, return hiddenState.Add(ctx, residual) } -func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, opts input.Options, cache kvcache.Cache) ml.Tensor { +func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor { hiddenState := m.TokenEmbedding.Forward(ctx, inputs) hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextOptions.hiddenSize))) // set image embeddings var except []int - for _, image := range opts.Multimodal { + for _, image := range batch.Multimodal { visionOutputs := image.Multimodal.(ml.Tensor) ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1)))) diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 87eb9b75..e5ecd29e 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -139,18 +139,18 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten return hiddenState.Add(ctx, residual) } -func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { - inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs)) +func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) if err != nil { return nil, err } - positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions)) + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err } - outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) if err != nil { return nil, err } diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 0aa11f17..6d9c608e 100644 --- a/model/models/mllama/model.go +++ 
b/model/models/mllama/model.go @@ -135,26 +135,26 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { return inputs, nil } -func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) { +func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var crossAttentionStates ml.Tensor - if len(opts.Multimodal) > 0 { - images := opts.Multimodal[len(opts.Multimodal)-1].Multimodal.([]ml.Tensor) + if len(batch.Multimodal) > 0 { + images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) if len(images) > 0 { crossAttentionStates = images[len(images)-1] } } - inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs)) + inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) if err != nil { return nil, err } - positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions)) + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err } - outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs)) + outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) if err != nil { return nil, err } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 9a1a549c..91463f93 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -348,7 +348,7 @@ func (s *Server) processBatch() error { } defer s.mu.Unlock() - var options input.Options + var batch input.Batch for i, seq := range s.seqs { if seq == nil { @@ -395,17 +395,17 @@ func (s *Server) processBatch() error { } } - options.Inputs = append(options.Inputs, inp.Token) + batch.Inputs = append(batch.Inputs, inp.Token) if inp.Multimodal != nil { - options.Multimodal = append(options.Multimodal, input.MultimodalIndex{Index: len(options.Inputs) - 1, Multimodal: inp.Multimodal}) + batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batch.Inputs) - 1, Multimodal: inp.Multimodal}) } - options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs))) - options.Sequences = append(options.Sequences, seq.cache.Id) + batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs))) + batch.Sequences = append(batch.Sequences, seq.cache.Id) - seq.iBatch = len(options.Outputs) + seq.iBatch = len(batch.Outputs) if j+1 == len(seq.inputs) { - options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1)) + batch.Outputs = append(batch.Outputs, int32(len(batch.Inputs)-1)) } seq.pendingInputs = append(seq.pendingInputs, inp) } @@ -413,14 +413,14 @@ func (s *Server) processBatch() error { seq.inputs = seq.inputs[len(seq.pendingInputs):] } - if len(options.Inputs) == 0 { + if len(batch.Inputs) == 0 { return nil } ctx := s.model.Backend().NewContext() defer ctx.Close() - modelOutput, err := model.Forward(ctx, s.model, options) + modelOutput, err := model.Forward(ctx, s.model, batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) } @@ -460,7 +460,7 @@ func (s *Server) processBatch() error { } // sample a token - vocabSize := len(logits) / len(options.Outputs) + vocabSize := len(logits) / len(batch.Outputs) token, err := seq.sampler.Sample(logits[seq.iBatch*vocabSize : (seq.iBatch+1)*vocabSize]) if err != nil { From 0fbfcf3c9c7bfdbf4616238595eafd7eca2a916c Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 19 Mar 2025 14:36:21 -0700 Subject: [PATCH 10/31] model: Pass input tensor instead of raw data to 
models Rather than directly giving the input data to models, we can pass a tensor instead. In the short term, this saves some duplicated code. Longer term, we will want to overlap setting up the next batch with processing of the current one. In this case, we will only have the shape of tensor but it will not be loaded with data at the time of graph generation. By passing only a tensor to models now, we set up this possibility and prevent them from relying on data that they won't have in the future. Although the same could be done for Positions and Outputs, in some cases we either need the raw input data or don't use them at all. Therefore, for now we leave them as they are and allow models to convert them to tensors as needed. --- model/input/input.go | 4 +++- model/model.go | 8 +++++++- model/models/gemma2/model.go | 7 +------ model/models/gemma3/model.go | 7 +------ model/models/llama/model.go | 7 +------ model/models/mllama/model.go | 7 +------ runner/ollamarunner/runner.go | 11 ++++++----- 7 files changed, 20 insertions(+), 31 deletions(-) diff --git a/model/input/input.go b/model/input/input.go index ce43efb5..d66f52a0 100644 --- a/model/input/input.go +++ b/model/input/input.go @@ -1,5 +1,7 @@ package input +import "github.com/ollama/ollama/ml" + // Input represents one token in the input stream type Input struct { // Token is a single element of text. @@ -36,7 +38,7 @@ type MultimodalIndex struct { // Batch contains the inputs for a model forward pass type Batch struct { // Inputs is the input tokens, including placeholders for multimodal inputs. - Inputs []int32 + Inputs ml.Tensor // Multimodal is a set of multimodal embeddings previously created by // EncodeMultimodal, along with an index into Inputs. Unused for text-only diff --git a/model/model.go b/model/model.go index 94156ae2..ab29916a 100644 --- a/model/model.go +++ b/model/model.go @@ -280,7 +280,7 @@ func canNil(t reflect.Type) bool { t.Kind() == reflect.Slice } -func Forward(ctx ml.Context, m Model, batch input.Batch) (ml.Tensor, error) { +func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Tensor, error) { if len(batch.Positions) != len(batch.Sequences) { return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(batch.Positions), len(batch.Sequences)) } @@ -289,6 +289,12 @@ func Forward(ctx ml.Context, m Model, batch input.Batch) (ml.Tensor, error) { return nil, errors.New("batch size cannot be less than 1") } + var err error + batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs)) + if err != nil { + return nil, err + } + cache := m.Config().Cache if cache != nil { err := cache.StartForward(ctx, batch) diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index 2b347d72..67c69ee8 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -169,11 +169,6 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) - if err != nil { - return nil, err - } - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err @@ -184,7 +179,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { return nil, err } - hiddenState := m.TokenEmbedding.Forward(ctx, inputs) + hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) hiddenState = hiddenState.Scale(ctx, 
math.Sqrt(float64(m.Options.hiddenSize))) if len(m.Layers) == gemma27BLayerCount { diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index 900bf31e..567ad1a4 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -140,11 +140,6 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) - if err != nil { - return nil, err - } - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err @@ -155,7 +150,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { return nil, err } - return m.TextModel.Forward(ctx, inputs, positions, outputs, batch, m.Cache), nil + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil } func init() { diff --git a/model/models/llama/model.go b/model/models/llama/model.go index e5ecd29e..5c173997 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -140,11 +140,6 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) - if err != nil { - return nil, err - } - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err @@ -155,7 +150,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { return nil, err } - hiddenState := m.TokenEmbedding.Forward(ctx, inputs) + hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) for i, layer := range m.Layers { m.Cache.SetLayer(i) diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 6d9c608e..988a189d 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -144,11 +144,6 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } } - inputs, err := ctx.Input().FromIntSlice(batch.Inputs, len(batch.Inputs)) - if err != nil { - return nil, err - } - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) if err != nil { return nil, err @@ -160,7 +155,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } // TODO: attention mask, cross attention mask - return m.TextModel.Forward(ctx, inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil } func init() { diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 91463f93..443b34b0 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -348,6 +348,7 @@ func (s *Server) processBatch() error { } defer s.mu.Unlock() + var batchInputs []int32 var batch input.Batch for i, seq := range s.seqs { @@ -395,9 +396,9 @@ func (s *Server) processBatch() error { } } - batch.Inputs = append(batch.Inputs, inp.Token) + batchInputs = append(batchInputs, inp.Token) if inp.Multimodal != nil { - batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batch.Inputs) - 1, Multimodal: inp.Multimodal}) + batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: 
inp.Multimodal}) } batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs))) @@ -405,7 +406,7 @@ func (s *Server) processBatch() error { seq.iBatch = len(batch.Outputs) if j+1 == len(seq.inputs) { - batch.Outputs = append(batch.Outputs, int32(len(batch.Inputs)-1)) + batch.Outputs = append(batch.Outputs, int32(len(batchInputs)-1)) } seq.pendingInputs = append(seq.pendingInputs, inp) } @@ -413,14 +414,14 @@ func (s *Server) processBatch() error { seq.inputs = seq.inputs[len(seq.pendingInputs):] } - if len(batch.Inputs) == 0 { + if len(batchInputs) == 0 { return nil } ctx := s.model.Backend().NewContext() defer ctx.Close() - modelOutput, err := model.Forward(ctx, s.model, batch) + modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) } From d3e9ca3eda5585204358f4ba23b3c7cb23bdb3e2 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 20 Mar 2025 14:27:17 -0700 Subject: [PATCH 11/31] kvcache: Account for source tensors in defrag operation count Defragging the KV cache can generate a lot of operations, so we need to be careful that we don't overflow the number that the graph can support. We currently account for all of the nodes that we add to the graph for each move but we also need to include the original cache tensors as well. Fixes #9904 --- kvcache/causal.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index 79fa24e8..e5216d58 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -321,7 +321,8 @@ func (c *Causal) defrag() { ctx := c.backend.NewContext() // For every move, 6 tensors are required per layer (2 views and a - // copy for each of k and v). + // copy for each of k and v). We also need to refer to the original + // k and v cache tensors - once per layer, not per move. layers := 0 for _, key := range c.keys { if key == nil { @@ -330,7 +331,7 @@ func (c *Causal) defrag() { layers++ } - maxMoves := ctx.MaxGraphNodes() / (6 * layers) + maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers) moves := 0 var pendingSrc, pendingDst, pendingLen int From 0ff28758b3a5e9dc0149a93d87677dd0585590c1 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 20 Mar 2025 10:35:19 -0700 Subject: [PATCH 12/31] ollamarunner: Provide mechanism for backends to report loading progress This enables the runner to report progress back to the Ollama server, both for showing status to the user and also to prevent the server from killing the runner if it thinks things have stalled. Most of the infrastructure was already there, this extends it to be available to the backends. 
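The Progress field added to ml.BackendParams below gives backends a way to call back into the runner as weights load. A minimal sketch of how a backend might use it; the loadTensors loop, tensor names, and nil check are illustrative assumptions, not the actual ggml backend code.

package main

import "fmt"

// BackendParams carries only the Progress callback added by this patch.
type BackendParams struct {
	// Progress reports fractional completion of model loading (0.0 to 1.0).
	Progress func(float32)
}

// loadTensors is a hypothetical loader that reports progress after each
// tensor it reads, guarding against callers that leave Progress unset.
func loadTensors(params BackendParams, tensorNames []string) {
	for i, name := range tensorNames {
		_ = name // read and place the tensor here
		if params.Progress != nil {
			params.Progress(float32(i+1) / float32(len(tensorNames)))
		}
	}
}

func main() {
	// Mirrors the runner wiring below, which stores the value in server.progress.
	loadTensors(BackendParams{
		Progress: func(p float32) { fmt.Printf("loading: %3.0f%%\n", p*100) },
	}, []string{"token_embd", "blk.0.attn_q", "output"})
}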
--- ml/backend.go | 4 ++++ runner/ollamarunner/runner.go | 3 +++ 2 files changed, 7 insertions(+) diff --git a/ml/backend.go b/ml/backend.go index c63c73d4..66eb37f7 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -60,6 +60,10 @@ type CacheConfig struct { // BackendParams controls how the backend loads and executes models type BackendParams struct { + // Progress is a callback function that allows reporting percentage completion + // of model loading + Progress func(float32) + // NumThreads sets the number of threads to use if running on the CPU NumThreads int diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 443b34b0..90eb0de6 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -783,6 +783,9 @@ func Execute(args []string) error { } params := ml.BackendParams{ + Progress: func(progress float32) { + server.progress = progress + }, NumThreads: *threads, NumGPULayers: *numGPULayers, MainGPU: *mainGPU, From 6d1103048eac63f27148d6d8fe47c98cbb6f184f Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 21 Mar 2025 11:13:54 -0700 Subject: [PATCH 13/31] fix: show correct bool value for kv in verbose show information (#9928) --- cmd/cmd.go | 2 ++ cmd/cmd_test.go | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index 710f49a7..abb4806b 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -703,6 +703,8 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error { for _, k := range keys { var v string switch vData := resp.ModelInfo[k].(type) { + case bool: + v = fmt.Sprintf("%t", vData) case string: v = vData case float64: diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go index 41b03e1b..ea3bdffe 100644 --- a/cmd/cmd_test.go +++ b/cmd/cmd_test.go @@ -87,6 +87,8 @@ func TestShowInfo(t *testing.T) { ModelInfo: map[string]any{ "general.architecture": "test", "general.parameter_count": float64(8_000_000_000), + "some.true_bool": true, + "some.false_bool": false, "test.context_length": float64(1000), "test.embedding_length": float64(11434), }, @@ -111,6 +113,8 @@ func TestShowInfo(t *testing.T) { Metadata general.architecture test general.parameter_count 8e+09 + some.false_bool false + some.true_bool true test.context_length 1000 test.embedding_length 11434 From 3ed7ad3ab32b458aa2fdb8d0144c546efdb26a72 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 18 Mar 2025 14:31:52 -0700 Subject: [PATCH 14/31] kvcache: Pass granular cache size into implementations Currently the runner computes the kv size needed and creates a cache of that size. This is the context size times number of parallel sequences. Cache implementations can make better decisions about their memory usage, so instead pass in the required capacity, number of sequences and maximum batch size. For now, the causal cache just uses this to compute the size in the same way as before. --- kvcache/cache.go | 9 +++++++-- kvcache/causal.go | 33 +++++++++++++++++---------------- kvcache/causal_test.go | 12 ++++++------ kvcache/encoder.go | 6 +++++- kvcache/wrapper.go | 4 ++-- runner/ollamarunner/cache.go | 10 ++++++---- runner/ollamarunner/runner.go | 2 +- 7 files changed, 44 insertions(+), 32 deletions(-) diff --git a/kvcache/cache.go b/kvcache/cache.go index aa0a2056..18aec800 100644 --- a/kvcache/cache.go +++ b/kvcache/cache.go @@ -43,8 +43,13 @@ type Cache interface { // ** cache management ** - // Init sets up runtime parameters - Init(backend ml.Backend, dtype ml.DType, capacity int32) + // Init sets up runtime parameters. 
+ // backend: Used to allocate cache data storage and execute management operations (such as defrag) + // dtype: The data type for storing cache entries + // maxSequences: The maximum number of sequences stored in the cache - across all batches + // capacity: The number of cache entries to store, per sequence + // maxBatch: The maximum number of tokens that can occur in a single batch + Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) // Close closes the cache and frees resources associated with it Close() diff --git a/kvcache/causal.go b/kvcache/causal.go index e5216d58..ced409c3 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -20,7 +20,6 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e // The mask is of shape history size, batch size type Causal struct { DType ml.DType - Capacity int32 windowSize int32 opts CausalOptions @@ -98,7 +97,7 @@ func NewSWACache(windowSize int32, shift shiftFn) *Causal { } } -func (c *Causal) Init(backend ml.Backend, dtype ml.DType, capacity int32) { +func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) { if c.config == nil { var config ml.CacheConfig if cc, ok := backend.(ml.BackendCacheConfig); ok { @@ -119,9 +118,11 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, capacity int32) { c.config.MaskDType = ml.DTypeF32 } + cacheSize := maxSequences * capacity + cacheSize = roundUp(cacheSize, c.config.CachePadding) + c.cells = make([]cacheCell, cacheSize) + c.DType = dtype - c.Capacity = int32(roundUp(int(capacity), c.config.CachePadding)) - c.cells = make([]cacheCell, c.Capacity) c.cellRanges = make(map[int]cellRange) c.backend = backend } @@ -210,7 +211,7 @@ func (c *Causal) findStartLoc() (int, error) { } } - return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity) + return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells)) } func roundDown(length, pad int) int { @@ -265,7 +266,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) { return maskTensor, nil } -func (c *Causal) moveCells(ctx ml.Context, src, dst, len int) { +func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) { for i, key := range c.keys { if key == nil { continue @@ -275,8 +276,8 @@ func (c *Causal) moveCells(ctx ml.Context, src, dst, len int) { numKVHeads := key.Dim(1) rowSize := key.Stride(2) - kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*len) - kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*len) + kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length) + kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length) value := c.values[i] var vSrcView, vDstView ml.Tensor @@ -284,14 +285,14 @@ func (c *Causal) moveCells(ctx ml.Context, src, dst, len int) { vHeadDim := value.Dim(1) elemSize := value.Stride(0) - vSrcView = value.View(ctx, elemSize*src, len, int(c.Capacity)*elemSize, vHeadDim*numKVHeads) - vDstView = value.View(ctx, elemSize*dst, len, int(c.Capacity)*elemSize, vHeadDim*numKVHeads) + vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads) + vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads) } else { vHeadDim := value.Dim(0) rowSize := value.Stride(2) - vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*len) - vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*len) + vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length) + vDstView = value.View(ctx, 
rowSize*dst, vHeadDim*numKVHeads*length) } ctx.Forward( @@ -480,14 +481,14 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) { } if _, ok := c.keys[c.curLayer]; !ok { - c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, int(c.Capacity)) + c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, len(c.cells)) } if _, ok := c.values[c.curLayer]; !ok { if c.config.PermutedV { - c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, int(c.Capacity), vHeadDim, numKVHeads) + c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, len(c.cells), vHeadDim, numKVHeads) } else { - c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, int(c.Capacity)) + c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, len(c.cells)) } } @@ -498,7 +499,7 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) { elemSize := c.values[c.curLayer].Stride(0) value = value.Permute(ctx, 1, 2, 0, 3) - ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, int(c.Capacity)*elemSize, vHeadDim*numKVHeads))) + ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads))) } else { rowSize := c.values[c.curLayer].Stride(2) diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 0f2385db..66a2e835 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -25,7 +25,7 @@ func TestStore(t *testing.T) { cache := NewCausalCache(nil) defer cache.Close() - cache.Init(backend, ml.DTypeF16, 16) + cache.Init(backend, ml.DTypeF16, 1, 16, 16) tests := []testCase{ { @@ -58,7 +58,7 @@ func TestSWA(t *testing.T) { cache := NewSWACache(1, nil) defer cache.Close() - cache.Init(backend, ml.DTypeF32, 16) + cache.Init(backend, ml.DTypeF32, 1, 16, 16) tests := []testCase{ { @@ -81,7 +81,7 @@ func TestSequences(t *testing.T) { cache := NewCausalCache(nil) defer cache.Close() - cache.Init(backend, ml.DTypeF16, 16) + cache.Init(backend, ml.DTypeF16, 1, 16, 16) tests := []testCase{ { @@ -116,7 +116,7 @@ func TestRemove(t *testing.T) { }) defer cache.Close() - cache.Init(backend, ml.DTypeF16, 16) + cache.Init(backend, ml.DTypeF16, 1, 16, 16) tests := []testCase{ { @@ -181,7 +181,7 @@ func TestDefrag(t *testing.T) { }) defer cache.Close() - cache.Init(backend, ml.DTypeF16, 16) + cache.Init(backend, ml.DTypeF16, 1, 16, 16) tests := []testCase{ { @@ -229,7 +229,7 @@ func TestCopy(t *testing.T) { cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil }) defer cache.Close() - cache.Init(backend, ml.DTypeF16, 16) + cache.Init(backend, ml.DTypeF16, 1, 16, 16) tests := []testCase{ { diff --git a/kvcache/encoder.go b/kvcache/encoder.go index 94c5d99c..07ff4291 100644 --- a/kvcache/encoder.go +++ b/kvcache/encoder.go @@ -49,7 +49,7 @@ func NewEncoderCache() *EncoderCache { } } -func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) { +func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) { if c.config == nil { var config ml.CacheConfig if cc, ok := backend.(ml.BackendCacheConfig); ok { @@ -58,6 +58,10 @@ func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) c.config = &config } + if maxSequences > 1 { + panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences)) + } + if c.config.CachePadding != 0 && c.config.CachePadding != 1 { 
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding)) } diff --git a/kvcache/wrapper.go b/kvcache/wrapper.go index c85807a0..0e8ff1f3 100644 --- a/kvcache/wrapper.go +++ b/kvcache/wrapper.go @@ -23,9 +23,9 @@ func NewWrapperCache(caches ...Cache) *WrapperCache { } } -func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) { +func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) { for _, cache := range c.caches { - cache.Init(backend, dtype, capacity) + cache.Init(backend, dtype, maxSequences, capacity, maxBatch) } } diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index cf5e6b91..aa56c982 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -31,8 +31,10 @@ type InputCache struct { cache kvcache.Cache } -func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) { - if kvSize/int32(numSlots) < 1 { +func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) { + numCtx := kvSize / int32(numSlots) + + if numCtx < 1 { return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots) } @@ -44,11 +46,11 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots cache := model.Config().Cache if cache != nil { - cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize) + cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), numSlots, int(numCtx), batchSize) } return &InputCache{ - numCtx: kvSize / int32(numSlots), + numCtx: numCtx, enabled: cache != nil, slots: slots, multiUserCache: multiUserCache, diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 90eb0de6..67d9a1b0 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -699,7 +699,7 @@ func (s *Server) loadModel( panic("loras are not yet implemented") } - s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, multiUserCache) + s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache) if err != nil { panic(err) } From 2d6eac9084a29060ccff69014e28e206a3a7a663 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 18 Mar 2025 13:13:32 -0700 Subject: [PATCH 15/31] kvcache: Optimize sliding window attention Currently sliding window attention allocates and uses the full context size and just masks out any tokens that are outside of the window. However, we really only need (roughly) the sliding window size. At large context sizes this improves two things: - Memory allocated - since the fully context size is allocated up front, memory requirements drop substantially. On Gemma3:4b with a 32k context window, total memory usage (including weights and non-sliding layers) drops from ~20GB to ~8GB. - Computation - ranges that are completely outside of the sliding window are now removed from the tensors that are returned from the cache rather than simply being masked out. This results in more efficient processing, scaling with the size of the context that has actually been used. Notable, this does not update the scheduler for any model to be aware of the smaller memory requirements. This is difficult for Gemma3 because the layers are heterogeneous between sliding and non-sliding attention. 
As a result, while actual memory consumption will be reduced, the scheduler will over-estimate the requirements of the model. This means that splitting between GPUs or GPUs and CPUs will still be suboptimal. Bug #9730 --- kvcache/causal.go | 53 +++++++++++++++++++++++++++++++++++++++++- kvcache/causal_test.go | 14 +++++++++-- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index ced409c3..aacaf540 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -118,7 +118,12 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity c.config.MaskDType = ml.DTypeF32 } - cacheSize := maxSequences * capacity + var cacheSize int + if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize)+maxBatch { + cacheSize = maxSequences * capacity + } else { + cacheSize = maxSequences * (int(c.windowSize) + maxBatch) + } cacheSize = roundUp(cacheSize, c.config.CachePadding) c.cells = make([]cacheCell, cacheSize) @@ -147,6 +152,8 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch) error { c.curPositions = batch.Positions c.opts.Except = nil + c.updateSlidingWindow() + var err error c.curLoc, err = c.findStartLoc() if errors.Is(err, ErrKvCacheFull) { @@ -214,6 +221,50 @@ func (c *Causal) findStartLoc() (int, error) { return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells)) } +func (c *Causal) updateSlidingWindow() { + if c.windowSize == math.MaxInt32 { + return + } + + // create a map of unique sequences to the lowest position in that sequence + lowestPos := make(map[int]int32) + for i := range c.curPositions { + seq := c.curSequences[i] + + pos, ok := lowestPos[seq] + if !ok { + pos = c.curPositions[i] + } else if c.curPositions[i] < pos { + pos = c.curPositions[i] + } + + lowestPos[seq] = pos + } + + // delete any entries that are beyond the window of the oldest position in the sequence + for seq, pos := range lowestPos { + oldRange, ok := c.cellRanges[seq] + if !ok { + continue + } + + newRange := newRange() + + for i := oldRange.min; i <= oldRange.max; i++ { + if slices.Contains(c.cells[i].sequences, seq) { + if c.cells[i].pos < pos-c.windowSize { + c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq }) + } else { + newRange.min = min(newRange.min, i) + newRange.max = max(newRange.max, i) + } + } + } + + c.cellRanges[seq] = newRange + } +} + func roundDown(length, pad int) int { return (length / pad) * pad } diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 66a2e835..617f5363 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -58,11 +58,11 @@ func TestSWA(t *testing.T) { cache := NewSWACache(1, nil) defer cache.Close() - cache.Init(backend, ml.DTypeF32, 1, 16, 16) + cache.Init(backend, ml.DTypeF16, 1, 16, 16) tests := []testCase{ { - name: "SlidingWindow", + name: "FirstBatch", in: []float32{1, 2, 3, 4}, inShape: []int{1, 1, 4}, seqs: []int{0, 0, 0, 0}, @@ -71,6 +71,16 @@ func TestSWA(t *testing.T) { expectedShape: []int{1, 1, 4}, expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0}, }, + { + name: "SecondBatch", + in: []float32{5, 6}, + inShape: []int{1, 1, 2}, + seqs: []int{0, 0}, + pos: []int32{4, 5}, + expected: []float32{5, 6, 3, 4}, + expectedShape: []int{1, 1, 4}, + expectedMask: []float32{0, float32(math.Inf(-1)), 
float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))}, + }, } testCache(t, backend, cache, tests) From d14ce75b95430eedecdf9c1dacd6912df61a1ad7 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Fri, 21 Mar 2025 12:35:47 -0700 Subject: [PATCH 16/31] docs: update final response for /api/chat stream (#9919) --- docs/api.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/api.md b/docs/api.md index 7de81049..fe044d79 100644 --- a/docs/api.md +++ b/docs/api.md @@ -558,6 +558,10 @@ Final response: { "model": "llama3.2", "created_at": "2023-08-04T19:22:45.499127Z", + "message": { + "role": "assistant", + "content": "" + }, "done": true, "total_duration": 4883583458, "load_duration": 1334875, From 00ebda8cc4f2031096973b26d6b0de7425a9ca82 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Fri, 21 Mar 2025 12:38:09 -0700 Subject: [PATCH 17/31] Revert "parser: remove role validation from Modelfile parser" (#9917) This reverts commit ffbfe833da387f9b6806fe887b85992c11d26eaa. --- parser/parser.go | 18 +++++++++++------- parser/parser_test.go | 10 +++++----- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 0a32d571..6832351f 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -7,7 +7,6 @@ import ( "errors" "fmt" "io" - "log/slog" "net/http" "os" "os/user" @@ -301,8 +300,9 @@ const ( ) var ( - errMissingFrom = errors.New("no FROM line") - errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"") + errMissingFrom = errors.New("no FROM line") + errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"") + errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"") ) type ParserError struct { @@ -379,10 +379,14 @@ func ParseFile(r io.Reader) (*Modelfile, error) { case stateParameter: cmd.Name = b.String() case stateMessage: - role = b.String() - if !isKnownMessageRole(b.String()) { - slog.Warn("received non-standard role", "role", role) + if !isValidMessageRole(b.String()) { + return nil, &ParserError{ + LineNumber: currLine, + Msg: errInvalidMessageRole.Error(), + } } + + role = b.String() case stateComment, stateNil: // pass case stateValue: @@ -552,7 +556,7 @@ func isNewline(r rune) bool { return r == '\r' || r == '\n' } -func isKnownMessageRole(role string) bool { +func isValidMessageRole(role string) bool { return role == "system" || role == "user" || role == "assistant" } diff --git a/parser/parser_test.go b/parser/parser_test.go index c4f8f4aa..097c058f 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -256,13 +256,13 @@ You are a multiline file parser. Always parse things. { ` FROM foo -MESSAGE somerandomrole I'm ok with you adding any role message now! +MESSAGE badguy I'm a bad guy! 
`, - []Command{ - {Name: "model", Args: "foo"}, - {Name: "message", Args: "somerandomrole: I'm ok with you adding any role message now!"}, - }, nil, + &ParserError{ + LineNumber: 3, + Msg: errInvalidMessageRole.Error(), + }, }, { ` From c794fef2f27d141393064665d2774b341d091393 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Fri, 21 Mar 2025 13:03:43 -0700 Subject: [PATCH 18/31] server/internal/client/ollama: persist through chunk download errors (#9923) --- server/internal/client/ollama/registry.go | 61 ++++++++++++------- .../internal/client/ollama/registry_test.go | 31 +++++++++- server/internal/registry/server.go | 33 ++++++---- 3 files changed, 89 insertions(+), 36 deletions(-) diff --git a/server/internal/client/ollama/registry.go b/server/internal/client/ollama/registry.go index fdac71bb..59041867 100644 --- a/server/internal/client/ollama/registry.go +++ b/server/internal/client/ollama/registry.go @@ -59,6 +59,11 @@ var ( // ErrCached is passed to [Trace.PushUpdate] when a layer already // exists. It is a non-fatal error and is never returned by [Registry.Push]. ErrCached = errors.New("cached") + + // ErrIncomplete is returned by [Registry.Pull] when a model pull was + // incomplete due to one or more layer download failures. Users that + // want specific errors should use [WithTrace]. + ErrIncomplete = errors.New("incomplete") ) // Defaults @@ -271,8 +276,19 @@ func DefaultRegistry() (*Registry, error) { func UserAgent() string { buildinfo, _ := debug.ReadBuildInfo() + + version := buildinfo.Main.Version + if version == "(devel)" { + // When using `go run .` the version is "(devel)". This is seen + // as an invalid version by ollama.com and so it defaults to + // "needs upgrade" for some requests, such as pulls. These + // checks can be skipped by using the special version "v0.0.0", + // so we set it to that here. + version = "v0.0.0" + } + return fmt.Sprintf("ollama/%s (%s %s) Go/%s", - buildinfo.Main.Version, + version, runtime.GOARCH, runtime.GOOS, runtime.Version(), @@ -418,13 +434,14 @@ func canRetry(err error) bool { // // It always calls update with a nil error. type trackingReader struct { - r io.Reader - n *atomic.Int64 + l *Layer + r io.Reader + update func(l *Layer, n int64, err error) } func (r *trackingReader) Read(p []byte) (n int, err error) { n, err = r.r.Read(p) - r.n.Add(int64(n)) + r.update(r.l, int64(n), nil) return } @@ -462,16 +479,20 @@ func (r *Registry) Pull(ctx context.Context, name string) error { // Send initial layer trace events to allow clients to have an // understanding of work to be done before work starts. + var expected int64 t := traceFromContext(ctx) for _, l := range layers { t.update(l, 0, nil) + expected += l.Size } + var total atomic.Int64 var g errgroup.Group g.SetLimit(r.maxStreams()) for _, l := range layers { info, err := c.Get(l.Digest) if err == nil && info.Size == l.Size { + total.Add(l.Size) t.update(l, l.Size, ErrCached) continue } @@ -484,21 +505,25 @@ func (r *Registry) Pull(ctx context.Context, name string) error { // TODO(bmizerany): fix this unbounded use of defer defer chunked.Close() - var progress atomic.Int64 for cs, err := range r.chunksums(ctx, name, l) { if err != nil { - // Bad chunksums response, update tracing - // clients and then bail. - t.update(l, progress.Load(), err) - return err + // Chunksum stream was interrupted, so tell + // trace about it, and let in-flight chunk + // downloads finish. 
Once they finish, return + // ErrIncomplete, which is triggered by the + // fact that the total bytes received is less + // than the expected bytes. + t.update(l, 0, err) + break } g.Go(func() (err error) { defer func() { - if err != nil { + if err == nil || errors.Is(err, ErrCached) { + total.Add(cs.Chunk.Size()) + } else { err = fmt.Errorf("error downloading %s: %w", cs.Digest.Short(), err) } - t.update(l, progress.Load(), err) }() req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil) @@ -522,7 +547,7 @@ func (r *Registry) Pull(ctx context.Context, name string) error { // download rate since it knows better than a // client that is measuring rate based on // wall-clock time-since-last-update. - body := &trackingReader{r: res.Body, n: &progress} + body := &trackingReader{l: l, r: res.Body, update: t.update} return chunked.Put(cs.Chunk, cs.Digest, body) }) @@ -531,6 +556,9 @@ func (r *Registry) Pull(ctx context.Context, name string) error { if err := g.Wait(); err != nil { return err } + if total.Load() != expected { + return fmt.Errorf("%w: received %d/%d", ErrIncomplete, total.Load(), expected) + } md := blob.DigestFromBytes(m.Data) if err := blob.PutBytes(c, md, m.Data); err != nil { @@ -757,15 +785,12 @@ func (r *Registry) chunksums(ctx context.Context, name string, l *Layer) iter.Se } blobURL := res.Header.Get("Content-Location") - var size int64 s := bufio.NewScanner(res.Body) s.Split(bufio.ScanWords) for { if !s.Scan() { if s.Err() != nil { yield(chunksum{}, s.Err()) - } else if size != l.Size { - yield(chunksum{}, fmt.Errorf("size mismatch: layer size %d != sum of chunks %d", size, l.Size)) } return } @@ -789,12 +814,6 @@ func (r *Registry) chunksums(ctx context.Context, name string, l *Layer) iter.Se return } - size += chunk.Size() - if size > l.Size { - yield(chunksum{}, fmt.Errorf("chunk size %d exceeds layer size %d", size, l.Size)) - return - } - cs := chunksum{ URL: blobURL, Chunk: chunk, diff --git a/server/internal/client/ollama/registry_test.go b/server/internal/client/ollama/registry_test.go index 30529543..f8136c06 100644 --- a/server/internal/client/ollama/registry_test.go +++ b/server/internal/client/ollama/registry_test.go @@ -25,6 +25,28 @@ import ( "github.com/ollama/ollama/server/internal/testutil" ) +func ExampleRegistry_cancelOnFirstError() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + ctx = WithTrace(ctx, &Trace{ + Update: func(l *Layer, n int64, err error) { + if err != nil { + // Discontinue pulling layers if there is an + // error instead of continuing to pull more + // data. + cancel() + } + }, + }) + + var r Registry + if err := r.Pull(ctx, "model"); err != nil { + // panic for demo purposes + panic(err) + } +} + func TestManifestMarshalJSON(t *testing.T) { // All manifests should contain an "empty" config object. 
var m Manifest @@ -813,8 +835,13 @@ func TestPullChunksums(t *testing.T) { ) err := rc.Pull(ctx, "test") check(err) - if !slices.Equal(reads, []int64{0, 3, 5}) { - t.Errorf("reads = %v; want %v", reads, []int64{0, 3, 5}) + wantReads := []int64{ + 0, // initial signaling of layer pull starting + 3, // first chunk read + 2, // second chunk read + } + if !slices.Equal(reads, wantReads) { + t.Errorf("reads = %v; want %v", reads, wantReads) } mw, err := rc.Resolve(t.Context(), "test") diff --git a/server/internal/registry/server.go b/server/internal/registry/server.go index 2a935b52..1910b187 100644 --- a/server/internal/registry/server.go +++ b/server/internal/registry/server.go @@ -200,7 +200,7 @@ type params struct { // // Unfortunately, this API was designed to be a bit awkward. Stream is // defined to default to true if not present, so we need a way to check - // if the client decisively it to false. So, we use a pointer to a + // if the client decisively set it to false. So, we use a pointer to a // bool. Gross. // // Use [stream()] to get the correct value for this field. @@ -280,17 +280,17 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error { progress := make(map[*ollama.Layer]int64) progressCopy := make(map[*ollama.Layer]int64, len(progress)) - pushUpdate := func() { + flushProgress := func() { defer maybeFlush() - // TODO(bmizerany): This scales poorly with more layers due to - // needing to flush out them all in one big update. We _could_ - // just flush on the changed ones, or just track the whole - // download. Needs more thought. This is fine for now. + // TODO(bmizerany): Flushing every layer in one update doesn't + // scale well. We could flush only the modified layers or track + // the full download. Needs further consideration, though it's + // fine for now. mu.Lock() maps.Copy(progressCopy, progress) mu.Unlock() - for l, n := range progress { + for l, n := range progressCopy { enc.Encode(progressUpdateJSON{ Digest: l.Digest, Total: l.Size, @@ -298,19 +298,26 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error { }) } } + defer flushProgress() - t := time.NewTicker(time.Hour) // "unstarted" timer + t := time.NewTicker(1000 * time.Hour) // "unstarted" timer start := sync.OnceFunc(func() { - pushUpdate() + flushProgress() // flush initial state t.Reset(100 * time.Millisecond) }) ctx := ollama.WithTrace(r.Context(), &ollama.Trace{ Update: func(l *ollama.Layer, n int64, err error) { if n > 0 { - start() // flush initial state + // Block flushing progress updates until every + // layer is accounted for. Clients depend on a + // complete model size to calculate progress + // correctly; if they use an incomplete total, + // progress indicators would erratically jump + // as new layers are registered. 
+ start() } mu.Lock() - progress[l] = n + progress[l] += n mu.Unlock() }, }) @@ -323,9 +330,9 @@ func (s *Local) handlePull(w http.ResponseWriter, r *http.Request) error { for { select { case <-t.C: - pushUpdate() + flushProgress() case err := <-done: - pushUpdate() + flushProgress() if err != nil { var status string if errors.Is(err, ollama.ErrModelNotFound) { From fb6252d786c9cafdd2dbfa434535d85c611c0ff0 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Fri, 21 Mar 2025 13:08:20 -0700 Subject: [PATCH 19/31] benchmark: performance of running ollama server (#8643) --- benchmark/server_benchmark_test.go | 178 +++++++++++++++++++++++++++++ docs/benchmark.md | 59 ++++++++++ 2 files changed, 237 insertions(+) create mode 100644 benchmark/server_benchmark_test.go create mode 100644 docs/benchmark.md diff --git a/benchmark/server_benchmark_test.go b/benchmark/server_benchmark_test.go new file mode 100644 index 00000000..b27aa630 --- /dev/null +++ b/benchmark/server_benchmark_test.go @@ -0,0 +1,178 @@ +package benchmark + +import ( + "context" + "flag" + "fmt" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +// Command line flags +var modelFlag string + +func init() { + flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark") + flag.Lookup("m").DefValue = "model" +} + +// modelName returns the model name from flags, failing the test if not set +func modelName(b *testing.B) string { + if modelFlag == "" { + b.Fatal("Error: -m flag is required for benchmark tests") + } + return modelFlag +} + +type TestCase struct { + name string + prompt string + maxTokens int +} + +// runGenerateBenchmark contains the common generate and metrics logic +func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) { + start := time.Now() + var ttft time.Duration + var metrics api.Metrics + + err := client.Generate(ctx, req, func(resp api.GenerateResponse) error { + if ttft == 0 && resp.Response != "" { + ttft = time.Since(start) + } + if resp.Done { + metrics = resp.Metrics + } + return nil + }) + + // Report custom metrics as part of the benchmark results + b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms") + b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms") + + // Token throughput metrics + promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds() + genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds() + b.ReportMetric(promptThroughput, "prompt_tok/s") + b.ReportMetric(genThroughput, "gen_tok/s") + + // Token counts + b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens") + b.ReportMetric(float64(metrics.EvalCount), "gen_tokens") + if err != nil { + b.Fatal(err) + } +} + +// BenchmarkColdStart runs benchmarks with model loading from cold state +func BenchmarkColdStart(b *testing.B) { + client := setup(b) + tests := []TestCase{ + {"short_prompt", "Write a long story", 100}, + {"medium_prompt", "Write a detailed economic analysis", 500}, + {"long_prompt", "Write a comprehensive AI research paper", 1000}, + } + m := modelName(b) + + for _, tt := range tests { + b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) { + ctx := context.Background() + + // Set number of tokens as our throughput metric + b.SetBytes(int64(tt.maxTokens)) + + for b.Loop() { + b.StopTimer() + // Ensure model is unloaded before each iteration + unload(client, m, b) + b.StartTimer() + + req := &api.GenerateRequest{ + Model: m, + Prompt: tt.prompt, + Options: 
map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1}, + } + + runGenerateBenchmark(b, ctx, client, req) + } + }) + } +} + +// BenchmarkWarmStart runs benchmarks with pre-loaded model +func BenchmarkWarmStart(b *testing.B) { + client := setup(b) + tests := []TestCase{ + {"short_prompt", "Write a long story", 100}, + {"medium_prompt", "Write a detailed economic analysis", 500}, + {"long_prompt", "Write a comprehensive AI research paper", 1000}, + } + m := modelName(b) + + for _, tt := range tests { + b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) { + ctx := context.Background() + + // Pre-warm the model + warmup(client, m, tt.prompt, b) + + // Set number of tokens as our throughput metric + b.SetBytes(int64(tt.maxTokens)) + + for b.Loop() { + req := &api.GenerateRequest{ + Model: m, + Prompt: tt.prompt, + Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1}, + } + + runGenerateBenchmark(b, ctx, client, req) + } + }) + } +} + +// setup verifies server and model availability +func setup(b *testing.B) *api.Client { + client, err := api.ClientFromEnvironment() + if err != nil { + b.Fatal(err) + } + if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil { + b.Fatalf("Model unavailable: %v", err) + } + + return client +} + +// warmup ensures the model is loaded and warmed up +func warmup(client *api.Client, model string, prompt string, b *testing.B) { + for range 3 { + err := client.Generate( + context.Background(), + &api.GenerateRequest{ + Model: model, + Prompt: prompt, + Options: map[string]interface{}{"num_predict": 50, "temperature": 0.1}, + }, + func(api.GenerateResponse) error { return nil }, + ) + if err != nil { + b.Logf("Error during model warm-up: %v", err) + } + } +} + +// unload forces model unloading using KeepAlive: 0 parameter +func unload(client *api.Client, model string, b *testing.B) { + req := &api.GenerateRequest{ + Model: model, + KeepAlive: &api.Duration{Duration: 0}, + } + if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil { + b.Logf("Unload error: %v", err) + } + time.Sleep(1 * time.Second) +} diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 00000000..a7bed808 --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,59 @@ +# Benchmark + +Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes. + +## When to use + +Run these benchmarks when: +- Making changes to the model inference engine +- Modifying model loading/unloading logic +- Changing prompt processing or token generation code +- Implementing a new model architecture +- Testing performance across different hardware setups + +## Prerequisites +- Ollama server running locally with `ollama serve` on `127.0.0.1:11434` +## Usage and Examples + +>[!NOTE] +>All commands must be run from the root directory of the Ollama project. + +Basic syntax: +```bash +go test -bench=. ./benchmark/... -m $MODEL_NAME +``` + +Required flags: +- `-bench=.`: Run all benchmarks +- `-m`: Model name to benchmark + +Optional flags: +- `-count N`: Number of times to run the benchmark (useful for statistical analysis) +- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes) + +Common usage patterns: + +Single benchmark run with a model specified: +```bash +go test -bench=. ./benchmark/... 
-m llama3.3 +``` + +## Output metrics + +The benchmark reports several key metrics: + +- `gen_tok/s`: Generated tokens per second +- `prompt_tok/s`: Prompt processing tokens per second +- `ttft_ms`: Time to first token in milliseconds +- `load_ms`: Model load time in milliseconds +- `gen_tokens`: Total tokens generated +- `prompt_tokens`: Total prompt tokens processed + +Each benchmark runs two scenarios: +- Cold start: Model is loaded from disk for each test +- Warm start: Model is pre-loaded in memory + +Three prompt lengths are tested for each scenario: +- Short prompt (100 tokens) +- Medium prompt (500 tokens) +- Long prompt (1000 tokens) From 74bd09652d69c77a4bed34b3afda74c87295115b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 19 Mar 2025 13:03:16 -0700 Subject: [PATCH 20/31] ml/backend/ggml: load tensors in 32KiB chunks --- ml/backend.go | 9 ++--- ml/backend/ggml/ggml.go | 65 ++++++++++++++++++++++++----------- model/model.go | 5 +-- runner/ollamarunner/runner.go | 11 +++--- 4 files changed, 59 insertions(+), 31 deletions(-) diff --git a/ml/backend.go b/ml/backend.go index 66eb37f7..354faf43 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -2,6 +2,7 @@ package ml import ( "bytes" + "context" "encoding/binary" "fmt" "os" @@ -80,9 +81,9 @@ type BackendParams struct { FlashAttention bool } -var backends = make(map[string]func(*os.File, BackendParams) (Backend, error)) +var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error)) -func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, error)) { +func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) { if _, ok := backends[name]; ok { panic("backend: backend already registered") } @@ -90,9 +91,9 @@ func RegisterBackend(name string, f func(*os.File, BackendParams) (Backend, erro backends[name] = f } -func NewBackend(f *os.File, params BackendParams) (Backend, error) { +func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) { if backend, ok := backends["ggml"]; ok { - return backend(f, params) + return backend(ctx, f, params) } return nil, fmt.Errorf("unsupported backend") diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 6732470e..f6b01774 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -9,15 +9,17 @@ package ggml import "C" import ( - "errors" + "context" "fmt" "io" "log/slog" "maps" "os" + "runtime" "slices" "strconv" "strings" + "sync/atomic" "unicode" "unsafe" @@ -58,7 +60,7 @@ type Backend struct { maxGraphNodes int } -func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { +func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) { meta, n, err := fs.Decode(r, -1) if err != nil { return nil, err @@ -297,12 +299,16 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { } } - // concurrently read in tensor data. 
uses a section reader which is safe for concurrent reads - sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset)) - var g errgroup.Group + var doneBytes atomic.Uint64 + totalBytes := uint64(n) - meta.Tensors().Offset + + g, ctx := errgroup.WithContext(ctx) + g.SetLimit(runtime.GOMAXPROCS(0)) for _, t := range meta.Tensors().Items() { - for _, target := range targets[t.Name] { - g.Go(func() error { + g.Go(func() error { + tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name]))) + for i := range tts { + target := targets[t.Name][i] if target == "" { target = t.Name } @@ -312,24 +318,43 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { return fmt.Errorf("unassigned tensor: %s", t.Name) } - bts := C.malloc(C.size_t(t.Size())) - if bts == nil { - return errors.New("failed to allocate tensor buffer") - } - defer C.free(bts) + tts[i] = tt + } - buf := unsafe.Slice((*byte)(bts), t.Size()) - n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), buf) - if err != nil || n != len(buf) { - return errors.New("read failed") + sr := io.NewSectionReader(r, int64(meta.Tensors().Offset+t.Offset), int64(t.Size())) + bts := make([]byte, 128*format.KibiByte) + + var s uint64 + for s < t.Size() { + n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))]) + if err != nil { + return err } - C.ggml_backend_tensor_set(tt, bts, 0, C.size_t(t.Size())) - return nil - }) - } + for _, tt := range tts { + C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n)) + } + + s += uint64(n) + + if params.Progress != nil { + done := doneBytes.Add(uint64(n)) + params.Progress(float32(done) / float32(totalBytes)) + } + } + + return nil + }) } + // start a goroutine to cancel the errgroup if the parent context is done + go func() { + <-ctx.Done() + g.Go(func() error { + return ctx.Err() + }) + }() + if err := g.Wait(); err != nil { return nil, err } diff --git a/model/model.go b/model/model.go index ab29916a..8355a55a 100644 --- a/model/model.go +++ b/model/model.go @@ -1,6 +1,7 @@ package model import ( + "context" "errors" "fmt" _ "image/jpeg" @@ -94,14 +95,14 @@ func Register(name string, f func(ml.Config) (Model, error)) { } // New initializes a new model instance with the provided configuration based on the metadata in the model file -func New(modelPath string, params ml.BackendParams) (Model, error) { +func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) { r, err := os.Open(modelPath) if err != nil { return nil, err } defer r.Close() - b, err := ml.NewBackend(r, params) + b, err := ml.NewBackend(ctx, r, params) if err != nil { return nil, err } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 67d9a1b0..31d20db8 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -678,6 +678,7 @@ func (m *multiLPath) String() string { } func (s *Server) loadModel( + ctx context.Context, mpath string, params ml.BackendParams, lpath multiLPath, @@ -687,7 +688,7 @@ func (s *Server) loadModel( multiUserCache bool, ) { var err error - s.model, err = model.New(mpath, params) + s.model, err = model.New(ctx, mpath, params) if err != nil { panic(err) } @@ -794,13 +795,13 @@ func Execute(args []string) error { } server.ready.Add(1) - go server.loadModel(*mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache) - - server.cond = sync.NewCond(&server.mu) - ctx, cancel := context.WithCancel(context.Background()) defer 
cancel() + go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache) + + server.cond = sync.NewCond(&server.mu) + go server.run(ctx) addr := "127.0.0.1:" + strconv.Itoa(*port) From ce929984a33230269905e0e3cfa335cb8d6ba781 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Fri, 21 Mar 2025 16:16:38 -0700 Subject: [PATCH 21/31] server/internal/client/ollama: fix file descriptor management in Pull (#9931) Close chunked writers as soon as downloads complete, rather than deferring closure until Pull exits. This prevents exhausting file descriptors when pulling many layers. Instead of unbounded defers, use a WaitGroup and background goroutine to close each chunked writer as soon as its downloads finish. Also rename 'total' to 'received' for clarity. --- server/internal/client/ollama/registry.go | 56 +++++++++++++---------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/server/internal/client/ollama/registry.go b/server/internal/client/ollama/registry.go index 59041867..665defd5 100644 --- a/server/internal/client/ollama/registry.go +++ b/server/internal/client/ollama/registry.go @@ -486,44 +486,43 @@ func (r *Registry) Pull(ctx context.Context, name string) error { expected += l.Size } - var total atomic.Int64 + var received atomic.Int64 var g errgroup.Group g.SetLimit(r.maxStreams()) for _, l := range layers { info, err := c.Get(l.Digest) if err == nil && info.Size == l.Size { - total.Add(l.Size) + received.Add(l.Size) t.update(l, l.Size, ErrCached) continue } + var wg sync.WaitGroup chunked, err := c.Chunked(l.Digest, l.Size) if err != nil { t.update(l, 0, err) continue } - // TODO(bmizerany): fix this unbounded use of defer - defer chunked.Close() for cs, err := range r.chunksums(ctx, name, l) { if err != nil { - // Chunksum stream was interrupted, so tell - // trace about it, and let in-flight chunk - // downloads finish. Once they finish, return - // ErrIncomplete, which is triggered by the - // fact that the total bytes received is less - // than the expected bytes. + // Chunksum stream interrupted. Note in trace + // log and let in-flight downloads complete. + // This will naturally trigger ErrIncomplete + // since received < expected bytes. t.update(l, 0, err) break } + wg.Add(1) g.Go(func() (err error) { defer func() { - if err == nil || errors.Is(err, ErrCached) { - total.Add(cs.Chunk.Size()) + if err == nil { + received.Add(cs.Chunk.Size()) } else { err = fmt.Errorf("error downloading %s: %w", cs.Digest.Short(), err) } + wg.Done() }() req, err := http.NewRequestWithContext(ctx, "GET", cs.URL, nil) @@ -537,27 +536,34 @@ func (r *Registry) Pull(ctx context.Context, name string) error { } defer res.Body.Close() - // Count bytes towards progress, as they - // arrive, so that our bytes piggyback other - // chunk updates on completion. - // - // This tactic is enough to show "smooth" - // progress given the current CLI client. In - // the near future, the server should report - // download rate since it knows better than a - // client that is measuring rate based on - // wall-clock time-since-last-update. body := &trackingReader{l: l, r: res.Body, update: t.update} - return chunked.Put(cs.Chunk, cs.Digest, body) }) } + + // Close writer immediately after downloads finish, not at Pull + // exit. Using defer would keep file descriptors open until all + // layers complete, potentially exhausting system limits with + // many layers. 
+ // + // The WaitGroup tracks when all chunks finish downloading, + // allowing precise writer closure in a background goroutine. + // Each layer briefly uses one extra goroutine while at most + // maxStreams()-1 chunks download in parallel. + // + // This caps file descriptors at maxStreams() instead of + // growing with layer count. + g.Go(func() error { + wg.Wait() + chunked.Close() + return nil + }) } if err := g.Wait(); err != nil { return err } - if total.Load() != expected { - return fmt.Errorf("%w: received %d/%d", ErrIncomplete, total.Load(), expected) + if received.Load() != expected { + return fmt.Errorf("%w: received %d/%d", ErrIncomplete, received.Load(), expected) } md := blob.DigestFromBytes(m.Data) From 131f0355a59f4840b057fb8f3c2e59e456f91041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20C=2E=20Fran=C3=A7a?= Date: Mon, 24 Mar 2025 13:25:58 -0300 Subject: [PATCH 22/31] readme: add ollama-d library (#9907) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 47d0aebd..50e8dba0 100644 --- a/README.md +++ b/README.md @@ -512,6 +512,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama for Zig](https://github.com/dravenk/ollama-zig) - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider) - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic +- [Ollama for D](https://github.com/kassane/ollama-d) ### Mobile From 5e0b904e887fc648fb8a3a55283f8f33063a78eb Mon Sep 17 00:00:00 2001 From: copeland3300 Date: Tue, 25 Mar 2025 12:52:23 -0400 Subject: [PATCH 23/31] docs: add flags to example linux log output command (#9852) --- docs/troubleshooting.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 4275cdf3..32ad48c4 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log On **Linux** systems with systemd, the logs can be found with this command: ```shell -journalctl -u ollama --no-pager +journalctl -u ollama --no-pager --follow --pager-end ``` When you run Ollama in a **container**, the logs go to stdout/stderr in the container: From 1feff619779115d76f033eb59a7a896aad6c2e18 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 24 Mar 2025 21:17:53 -0700 Subject: [PATCH 24/31] kvcache: Sliding window cache only needs a single batch total When computing the size of the cache for sliding window attention, we don't need to multiply the batch size by the number of parallel sequences - the batch size is constant. This also simplifies the check for whether to allocate the cache size based on capacity or window size as the batch size is already incorporated into the capacity when handled by the runner.
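To make the sizing change concrete, a small worked example with made-up numbers (maxSequences = 4, windowSize = 1024, maxBatch = 512); the helper below is a hypothetical sketch of the formula in the diff that follows, not code from the patch.

```go
package example

// Before this change, every parallel sequence reserved its own batch on top
// of its window: 4 * (1024 + 512) = 6144 cells.
// After this change, a single batch is added across all sequences:
// 4*1024 + 512 = 4608 cells.
func slidingCacheCells(maxSequences, windowSize, maxBatch int) int {
	return maxSequences*windowSize + maxBatch
}
```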
--- kvcache/causal.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index aacaf540..fb4f0f74 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -119,10 +119,10 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity } var cacheSize int - if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize)+maxBatch { + if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) { cacheSize = maxSequences * capacity } else { - cacheSize = maxSequences * (int(c.windowSize) + maxBatch) + cacheSize = (maxSequences * int(c.windowSize)) + maxBatch } cacheSize = roundUp(cacheSize, c.config.CachePadding) c.cells = make([]cacheCell, cacheSize) From f4f0992b6ea5d651eff609461c24ece936bd5708 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 25 Mar 2025 11:41:26 -0700 Subject: [PATCH 25/31] llm: Fix debug logging for memory estimates --- llm/memory.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/memory.go b/llm/memory.go index ac830ee8..86694d06 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -374,7 +374,7 @@ func (m MemoryEstimate) LogValue() slog.Value { slog.Group( "weights", // memory of the weights - "total", format.HumanBytes2(m.memoryWeights), + "total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput), // memory of repeating layers "repeating", format.HumanBytes2(m.memoryWeights), // memory of non-repeating layers From f66216e3990b73869341c58ac9561b26c468c558 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 24 Mar 2025 13:39:07 -0700 Subject: [PATCH 26/31] ggml: Support heterogeneous KV cache layer sizes in memory estimation Gemma3 uses sliding windows for its context on 5/6 layers, significantly reducing memory usage but leading to uneven usage across layers, which makes allocation to the correct GPU difficult. We currently estimate very conservatively by assuming all layers are consistent at the max size. Llama3.2-vision is also inconsistent between self attention and cross attention layers - at the moment, we calculate the correct total size and then average this across layers. In some cases, this may lead to crashes if a large layer is placed on a GPU sized by the average. This allows memory estimation to calculate per-layer KV cache size and take this into account when placing layers onto GPUs. We already do this for weights that vary per-tensor, so this is a logical extension.
Fixes #9730 Fixes #9890 --- fs/ggml/ggml.go | 39 +++++++++++++++++++++++++++------------ llm/memory.go | 24 +++++++++++++++--------- llm/memory_test.go | 4 ++-- llm/server.go | 2 +- server/sched.go | 8 ++++---- 5 files changed, 49 insertions(+), 28 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 0be69e82..c88583fb 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -413,7 +413,7 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { }, offset, nil } -func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) { +func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) { embedding := f.KV().EmbeddingLength() heads := f.KV().HeadCount() headsKV := f.KV().HeadCountKV() @@ -426,7 +426,10 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO layers := f.Tensors().GroupLayers() bytesPerElement := kvCacheBytesPerElement(kvCacheType) - kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) + kv = make([]uint64, f.KV().BlockCount()) + for i := range kv { + kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) + } switch f.KV().Architecture() { case "llama": @@ -460,16 +463,14 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO case "mllama": var visionTokens, tiles uint64 = 1601, 4 - if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok { - kv = headsKV * - (embeddingHeadsK + embeddingHeadsV) * // one for K, one for V - (2* // sizeof(float16) - (f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers - context + - 4* // sizeof(float32) - uint64(crossAttentionLayers.size)* // num cross attention layers - visionTokens* - tiles) + crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers") + for i := range kv { + if slices.Contains(crossAttentionLayers, uint32(i)) { + kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) * + 4 * // sizeof(float32) + visionTokens * + tiles + } } fullOffload = max( @@ -505,6 +506,20 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO 4*embeddingHeadsK*context*8+ embedding*embeddingHeadsK*heads*9/16, ) + + // Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama + // engine. Gemma3 always uses the Ollama engine. + if f.KV().Architecture() == "gemma3" { + const gemma3GlobalCacheCount = 6 + slidingWindow := (uint64(numParallel) * uint64(f.KV().Uint("attention.sliding_window"))) + batch + for i := range kv { + // Every 6th layer is a global layer, which is the full context size that has already been set. The other + // layers are the smaller local (sliding) layers. 
+ if (i+1)%gemma3GlobalCacheCount != 0 { + kv[i] = uint64(float64(slidingWindow*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) + } + } + } case "command-r": fullOffload = max( 4*batch*(embedding+vocab), diff --git a/llm/memory.go b/llm/memory.go index 86694d06..85a0fabd 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -15,12 +15,12 @@ import ( ) // This algorithm looks for a complete fit to determine if we need to unload other models -func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) { +func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { // Split up the GPUs by type and try them var estimatedVRAM uint64 for _, gpus := range allGpus.ByLibrary() { var layerCount int - estimate := EstimateGPULayers(gpus, f, projectors, opts) + estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel) layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize if opts.NumGPU < 0 { if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) { @@ -71,7 +71,7 @@ type MemoryEstimate struct { // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // The GPUs provided must all be the same Library -func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate { +func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { // Graph size for a partial offload, applies to all GPUs var graphPartialOffload uint64 @@ -137,13 +137,19 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } } - kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct) + kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct) - // KV is proportional to the number of layers - layerSize += kv / f.KV().BlockCount() + if len(kv) > 0 { + layerSize += kv[0] + } + + var kvTotal uint64 + for _, kvLayer := range kv { + kvTotal += kvLayer + } if graphPartialOffload == 0 { - graphPartialOffload = f.KV().GQA() * kv / 6 + graphPartialOffload = f.KV().GQA() * kvTotal / 6 } if graphFullOffload == 0 { graphFullOffload = graphPartialOffload @@ -217,7 +223,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin // Some models have inconsistent layer sizes if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { layerSize = blk.Size() - layerSize += kv / f.KV().BlockCount() + layerSize += kv[i] memoryWeights += blk.Size() } @@ -315,7 +321,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin layersRequested: opts.NumGPU, layersModel: int(f.KV().BlockCount()) + 1, availableList: availableList, - kv: kv, + kv: kvTotal, allocationsList: allocationsList, memoryWeights: memoryWeights, memoryLayerOutput: memoryLayerOutput, diff --git a/llm/memory_test.go b/llm/memory_test.go index 40cc01df..213784a0 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -61,7 +61,7 @@ func TestEstimateGPULayers(t *testing.T) { projectors := []string{} opts := api.DefaultOptions() t.Run("cpu", func(t *testing.T) { - estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1) assert.Equal(t, 0, 
estimate.Layers) assert.Equal(t, uint64(0), estimate.Graph) }) @@ -112,7 +112,7 @@ func TestEstimateGPULayers(t *testing.T) { gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1 gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload) gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) - estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1) assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s) assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s) var layerSums uint64 diff --git a/llm/server.go b/llm/server.go index adc11aae..e6046db6 100644 --- a/llm/server.go +++ b/llm/server.go @@ -109,7 +109,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a gpus = discover.GetCPUInfo() } - estimate := EstimateGPULayers(gpus, f, projectors, opts) + estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel) if len(gpus) > 1 || gpus[0].Library != "cpu" { switch { case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory: diff --git a/server/sched.go b/server/sched.go index b4600dbf..9126c296 100644 --- a/server/sched.go +++ b/server/sched.go @@ -711,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn req.opts.NumCtx = req.origNumCtx * p if !envconfig.SchedSpread() { for _, g := range sgl { - if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) *numParallel = p return []discover.GpuInfo{g} @@ -727,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn // Now try all the GPUs for _, p := range numParallelToTry { req.opts.NumCtx = req.origNumCtx * p - if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok { slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) *numParallel = p return sgl @@ -750,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.Gp var bestEstimate uint64 var bestFit int for i, gl := range byLibrary { - _, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) + _, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel) if estimatedVRAM > bestEstimate { bestEstimate = estimatedVRAM bestFit = i @@ -825,7 +825,7 @@ func (s *Scheduler) expireRunner(model *Model) { // If not, pick a runner to unload, else return nil and the request can be loaded func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef { slog.Debug("evaluating if CPU model load will fit in available system memory") - estimate := llm.EstimateGPULayers(gpus, f, 
req.model.ProjectorPaths, req.opts) + estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx) if estimate.TotalSize <= gpus[0].FreeMemory { slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory)) return nil From dd66712e3159161c1de9c39a12fb83edf8813d39 Mon Sep 17 00:00:00 2001 From: Hengky Steen Date: Thu, 27 Mar 2025 03:38:05 +0700 Subject: [PATCH 27/31] docs: add ollamb to community projects --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 50e8dba0..d0aecaef 100644 --- a/README.md +++ b/README.md @@ -394,6 +394,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.) - [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance) - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history +- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).) ### Cloud From e5d84fb90b21d71f8eb816656ca0b34191425216 Mon Sep 17 00:00:00 2001 From: molbal Date: Wed, 26 Mar 2025 21:39:01 +0100 Subject: [PATCH 28/31] docs: add molbal/orca-cli to community integrations (#9909) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d0aecaef..3afd83c3 100644 --- a/README.md +++ b/README.md @@ -435,6 +435,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. - [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. +- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal. ### Apple Vision Pro From b816ff86c923e0290f58f2275e831fc17c29ba37 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Wed, 26 Mar 2025 17:34:18 -0700 Subject: [PATCH 29/31] docs: make context length faq readable (#10006) --- docs/faq.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/faq.md b/docs/faq.md index 66959cca..f418da47 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -20,7 +20,13 @@ Please refer to the [GPU docs](./gpu.md). ## How can I specify the context window size? -By default, Ollama uses a context window size of 2048 tokens. This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context length to 8K, use: `OLLAMA_CONTEXT_LENGTH=8192 ollama serve`. +By default, Ollama uses a context window size of 2048 tokens. + +This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. 
For example, to set the default context window to 8K, use: + +```shell +OLLAMA_CONTEXT_LENGTH=8192 ollama serve +``` To change this when using `ollama run`, use `/set parameter`: From ead27aa9fe85b4a1e1c434080d5e005e3cd68a16 Mon Sep 17 00:00:00 2001 From: saman-amd Date: Thu, 27 Mar 2025 07:35:19 -0700 Subject: [PATCH 30/31] Add gfx1200 & gfx1201 support on linux (#9878) --- CMakeLists.txt | 6 +- CMakePresets.json | 2 +- llama/patches/0022-add-rdna4-support.patch | 103 ++++++++++++++++++ ml/backend/ggml/ggml/src/ggml-cuda/common.cuh | 6 +- ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu | 2 +- ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh | 4 +- ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu | 4 +- .../ggml/ggml/src/ggml-cuda/vendors/hip.h | 4 + 8 files changed, 120 insertions(+), 11 deletions(-) create mode 100644 llama/patches/0022-add-rdna4-support.patch diff --git a/CMakeLists.txt b/CMakeLists.txt index 034fc7d7..e2447f32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,9 +86,9 @@ if(CMAKE_CUDA_COMPILER) ) endif() -set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a):xnack[+-]$" +set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a|1200|1201):xnack[+-]$" CACHE STRING - "Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a):xnack[+-]$\"." + "Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a|1200|1201):xnack[+-]$\"." ) check_language(HIP) @@ -97,7 +97,7 @@ if(CMAKE_HIP_COMPILER) find_package(hip REQUIRED) if(NOT AMDGPU_TARGETS) - list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$") + list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$") elseif(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX) list(FILTER AMDGPU_TARGETS EXCLUDE REGEX ${WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX}) endif() diff --git a/CMakePresets.json b/CMakePresets.json index f3507e3f..bf99917f 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -56,7 +56,7 @@ "name": "ROCm 6", "inherits": [ "ROCm" ], "cacheVariables": { - "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-" + "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-" } } ], diff --git a/llama/patches/0022-add-rdna4-support.patch b/llama/patches/0022-add-rdna4-support.patch new file mode 100644 index 00000000..9b77ae84 --- /dev/null +++ b/llama/patches/0022-add-rdna4-support.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Saman +Date: Wed, 19 Mar 2025 14:02:26 -0700 +Subject: [PATCH] add rdna4 support + +--- + ggml/src/ggml-cuda/common.cuh | 6 ++++-- + ggml/src/ggml-cuda/mmq.cu | 2 +- + ggml/src/ggml-cuda/mmq.cuh | 4 ++-- + ggml/src/ggml-cuda/mmvq.cu | 4 ++-- + ggml/src/ggml-cuda/vendors/hip.h | 4 ++++ + 5 files changed, 13 insertions(+), 7 deletions(-) + +diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh +index adf0d3ec..b24593fc 100644 +--- a/ggml/src/ggml-cuda/common.cuh ++++ b/ggml/src/ggml-cuda/common.cuh +@@ -61,11 +61,13 @@ + #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 + #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a + #define 
GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA ++#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 + + #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) + #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) + #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) +-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3) ++#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) ++#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) + #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA) + #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) + +@@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i + #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) + #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) + c = __builtin_amdgcn_sdot4(a, b, c, false); +-#elif defined(RDNA3) ++#elif defined(RDNA3) || defined(RDNA4) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); + #elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; +diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu +index 10f2ebb1..933d945c 100644 +--- a/ggml/src/ggml-cuda/mmq.cu ++++ b/ggml/src/ggml-cuda/mmq.cu +@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { + return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + } + +- return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; ++ return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + } +diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh +index 0451c65f..66ce2bc9 100644 +--- a/ggml/src/ggml-cuda/mmq.cuh ++++ b/ggml/src/ggml-cuda/mmq.cuh +@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile( + + template + #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) ++#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) + __launch_bounds__(WARP_SIZE*nwarps, 2) +-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) ++#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) + #else + #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA + __launch_bounds__(WARP_SIZE*nwarps, 1) +diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu +index 4fb466ca..23ae7abc 100644 +--- a/ggml/src/ggml-cuda/mmvq.cu ++++ b/ggml/src/ggml-cuda/mmvq.cu +@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q( + + constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); + +-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) ++#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4)) + constexpr int nwarps = 1; + constexpr int rows_per_cuda_block = 1; + #else + constexpr int nwarps = ncols_y <= 4 ? 4 : 2; + constexpr int rows_per_cuda_block = ncols_y == 1 ? 
1 : 2; +-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) ++#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4) + + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + const int row0 = rows_per_cuda_block*blockIdx.x; +diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h +index 81964611..a62544b5 100644 +--- a/ggml/src/ggml-cuda/vendors/hip.h ++++ b/ggml/src/ggml-cuda/vendors/hip.h +@@ -150,6 +150,10 @@ + #define CDNA + #endif + ++#if defined(__gfx1200__) || defined(__gfx1201__) ++#define RDNA4 ++#endif ++ + #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) + #define RDNA3 diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index adf0d3ec..b24593fc 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -61,11 +61,13 @@ #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA +#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) -#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3) +#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) +#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA) #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) @@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) c = __builtin_amdgcn_sdot4(a, b, c, false); -#elif defined(RDNA3) +#elif defined(RDNA3) || defined(RDNA4) c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); #elif defined(__gfx1010__) || defined(__gfx900__) int tmp1; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu index 10f2ebb1..933d945c 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu @@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } - return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh index 0451c65f..66ce2bc9 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh @@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile( template #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || 
defined(RDNA2) || defined(CDNA) || defined(GCN) +#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) +#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #else #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA __launch_bounds__(WARP_SIZE*nwarps, 1) diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu index 4fb466ca..23ae7abc 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu @@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q( constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4)) constexpr int nwarps = 1; constexpr int rows_per_cuda_block = 1; #else constexpr int nwarps = ncols_y <= 4 ? 4 : 2; constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4) const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h index 81964611..a62544b5 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h +++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h @@ -150,6 +150,10 @@ #define CDNA #endif +#if defined(__gfx1200__) || defined(__gfx1201__) +#define RDNA4 +#endif + #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ defined(__gfx1150__) || defined(__gfx1151__) #define RDNA3 From 01aa7887221e7bd286ebcb14a088c94ba1c22a99 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 27 Mar 2025 11:52:09 -0700 Subject: [PATCH 31/31] ml: Remove Output from Context interface Model implementations should use Input for all of their tensors supplied to the model. This includes tensors that relate to the outputs, which is confusing since there is also an Output function. Since Output is only used internally in GGML and not used by any model implementations, we can remove it from the interface to reduce confusion.
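With Output removed, tensors that carry output indices are created through the same Input context as every other model input. The sketch below shows roughly what a model's Forward method looks like after this change; it is a minimal illustration only, assuming a hypothetical Model type, a forward helper, and an options value with Inputs, Positions, and Outputs slices, not code taken from any specific model in the tree.

```go
// Minimal sketch, not a real model file: Model, opts fields, and m.forward
// are illustrative placeholders for how a model would build its graph inputs.
func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
	// Token IDs and positions are ordinary inputs.
	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
	if err != nil {
		return nil, err
	}

	positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
	if err != nil {
		return nil, err
	}

	// Output locations are also inputs to the graph, so they come from the
	// Input context as well; there is no longer a ctx.Output() to call.
	outputs, err := ctx.Input().FromIntSlice(opts.Outputs, len(opts.Outputs))
	if err != nil {
		return nil, err
	}

	return m.forward(ctx, inputs, positions, outputs)
}
```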
--- kvcache/causal_test.go | 1 - ml/backend.go | 6 ++---- ml/backend/ggml/ggml.go | 19 +------------------ 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 617f5363..b1dc7d77 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -362,7 +362,6 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { } func (c *testContext) Input() ml.Context { return c } -func (c *testContext) Output() ml.Context { return c } func (c *testContext) Layer(int) ml.Context { return c } func (c *testContext) Forward(...ml.Tensor) ml.Context { return c } diff --git a/ml/backend.go b/ml/backend.go index 354faf43..cfb18d6a 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -110,12 +110,10 @@ type Context interface { MaxGraphNodes() int Close() - // Input returns a context appropriate for creating input tensors + // Input returns a context appropriate for creating tensors that are + // inputs to the model (which includes things like output locations) Input() Context - // Output returns a context appropriate for creating output tensors - Output() Context - // Layer returns a context appropriate for creating intermediate tensors Layer(int) Context } diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index f6b01774..b6f59ae0 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -48,9 +48,6 @@ type Backend struct { // input is the backend used for inputs input *C.struct_ggml_backend_buffer_type - // output is the backend used for outputs - output *C.struct_ggml_backend_buffer_type - // layers is the backend used for repeating layers layers map[int]*C.struct_ggml_backend_buffer_type @@ -400,8 +397,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, C.size_t(maxGraphNodes), C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)), ), - input: deviceBufferTypes[input.d], - output: deviceBufferTypes[output.d], + input: deviceBufferTypes[input.d], layers: func() map[int]*C.struct_ggml_backend_buffer_type { m := make(map[int]*C.struct_ggml_backend_buffer_type) for i, layer := range layers { @@ -482,19 +478,6 @@ func (c Context) Input() ml.Context { return &c } -func (c Context) Output() ml.Context { - if c.b.output != nil { - return &Context{ - b: c.b, - ctx: c.ctx, - buft: c.b.output, - maxGraphNodes: c.maxGraphNodes, - } - } - - return &c -} - func (c Context) Layer(i int) ml.Context { if buft, ok := c.b.layers[i]; ok { return &Context{