From af68d60a58c7ade657a1f4641bf301a29d977174 Mon Sep 17 00:00:00 2001 From: Soulter <37870767+Soulter@users.noreply.github.com> Date: Sun, 2 Mar 2025 13:58:34 +0800 Subject: [PATCH 01/17] readme: add AstrBot to community integrations (#9442) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8d471f5e..7bf306d5 100644 --- a/README.md +++ b/README.md @@ -387,6 +387,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models) - [LangBot](https://github.com/RockChinQ/LangBot) (LLM-based instant messaging bots platform, with Agents, RAG features, supports multiple platforms) - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool) +- [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration) ### Cloud From ee048b76d42fbfb108701682cf71c57daeb6678b Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Sun, 2 Mar 2025 13:30:41 -0800 Subject: [PATCH 02/17] server/internal/client/ollama: handle extended names in client/ollama (#9454) The extended name format is a superset of the name format that only the client needs to know about, not the server or other dependents of the name package, so move the split logic into the client package. Also, take advantage of knowing about the extended name format to allow the client to use the extended name format when unlinking to verify they are unlinking the manifest with the content they intend. --- server/internal/client/ollama/registry.go | 65 +++++++---- .../internal/client/ollama/registry_test.go | 106 +++++++++++++++--- 2 files changed, 136 insertions(+), 35 deletions(-) diff --git a/server/internal/client/ollama/registry.go b/server/internal/client/ollama/registry.go index 82a8bbca..7ffc16db 100644 --- a/server/internal/client/ollama/registry.go +++ b/server/internal/client/ollama/registry.go @@ -212,12 +212,16 @@ type Registry struct { Mask string } -func (r *Registry) completeName(name string) names.Name { +func (r *Registry) parseName(name string) (names.Name, error) { mask := defaultMask if r.Mask != "" { mask = names.Parse(r.Mask) } - return names.Merge(names.Parse(name), mask) + n := names.Merge(names.Parse(name), mask) + if !n.IsFullyQualified() { + return names.Name{}, fmt.Errorf("%w: %q", ErrNameInvalid, name) + } + return n, nil } // DefaultRegistry returns a new Registry configured from the environment. The @@ -306,7 +310,7 @@ func (r *Registry) Push(ctx context.Context, c *blob.DiskCache, name string, p * t := traceFromContext(ctx) - scheme, n, _, err := parseName(name, r.Mask) + scheme, n, _, err := r.parseNameExtended(name) if err != nil { // This should never happen since ResolveLocal should have // already validated the name. @@ -400,7 +404,7 @@ func canRetry(err error) bool { // typically slower than splitting the model up across layers, and is mostly // utilized for layers of type equal to "application/vnd.ollama.image". func (r *Registry) Pull(ctx context.Context, c *blob.DiskCache, name string) error { - scheme, n, _, err := parseName(name, r.Mask) + scheme, n, _, err := r.parseNameExtended(name) if err != nil { return err } @@ -551,9 +555,9 @@ func (r *Registry) Pull(ctx context.Context, c *blob.DiskCache, name string) err // Unlink is like [blob.DiskCache.Unlink], but makes name fully qualified // before attempting to unlink the model. 
func (r *Registry) Unlink(c *blob.DiskCache, name string) (ok bool, _ error) { - n := r.completeName(name) - if !n.IsFullyQualified() { - return false, fmt.Errorf("%w: %q", ErrNameInvalid, name) + n, err := r.parseName(name) + if err != nil { + return false, err } return c.Unlink(n.String()) } @@ -626,10 +630,9 @@ type Layer struct { Size int64 `json:"size"` } -// ResolveLocal resolves a name to a Manifest in the local cache. The name is -// parsed using [names.Split] but the scheme is ignored. +// ResolveLocal resolves a name to a Manifest in the local cache. func (r *Registry) ResolveLocal(c *blob.DiskCache, name string) (*Manifest, error) { - _, n, d, err := parseName(name, r.Mask) + _, n, d, err := r.parseNameExtended(name) if err != nil { return nil, err } @@ -655,7 +658,7 @@ func (r *Registry) ResolveLocal(c *blob.DiskCache, name string) (*Manifest, erro // Resolve resolves a name to a Manifest in the remote registry. func (r *Registry) Resolve(ctx context.Context, name string) (*Manifest, error) { - scheme, n, d, err := parseName(name, r.Mask) + scheme, n, d, err := r.parseNameExtended(name) if err != nil { return nil, err } @@ -859,7 +862,7 @@ var supportedSchemes = []string{ var supportedSchemesMessage = fmt.Sprintf("supported schemes are %v", strings.Join(supportedSchemes, ", ")) -// parseName parses and validates an extended name, returning the scheme, name, +// parseNameExtended parses and validates an extended name, returning the scheme, name, // and digest. // // If the scheme is empty, scheme will be "https". If an unsupported scheme is @@ -870,8 +873,8 @@ var supportedSchemesMessage = fmt.Sprintf("supported schemes are %v", strings.Jo // // If the name is not, once merged with the mask, fully qualified, // [ErrNameInvalid] wrapped with a display friendly message is returned. -func parseName(s string, mask string) (scheme string, _ names.Name, _ blob.Digest, _ error) { - scheme, name, digest := names.Split(s) +func (r *Registry) parseNameExtended(s string) (scheme string, _ names.Name, _ blob.Digest, _ error) { + scheme, name, digest := splitExtended(s) scheme = cmp.Or(scheme, "https") if !slices.Contains(supportedSchemes, scheme) { err := withPublicMessagef(ErrNameInvalid, "unsupported scheme: %q: %s", scheme, supportedSchemesMessage) @@ -894,13 +897,33 @@ func parseName(s string, mask string) (scheme string, _ names.Name, _ blob.Diges } } - maskName := defaultMask - if mask != "" { - maskName = names.Parse(mask) - } - n := names.Merge(names.Parse(name), maskName) - if !n.IsFullyQualified() { - return "", names.Name{}, blob.Digest{}, fmt.Errorf("%w: %q", ErrNameInvalid, s) + n, err := r.parseName(name) + if err != nil { + return "", names.Name{}, blob.Digest{}, err } return scheme, n, d, nil } + +// splitExtended splits an extended name string into its scheme, name, and digest +// parts. +// +// Examples: +// +// http://ollama.com/bmizerany/smol:latest@digest +// https://ollama.com/bmizerany/smol:latest +// ollama.com/bmizerany/smol:latest@digest // returns "https" scheme. 
+// model@digest +// @digest +func splitExtended(s string) (scheme, name, digest string) { + i := strings.Index(s, "://") + if i >= 0 { + scheme = s[:i] + s = s[i+3:] + } + i = strings.LastIndex(s, "@") + if i >= 0 { + digest = s[i+1:] + s = s[:i] + } + return scheme, s, digest +} diff --git a/server/internal/client/ollama/registry_test.go b/server/internal/client/ollama/registry_test.go index 20a1f159..92b53637 100644 --- a/server/internal/client/ollama/registry_test.go +++ b/server/internal/client/ollama/registry_test.go @@ -2,6 +2,7 @@ package ollama import ( "bytes" + "cmp" "context" "encoding/json" "errors" @@ -91,7 +92,7 @@ func newClient(t *testing.T, h http.HandlerFunc) (*Registry, *blob.DiskCache) { } link := func(name string, manifest string) { - _, n, _, err := parseName(name, r.Mask) + n, err := r.parseName(name) if err != nil { panic(err) } @@ -709,25 +710,16 @@ func TestErrorUnmarshal(t *testing.T) { // // It is only for testing error messages, not that all invalids and valids are // covered. Those are in other tests for names.Name and blob.Digest. -func TestParseNameErrors(t *testing.T) { +func TestParseNameExtendedErrors(t *testing.T) { cases := []struct { name string err error want string - }{ - {"x", nil, ""}, - {"x@", nil, ""}, - - {"", ErrNameInvalid, `invalid or missing name: ""`}, - {"://", ErrNameInvalid, `invalid or missing name: "://"`}, - {"x://", ErrNameInvalid, `unsupported scheme: "x": supported schemes are http, https, https+insecure`}, - - {"@sha123-1234", ErrNameInvalid, `invalid digest: "sha123-1234"`}, - {"x@sha123-1234", ErrNameInvalid, `invalid digest: "sha123-1234"`}, - } + }{} + var r Registry for _, tt := range cases { - _, _, _, err := parseName(tt.name, DefaultMask) + _, _, _, err := r.parseNameExtended(tt.name) if !errors.Is(err, tt.err) { t.Errorf("[%s]: err = %v; want %v", tt.name, err, tt.err) } @@ -736,3 +728,89 @@ func TestParseNameErrors(t *testing.T) { } } } + +func TestParseNameExtended(t *testing.T) { + cases := []struct { + in string + scheme string + name string + digest string + err string + }{ + {in: "http://m", scheme: "http", name: "m"}, + {in: "https+insecure://m", scheme: "https+insecure", name: "m"}, + {in: "http+insecure://m", err: "unsupported scheme"}, + + {in: "http://m@sha256:1111111111111111111111111111111111111111111111111111111111111111", scheme: "http", name: "m", digest: "sha256:1111111111111111111111111111111111111111111111111111111111111111"}, + + {in: "", err: "invalid or missing name"}, + {in: "m", scheme: "https", name: "m"}, + {in: "://", err: "invalid or missing name"}, + {in: "@sha256:deadbeef", err: "invalid digest"}, + {in: "@sha256:deadbeef@sha256:deadbeef", err: "invalid digest"}, + } + for _, tt := range cases { + t.Run(tt.in, func(t *testing.T) { + var r Registry + scheme, n, digest, err := r.parseNameExtended(tt.in) + if err != nil { + if tt.err == "" { + t.Errorf("err = %v; want nil", err) + } else if !strings.Contains(err.Error(), tt.err) { + t.Errorf("err = %v; want %q", err, tt.err) + } + } else if tt.err != "" { + t.Errorf("err = nil; want %q", tt.err) + } + if err == nil && !n.IsFullyQualified() { + t.Errorf("name = %q; want fully qualified", n) + } + + if scheme != tt.scheme { + t.Errorf("scheme = %q; want %q", scheme, tt.scheme) + } + + // smoke-test name is superset of tt.name + if !strings.Contains(n.String(), tt.name) { + t.Errorf("name = %q; want %q", n, tt.name) + } + + tt.digest = cmp.Or(tt.digest, (&blob.Digest{}).String()) + if digest.String() != tt.digest { + t.Errorf("digest = %q; want 
%q", digest, tt.digest) + } + }) + } +} + +func TestUnlink(t *testing.T) { + t.Run("found by name", func(t *testing.T) { + rc, c := newClient(t, nil) + + // confirm linked + _, err := rc.ResolveLocal(c, "single") + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + // unlink + _, err = rc.Unlink(c, "single") + testutil.Check(t, err) + + // confirm unlinked + _, err = rc.ResolveLocal(c, "single") + if !errors.Is(err, fs.ErrNotExist) { + t.Errorf("err = %v; want fs.ErrNotExist", err) + } + }) + t.Run("not found by name", func(t *testing.T) { + rc, c := newClient(t, nil) + ok, err := rc.Unlink(c, "manifestNotFound") + if err != nil { + t.Fatal(err) + } + if ok { + t.Error("expected not found") + } + }) +} From e41c4cbea722e40f8e346e5a6c4344a8f47d06d9 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 2 Mar 2025 16:48:31 -0800 Subject: [PATCH 03/17] build: install ccache manually in Dockerfile (#9464) Reverts ccache installation to be done manually via curl instead of using the dnf package manager as this has side effects of prepending ccache's install directory to the front of the PATH --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 46d4713e..f94beae1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,8 +12,9 @@ FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base RUN yum install -y yum-utils \ && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \ && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \ - && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ - && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo + && dnf install -y yum-utils gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ + && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \ + && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH FROM --platform=linux/arm64 almalinux:8 AS base-arm64 From 3519dd1c6ee8cfd5afcf9989bb589747beccf6f7 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Sun, 2 Mar 2025 20:55:44 -0800 Subject: [PATCH 04/17] server/internal/client/ollama: hold DiskCache on Registry (#9463) Previously, using a Registry required a DiskCache to be passed in for use in various methods. This was a bit cumbersome, as the DiskCache is required for most operations, and the DefaultCache is used in most of those cases. This change makes the DiskCache an optional field on the Registry struct. This also changes DefaultCache to initialize on first use. This is to not burden clients with the cost of creating a new cache per use, or having to hold onto a cache for the lifetime of the Registry. Also, slip in some minor docs updates for Trace. 
--- server/internal/client/ollama/registry.go | 65 ++++++++--- .../internal/client/ollama/registry_test.go | 102 +++++++++--------- server/internal/client/ollama/trace.go | 3 + server/internal/cmd/opp/opp.go | 39 +++---- server/internal/registry/server.go | 4 +- server/internal/registry/server_test.go | 6 +- server/routes.go | 10 +- server/routes_test.go | 8 +- 8 files changed, 132 insertions(+), 105 deletions(-) diff --git a/server/internal/client/ollama/registry.go b/server/internal/client/ollama/registry.go index 7ffc16db..007de5e8 100644 --- a/server/internal/client/ollama/registry.go +++ b/server/internal/client/ollama/registry.go @@ -27,6 +27,7 @@ import ( "slices" "strconv" "strings" + "sync" "sync/atomic" "time" @@ -73,19 +74,22 @@ const ( DefaultMaxChunkSize = 8 << 20 ) -// DefaultCache returns a new disk cache for storing models. If the -// OLLAMA_MODELS environment variable is set, it uses that directory; -// otherwise, it uses $HOME/.ollama/models. -func DefaultCache() (*blob.DiskCache, error) { +var defaultCache = sync.OnceValues(func() (*blob.DiskCache, error) { dir := os.Getenv("OLLAMA_MODELS") if dir == "" { - home, err := os.UserHomeDir() - if err != nil { - return nil, err - } + home, _ := os.UserHomeDir() + home = cmp.Or(home, ".") dir = filepath.Join(home, ".ollama", "models") } return blob.Open(dir) +}) + +// DefaultCache returns the default cache used by the registry. It is +// configured from the OLLAMA_MODELS environment variable, or defaults to +// $HOME/.ollama/models, or, if an error occurs obtaining the home directory, +// it uses the current working directory. +func DefaultCache() (*blob.DiskCache, error) { + return defaultCache() } // Error is the standard error returned by Ollama APIs. It can represent a @@ -168,6 +172,10 @@ func CompleteName(name string) string { // Registry is a client for performing push and pull operations against an // Ollama registry. type Registry struct { + // Cache is the cache used to store models. If nil, [DefaultCache] is + // used. + Cache *blob.DiskCache + // UserAgent is the User-Agent header to send with requests to the // registry. If empty, the User-Agent is determined by HTTPClient. UserAgent string @@ -206,12 +214,18 @@ type Registry struct { // It is only used when a layer is larger than [MaxChunkingThreshold]. MaxChunkSize int64 - // Mask, if set, is the name used to convert non-fully qualified - // names to fully qualified names. If empty, the default mask - // ("registry.ollama.ai/library/_:latest") is used. + // Mask, if set, is the name used to convert non-fully qualified names + // to fully qualified names. If empty, [DefaultMask] is used. Mask string } +func (r *Registry) cache() (*blob.DiskCache, error) { + if r.Cache != nil { + return r.Cache, nil + } + return defaultCache() +} + func (r *Registry) parseName(name string) (names.Name, error) { mask := defaultMask if r.Mask != "" { @@ -282,12 +296,17 @@ type PushParams struct { } // Push pushes the model with the name in the cache to the remote registry. 
-func (r *Registry) Push(ctx context.Context, c *blob.DiskCache, name string, p *PushParams) error { +func (r *Registry) Push(ctx context.Context, name string, p *PushParams) error { if p == nil { p = &PushParams{} } - m, err := r.ResolveLocal(c, cmp.Or(p.From, name)) + c, err := r.cache() + if err != nil { + return err + } + + m, err := r.ResolveLocal(cmp.Or(p.From, name)) if err != nil { return err } @@ -403,7 +422,7 @@ func canRetry(err error) bool { // chunks of the specified size, and then reassembled and verified. This is // typically slower than splitting the model up across layers, and is mostly // utilized for layers of type equal to "application/vnd.ollama.image". -func (r *Registry) Pull(ctx context.Context, c *blob.DiskCache, name string) error { +func (r *Registry) Pull(ctx context.Context, name string) error { scheme, n, _, err := r.parseNameExtended(name) if err != nil { return err @@ -417,6 +436,11 @@ func (r *Registry) Pull(ctx context.Context, c *blob.DiskCache, name string) err return fmt.Errorf("%w: no layers", ErrManifestInvalid) } + c, err := r.cache() + if err != nil { + return err + } + exists := func(l *Layer) bool { info, err := c.Get(l.Digest) return err == nil && info.Size == l.Size @@ -554,11 +578,15 @@ func (r *Registry) Pull(ctx context.Context, c *blob.DiskCache, name string) err // Unlink is like [blob.DiskCache.Unlink], but makes name fully qualified // before attempting to unlink the model. -func (r *Registry) Unlink(c *blob.DiskCache, name string) (ok bool, _ error) { +func (r *Registry) Unlink(name string) (ok bool, _ error) { n, err := r.parseName(name) if err != nil { return false, err } + c, err := r.cache() + if err != nil { + return false, err + } return c.Unlink(n.String()) } @@ -631,12 +659,17 @@ type Layer struct { } // ResolveLocal resolves a name to a Manifest in the local cache. -func (r *Registry) ResolveLocal(c *blob.DiskCache, name string) (*Manifest, error) { +func (r *Registry) ResolveLocal(name string) (*Manifest, error) { _, n, d, err := r.parseNameExtended(name) if err != nil { return nil, err } + c, err := r.cache() + if err != nil { + return nil, err + } if !d.IsValid() { + // No digest, so resolve the manifest by name. d, err = c.Resolve(n.String()) if err != nil { return nil, err diff --git a/server/internal/client/ollama/registry_test.go b/server/internal/client/ollama/registry_test.go index 92b53637..b9b4271b 100644 --- a/server/internal/client/ollama/registry_test.go +++ b/server/internal/client/ollama/registry_test.go @@ -73,6 +73,7 @@ func (rr recordRoundTripper) RoundTrip(req *http.Request) (*http.Response, error // To simulate a network error, pass a handler that returns a 499 status code. 
func newClient(t *testing.T, h http.HandlerFunc) (*Registry, *blob.DiskCache) { t.Helper() + c, err := blob.Open(t.TempDir()) if err != nil { t.Fatal(err) @@ -86,6 +87,7 @@ func newClient(t *testing.T, h http.HandlerFunc) (*Registry, *blob.DiskCache) { } r := &Registry{ + Cache: c, HTTPClient: &http.Client{ Transport: recordRoundTripper(h), }, @@ -152,55 +154,55 @@ func withTraceUnexpected(ctx context.Context) (context.Context, *Trace) { } func TestPushZero(t *testing.T) { - rc, c := newClient(t, okHandler) - err := rc.Push(t.Context(), c, "empty", nil) + rc, _ := newClient(t, okHandler) + err := rc.Push(t.Context(), "empty", nil) if !errors.Is(err, ErrManifestInvalid) { t.Errorf("err = %v; want %v", err, ErrManifestInvalid) } } func TestPushSingle(t *testing.T) { - rc, c := newClient(t, okHandler) - err := rc.Push(t.Context(), c, "single", nil) + rc, _ := newClient(t, okHandler) + err := rc.Push(t.Context(), "single", nil) testutil.Check(t, err) } func TestPushMultiple(t *testing.T) { - rc, c := newClient(t, okHandler) - err := rc.Push(t.Context(), c, "multiple", nil) + rc, _ := newClient(t, okHandler) + err := rc.Push(t.Context(), "multiple", nil) testutil.Check(t, err) } func TestPushNotFound(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { t.Errorf("unexpected request: %v", r) }) - err := rc.Push(t.Context(), c, "notfound", nil) + err := rc.Push(t.Context(), "notfound", nil) if !errors.Is(err, fs.ErrNotExist) { t.Errorf("err = %v; want %v", err, fs.ErrNotExist) } } func TestPushNullLayer(t *testing.T) { - rc, c := newClient(t, nil) - err := rc.Push(t.Context(), c, "null", nil) + rc, _ := newClient(t, nil) + err := rc.Push(t.Context(), "null", nil) if err == nil || !strings.Contains(err.Error(), "invalid manifest") { t.Errorf("err = %v; want invalid manifest", err) } } func TestPushSizeMismatch(t *testing.T) { - rc, c := newClient(t, nil) + rc, _ := newClient(t, nil) ctx, _ := withTraceUnexpected(t.Context()) - got := rc.Push(ctx, c, "sizemismatch", nil) + got := rc.Push(ctx, "sizemismatch", nil) if got == nil || !strings.Contains(got.Error(), "size mismatch") { t.Errorf("err = %v; want size mismatch", got) } } func TestPushInvalid(t *testing.T) { - rc, c := newClient(t, nil) - err := rc.Push(t.Context(), c, "invalid", nil) + rc, _ := newClient(t, nil) + err := rc.Push(t.Context(), "invalid", nil) if err == nil || !strings.Contains(err.Error(), "invalid manifest") { t.Errorf("err = %v; want invalid manifest", err) } @@ -208,7 +210,7 @@ func TestPushInvalid(t *testing.T) { func TestPushExistsAtRemote(t *testing.T) { var pushed bool - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "/uploads/") { if !pushed { // First push. Return an uploadURL. 
@@ -236,35 +238,35 @@ func TestPushExistsAtRemote(t *testing.T) { check := testutil.Checker(t) - err := rc.Push(ctx, c, "single", nil) + err := rc.Push(ctx, "single", nil) check(err) if !errors.Is(errors.Join(errs...), nil) { t.Errorf("errs = %v; want %v", errs, []error{ErrCached}) } - err = rc.Push(ctx, c, "single", nil) + err = rc.Push(ctx, "single", nil) check(err) } func TestPushRemoteError(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "/blobs/") { w.WriteHeader(500) io.WriteString(w, `{"errors":[{"code":"blob_error"}]}`) return } }) - got := rc.Push(t.Context(), c, "single", nil) + got := rc.Push(t.Context(), "single", nil) checkErrCode(t, got, 500, "blob_error") } func TestPushLocationError(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Location", ":///x") w.WriteHeader(http.StatusAccepted) }) - got := rc.Push(t.Context(), c, "single", nil) + got := rc.Push(t.Context(), "single", nil) wantContains := "invalid upload URL" if got == nil || !strings.Contains(got.Error(), wantContains) { t.Errorf("err = %v; want to contain %v", got, wantContains) @@ -272,14 +274,14 @@ func TestPushLocationError(t *testing.T) { } func TestPushUploadRoundtripError(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { if r.Host == "blob.store" { w.WriteHeader(499) // force RoundTrip error on upload return } w.Header().Set("Location", "http://blob.store/blobs/123") }) - got := rc.Push(t.Context(), c, "single", nil) + got := rc.Push(t.Context(), "single", nil) if !errors.Is(got, errRoundTrip) { t.Errorf("got = %v; want %v", got, errRoundTrip) } @@ -295,20 +297,20 @@ func TestPushUploadFileOpenError(t *testing.T) { os.Remove(c.GetFile(l.Digest)) }, }) - got := rc.Push(ctx, c, "single", nil) + got := rc.Push(ctx, "single", nil) if !errors.Is(got, fs.ErrNotExist) { t.Errorf("got = %v; want fs.ErrNotExist", got) } } func TestPushCommitRoundtripError(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "/blobs/") { panic("unexpected") } w.WriteHeader(499) // force RoundTrip error }) - err := rc.Push(t.Context(), c, "zero", nil) + err := rc.Push(t.Context(), "zero", nil) if !errors.Is(err, errRoundTrip) { t.Errorf("err = %v; want %v", err, errRoundTrip) } @@ -322,8 +324,8 @@ func checkNotExist(t *testing.T, err error) { } func TestRegistryPullInvalidName(t *testing.T) { - rc, c := newClient(t, nil) - err := rc.Pull(t.Context(), c, "://") + rc, _ := newClient(t, nil) + err := rc.Pull(t.Context(), "://") if !errors.Is(err, ErrNameInvalid) { t.Errorf("err = %v; want %v", err, ErrNameInvalid) } @@ -338,10 +340,10 @@ func TestRegistryPullInvalidManifest(t *testing.T) { } for _, resp := range cases { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { io.WriteString(w, resp) }) - err := rc.Pull(t.Context(), c, "x") + err := rc.Pull(t.Context(), "x") if !errors.Is(err, ErrManifestInvalid) { t.Errorf("err = %v; want invalid manifest", err) } @@ -364,18 +366,18 @@ func TestRegistryPullNotCached(t *testing.T) { }) // Confirm that the 
layer does not exist locally - _, err := rc.ResolveLocal(c, "model") + _, err := rc.ResolveLocal("model") checkNotExist(t, err) _, err = c.Get(d) checkNotExist(t, err) - err = rc.Pull(t.Context(), c, "model") + err = rc.Pull(t.Context(), "model") check(err) mw, err := rc.Resolve(t.Context(), "model") check(err) - mg, err := rc.ResolveLocal(c, "model") + mg, err := rc.ResolveLocal("model") check(err) if !reflect.DeepEqual(mw, mg) { t.Errorf("mw = %v; mg = %v", mw, mg) @@ -400,7 +402,7 @@ func TestRegistryPullNotCached(t *testing.T) { func TestRegistryPullCached(t *testing.T) { cached := blob.DigestFromBytes("exists") - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "/blobs/") { w.WriteHeader(499) // should not be called return @@ -423,7 +425,7 @@ func TestRegistryPullCached(t *testing.T) { ctx, cancel := context.WithTimeout(ctx, 3*time.Second) defer cancel() - err := rc.Pull(ctx, c, "single") + err := rc.Pull(ctx, "single") testutil.Check(t, err) want := []int64{6} @@ -436,30 +438,30 @@ func TestRegistryPullCached(t *testing.T) { } func TestRegistryPullManifestNotFound(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNotFound) }) - err := rc.Pull(t.Context(), c, "notfound") + err := rc.Pull(t.Context(), "notfound") checkErrCode(t, err, 404, "") } func TestRegistryPullResolveRemoteError(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusInternalServerError) io.WriteString(w, `{"errors":[{"code":"an_error"}]}`) }) - err := rc.Pull(t.Context(), c, "single") + err := rc.Pull(t.Context(), "single") checkErrCode(t, err, 500, "an_error") } func TestRegistryPullResolveRoundtripError(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "/manifests/") { w.WriteHeader(499) // force RoundTrip error return } }) - err := rc.Pull(t.Context(), c, "single") + err := rc.Pull(t.Context(), "single") if !errors.Is(err, errRoundTrip) { t.Errorf("err = %v; want %v", err, errRoundTrip) } @@ -512,7 +514,7 @@ func TestRegistryPullMixedCachedNotCached(t *testing.T) { // Check that we pull all layers that we can. - err := rc.Pull(ctx, c, "mixed") + err := rc.Pull(ctx, "mixed") if err != nil { t.Fatal(err) } @@ -530,7 +532,7 @@ func TestRegistryPullMixedCachedNotCached(t *testing.T) { } func TestRegistryPullChunking(t *testing.T) { - rc, c := newClient(t, func(w http.ResponseWriter, r *http.Request) { + rc, _ := newClient(t, func(w http.ResponseWriter, r *http.Request) { t.Log("request:", r.URL.Host, r.Method, r.URL.Path, r.Header.Get("Range")) if r.URL.Host != "blob.store" { // The production registry redirects to the blob store. 
@@ -568,7 +570,7 @@ func TestRegistryPullChunking(t *testing.T) { }, }) - err := rc.Pull(ctx, c, "remote") + err := rc.Pull(ctx, "remote") testutil.Check(t, err) want := []int64{0, 3, 6} @@ -785,27 +787,27 @@ func TestParseNameExtended(t *testing.T) { func TestUnlink(t *testing.T) { t.Run("found by name", func(t *testing.T) { - rc, c := newClient(t, nil) + rc, _ := newClient(t, nil) // confirm linked - _, err := rc.ResolveLocal(c, "single") + _, err := rc.ResolveLocal("single") if err != nil { t.Errorf("unexpected error: %v", err) } // unlink - _, err = rc.Unlink(c, "single") + _, err = rc.Unlink("single") testutil.Check(t, err) // confirm unlinked - _, err = rc.ResolveLocal(c, "single") + _, err = rc.ResolveLocal("single") if !errors.Is(err, fs.ErrNotExist) { t.Errorf("err = %v; want fs.ErrNotExist", err) } }) t.Run("not found by name", func(t *testing.T) { - rc, c := newClient(t, nil) - ok, err := rc.Unlink(c, "manifestNotFound") + rc, _ := newClient(t, nil) + ok, err := rc.Unlink("manifestNotFound") if err != nil { t.Fatal(err) } diff --git a/server/internal/client/ollama/trace.go b/server/internal/client/ollama/trace.go index 8e53040a..e300870b 100644 --- a/server/internal/client/ollama/trace.go +++ b/server/internal/client/ollama/trace.go @@ -6,6 +6,9 @@ import ( // Trace is a set of functions that are called to report progress during blob // downloads and uploads. +// +// Use [WithTrace] to attach a Trace to a context for use with [Registry.Push] +// and [Registry.Pull]. type Trace struct { // Update is called during [Registry.Push] and [Registry.Pull] to // report the progress of blob uploads and downloads. diff --git a/server/internal/cmd/opp/opp.go b/server/internal/cmd/opp/opp.go index c21e71d5..6976927c 100644 --- a/server/internal/cmd/opp/opp.go +++ b/server/internal/cmd/opp/opp.go @@ -63,25 +63,28 @@ func main() { } flag.Parse() - c, err := ollama.DefaultCache() - if err != nil { - log.Fatal(err) - } - - rc, err := ollama.DefaultRegistry() - if err != nil { - log.Fatal(err) - } - ctx := context.Background() - err = func() error { + err := func() error { switch cmd := flag.Arg(0); cmd { case "pull": - return cmdPull(ctx, rc, c) + rc, err := ollama.DefaultRegistry() + if err != nil { + log.Fatal(err) + } + + return cmdPull(ctx, rc) case "push": - return cmdPush(ctx, rc, c) + rc, err := ollama.DefaultRegistry() + if err != nil { + log.Fatal(err) + } + return cmdPush(ctx, rc) case "import": + c, err := ollama.DefaultCache() + if err != nil { + log.Fatal(err) + } return cmdImport(ctx, c) default: if cmd == "" { @@ -99,7 +102,7 @@ func main() { } } -func cmdPull(ctx context.Context, rc *ollama.Registry, c *blob.DiskCache) error { +func cmdPull(ctx context.Context, rc *ollama.Registry) error { model := flag.Arg(1) if model == "" { flag.Usage() @@ -145,7 +148,7 @@ func cmdPull(ctx context.Context, rc *ollama.Registry, c *blob.DiskCache) error errc := make(chan error) go func() { - errc <- rc.Pull(ctx, c, model) + errc <- rc.Pull(ctx, model) }() t := time.NewTicker(time.Second) @@ -161,7 +164,7 @@ func cmdPull(ctx context.Context, rc *ollama.Registry, c *blob.DiskCache) error } } -func cmdPush(ctx context.Context, rc *ollama.Registry, c *blob.DiskCache) error { +func cmdPush(ctx context.Context, rc *ollama.Registry) error { args := flag.Args()[1:] flag := flag.NewFlagSet("push", flag.ExitOnError) flagFrom := flag.String("from", "", "Use the manifest from a model by another name.") @@ -177,7 +180,7 @@ func cmdPush(ctx context.Context, rc *ollama.Registry, c *blob.DiskCache) error } from 
:= cmp.Or(*flagFrom, model) - m, err := rc.ResolveLocal(c, from) + m, err := rc.ResolveLocal(from) if err != nil { return err } @@ -203,7 +206,7 @@ func cmdPush(ctx context.Context, rc *ollama.Registry, c *blob.DiskCache) error }, }) - return rc.Push(ctx, c, model, &ollama.PushParams{ + return rc.Push(ctx, model, &ollama.PushParams{ From: from, }) } diff --git a/server/internal/registry/server.go b/server/internal/registry/server.go index 6ea590a7..81085357 100644 --- a/server/internal/registry/server.go +++ b/server/internal/registry/server.go @@ -11,7 +11,6 @@ import ( "log/slog" "net/http" - "github.com/ollama/ollama/server/internal/cache/blob" "github.com/ollama/ollama/server/internal/client/ollama" ) @@ -27,7 +26,6 @@ import ( // directly to the blob disk cache. type Local struct { Client *ollama.Registry // required - Cache *blob.DiskCache // required Logger *slog.Logger // required // Fallback, if set, is used to handle requests that are not handled by @@ -199,7 +197,7 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error { if err != nil { return err } - ok, err := s.Client.Unlink(s.Cache, p.model()) + ok, err := s.Client.Unlink(p.model()) if err != nil { return err } diff --git a/server/internal/registry/server_test.go b/server/internal/registry/server_test.go index 7ba13d50..e44d88c0 100644 --- a/server/internal/registry/server_test.go +++ b/server/internal/registry/server_test.go @@ -42,10 +42,10 @@ func newTestServer(t *testing.T) *Local { t.Fatal(err) } rc := &ollama.Registry{ + Cache: c, HTTPClient: panicOnRoundTrip, } l := &Local{ - Cache: c, Client: rc, Logger: testutil.Slogger(t), } @@ -87,7 +87,7 @@ func TestServerDelete(t *testing.T) { s := newTestServer(t) - _, err := s.Client.ResolveLocal(s.Cache, "smol") + _, err := s.Client.ResolveLocal("smol") check(err) got := s.send(t, "DELETE", "/api/delete", `{"model": "smol"}`) @@ -95,7 +95,7 @@ func TestServerDelete(t *testing.T) { t.Fatalf("Code = %d; want 200", got.Code) } - _, err = s.Client.ResolveLocal(s.Cache, "smol") + _, err = s.Client.ResolveLocal("smol") if err == nil { t.Fatal("expected smol to have been deleted") } diff --git a/server/routes.go b/server/routes.go index ff42000f..519f0473 100644 --- a/server/routes.go +++ b/server/routes.go @@ -34,7 +34,6 @@ import ( "github.com/ollama/ollama/llm" "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/openai" - "github.com/ollama/ollama/server/internal/cache/blob" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/template" @@ -1129,7 +1128,7 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc { } } -func (s *Server) GenerateRoutes(c *blob.DiskCache, rc *ollama.Registry) (http.Handler, error) { +func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { corsConfig := cors.DefaultConfig() corsConfig.AllowWildcard = true corsConfig.AllowBrowserExtensions = true @@ -1197,7 +1196,6 @@ func (s *Server) GenerateRoutes(c *blob.DiskCache, rc *ollama.Registry) (http.Ha // wrap old with new rs := ®istry.Local{ - Cache: c, Client: rc, Logger: slog.Default(), // TODO(bmizerany): Take a logger, do not use slog.Default() Fallback: r, @@ -1258,16 +1256,12 @@ func Serve(ln net.Listener) error { s := &Server{addr: ln.Addr()} - c, err := ollama.DefaultCache() - if err != nil { - return err - } rc, err := ollama.DefaultRegistry() if err != nil { return err } - h, err := s.GenerateRoutes(c, rc) + h, err := s.GenerateRoutes(rc) if err 
!= nil { return err } diff --git a/server/routes_test.go b/server/routes_test.go index 0dd782f4..e13c4b59 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -23,7 +23,6 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/openai" - "github.com/ollama/ollama/server/internal/cache/blob" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" @@ -490,11 +489,6 @@ func TestRoutes(t *testing.T) { modelsDir := t.TempDir() t.Setenv("OLLAMA_MODELS", modelsDir) - c, err := blob.Open(modelsDir) - if err != nil { - t.Fatalf("failed to open models dir: %v", err) - } - rc := &ollama.Registry{ // This is a temporary measure to allow us to move forward, // surfacing any code contacting ollama.com we do not intended @@ -511,7 +505,7 @@ func TestRoutes(t *testing.T) { } s := &Server{} - router, err := s.GenerateRoutes(c, rc) + router, err := s.GenerateRoutes(rc) if err != nil { t.Fatalf("failed to generate routes: %v", err) } From 1579c4f06d7c6cc96a91f76be3e0dcad7839e9eb Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 3 Mar 2025 01:20:49 -0800 Subject: [PATCH 05/17] build: install binutils alongside gcc in Dockerfile (#9475) --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index f94beae1..91b61362 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,9 +12,8 @@ FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base RUN yum install -y yum-utils \ && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \ && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \ - && dnf install -y yum-utils gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ - && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \ - && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1 + && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \ + && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH FROM --platform=linux/arm64 almalinux:8 AS base-arm64 From 3b1ddb2b3a00598162a69b0441e4e56df3af7666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=B0brahim=20=C3=87etin?= Date: Mon, 3 Mar 2025 22:06:30 +0300 Subject: [PATCH 06/17] docs: add reins to community integrations (#9411) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7bf306d5..ecfbe18e 100644 --- a/README.md +++ b/README.md @@ -388,6 +388,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LangBot](https://github.com/RockChinQ/LangBot) (LLM-based instant messaging bots platform, with Agents, RAG features, supports multiple platforms) - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool) - [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration) +- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.) 
### Cloud @@ -512,6 +513,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid) - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) +- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.) ### Extensions & Plugins From a6f0f908b93c05319fb58b682c2b2123c160af3d Mon Sep 17 00:00:00 2001 From: aritra saha <162728301+olumolu@users.noreply.github.com> Date: Tue, 4 Mar 2025 00:39:21 +0530 Subject: [PATCH 07/17] docs: update phi3-mini to phi4-mini (#9424) * Update README.md removed phi 3 mini and added phi4-mini * Update README.md --------- Co-authored-by: Bruce MacDonald --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ecfbe18e..9c47a6f8 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ Here are some example models that can be downloaded: | Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | | Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | | Phi 4 | 14B | 9.1GB | `ollama run phi4` | -| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | +| Phi 4 Mini | 3.8B | 2.5GB | `ollama run phi4-mini` | | Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` | | Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | | Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` | From 36dfb906bb1e28e4c2133196640ccc7424dddfd7 Mon Sep 17 00:00:00 2001 From: Mark Date: Mon, 3 Mar 2025 14:56:34 -0500 Subject: [PATCH 08/17] docs: don't use self-closing tag for anchor element (#9456) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c47a6f8..fa16aaa2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ From d25efe395415353152feafb9b766a8a7a1496517 Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Tue, 4 Mar 2025 04:13:41 +0800 Subject: [PATCH 09/17] cmd: add default err return for stop (#9458) --- cmd/cmd.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index 80ece4c6..b4001a7f 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -256,6 +256,7 @@ func StopHandler(cmd *cobra.Command, args []string) error { if strings.Contains(err.Error(), "not found") { return fmt.Errorf("couldn't find model \"%s\" to stop", args[0]) } + return err } return nil } From ba7d31240eb0b44064a8220a8a6143307b1459ef Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 3 Mar 2025 11:05:21 -0800 Subject: [PATCH 10/17] fix: own lib/ollama directory expand backend loading error handling to catch more problems and log them instead of panicing --- .../patches/0015-try-catch-backend-load.patch | 69 --------- ...-filesystem-path-instead-of-wstring.patch} | 134 ++++++++++++------ ...remove-amx.patch => 0016-remove-amx.patch} | 0 ...tch => 0017-fix-clip-compiler-error.patch} | 0 ...port.patch => 0018-add-phi4-support.patch} | 0 ml/backend/ggml/ggml/src/ggml-backend-reg.cpp | 74 +++++----- scripts/install.sh | 3 +- 7 files changed, 135 insertions(+), 145 deletions(-) delete mode 100644 llama/patches/0015-try-catch-backend-load.patch rename llama/patches/{0016-use-std-filesystem-path-instead-of-wstring.patch => 0015-use-std-filesystem-path-instead-of-wstring.patch} (67%) rename 
llama/patches/{0017-remove-amx.patch => 0016-remove-amx.patch} (100%) rename llama/patches/{0018-fix-clip-compiler-error.patch => 0017-fix-clip-compiler-error.patch} (100%) rename llama/patches/{0019-add-phi4-support.patch => 0018-add-phi4-support.patch} (100%) diff --git a/llama/patches/0015-try-catch-backend-load.patch b/llama/patches/0015-try-catch-backend-load.patch deleted file mode 100644 index 9aea6183..00000000 --- a/llama/patches/0015-try-catch-backend-load.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Tue, 11 Feb 2025 14:06:36 -0800 -Subject: [PATCH] try/catch backend load - ---- - ggml/src/ggml-backend-reg.cpp | 45 ++++++++++++++++++----------------- - 1 file changed, 23 insertions(+), 22 deletions(-) - -diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 98d5e14d..1c19129a 100644 ---- a/ggml/src/ggml-backend-reg.cpp -+++ b/ggml/src/ggml-backend-reg.cpp -@@ -512,32 +512,33 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, - } - fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); - for (const auto & entry : dir_it) { -- if (entry.is_regular_file()) { -- std::wstring filename = entry.path().filename().wstring(); -- std::wstring ext = entry.path().extension().wstring(); -- if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { -- dl_handle_ptr handle { dl_load_library(entry.path().wstring()) }; -- if (!handle && !silent) { -- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); -- } -- if (handle) { -+ try { -+ if (entry.is_regular_file()) { -+ std::wstring filename = entry.path().filename().wstring(); -+ std::wstring ext = entry.path().extension().wstring(); -+ if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { -+ dl_handle_ptr handle { dl_load_library(entry.path().wstring()) }; -+ if (!handle) { -+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); -+ continue; -+ } -+ - auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); -- if (score_fn) { -- int s = score_fn(); --#ifndef NDEBUG -- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s); --#endif -- if (s > best_score) { -- best_score = s; -- best_path = entry.path().wstring(); -- } -- } else { -- if (!silent) { -- GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); -- } -+ if (!score_fn) { -+ GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); -+ continue; -+ } -+ -+ int s = score_fn(); -+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s); -+ if (s > best_score) { -+ best_score = s; -+ best_path = entry.path().wstring(); - } - } - } -+ } catch (const std::exception & e) { -+ GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what()); - } - } - } diff --git a/llama/patches/0016-use-std-filesystem-path-instead-of-wstring.patch b/llama/patches/0015-use-std-filesystem-path-instead-of-wstring.patch similarity index 67% rename from llama/patches/0016-use-std-filesystem-path-instead-of-wstring.patch rename to llama/patches/0015-use-std-filesystem-path-instead-of-wstring.patch index 
d60066c1..e72d78ac 100644 --- a/llama/patches/0016-use-std-filesystem-path-instead-of-wstring.patch +++ b/llama/patches/0015-use-std-filesystem-path-instead-of-wstring.patch @@ -4,11 +4,11 @@ Date: Sun, 16 Feb 2025 20:00:22 -0500 Subject: [PATCH] use std::filesystem::path instead of wstring --- - ggml/src/ggml-backend-reg.cpp | 144 ++++++++++++++-------------------- - 1 file changed, 58 insertions(+), 86 deletions(-) + ggml/src/ggml-backend-reg.cpp | 199 +++++++++++++++------------------- + 1 file changed, 88 insertions(+), 111 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 1c19129a..c854e6bb 100644 +index 98d5e14d..799af5f3 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -66,26 +66,6 @@ @@ -264,47 +264,55 @@ index 1c19129a..c854e6bb 100644 for (const auto & search_path : search_paths) { if (!fs::exists(search_path)) { continue; -@@ -514,31 +486,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, +@@ -513,29 +485,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, + fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); for (const auto & entry : dir_it) { - try { - if (entry.is_regular_file()) { -- std::wstring filename = entry.path().filename().wstring(); -- std::wstring ext = entry.path().extension().wstring(); -+ std::string filename = entry.path().filename().string(); -+ std::string ext = entry.path().extension().string(); - if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { -- dl_handle_ptr handle { dl_load_library(entry.path().wstring()) }; -+ dl_handle_ptr handle { dl_load_library(entry.path()) }; - if (!handle) { -- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); -+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str()); - continue; - } - - auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); - if (!score_fn) { -- GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); -+ GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str()); - continue; - } - - int s = score_fn(); -- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s); -+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s); - if (s > best_score) { - best_score = s; -- best_path = entry.path().wstring(); -+ best_path = entry.path(); - } + if (entry.is_regular_file()) { +- std::wstring filename = entry.path().filename().wstring(); +- std::wstring ext = entry.path().extension().wstring(); ++ std::string filename = entry.path().filename().string(); ++ std::string ext = entry.path().extension().string(); + if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { +- dl_handle_ptr handle { dl_load_library(entry.path().wstring()) }; +- if (!handle && !silent) { +- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); ++ dl_handle_ptr handle { dl_load_library(entry.path()) }; ++ if (!handle) { ++ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str()); ++ continue; + } +- if (handle) { +- auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); +- if (score_fn) { +- int s = score_fn(); 
+-#ifndef NDEBUG +- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s); +-#endif +- if (s > best_score) { +- best_score = s; +- best_path = entry.path().wstring(); +- } +- } else { +- if (!silent) { +- GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); +- } +- } ++ ++ auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); ++ if (!score_fn) { ++ GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str()); ++ continue; ++ } ++ ++ int s = score_fn(); ++ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s); ++ if (s > best_score) { ++ best_score = s; ++ best_path = entry.path(); } } - } catch (const std::exception & e) { -- GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what()); -+ GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_to_string(entry.path()).c_str(), e.what()); } - } - } -@@ -546,7 +518,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, +@@ -545,7 +514,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, if (best_score == 0) { // try to load the base backend for (const auto & search_path : search_paths) { @@ -313,3 +321,49 @@ index 1c19129a..c854e6bb 100644 if (fs::exists(path)) { return get_reg().load_backend(path, silent); } +@@ -560,6 +529,14 @@ void ggml_backend_load_all() { + ggml_backend_load_all_from_path(nullptr); + } + ++static void ggml_backend_try_load_best(const char * name, bool silent, const char * user_search_path) { ++ try { ++ ggml_backend_load_best(name, silent, user_search_path); ++ } catch (const std::exception & e) { ++ GGML_LOG_DEBUG("%s: failed to load %s: %s\n", __func__, name, e.what()); ++ } ++} ++ + void ggml_backend_load_all_from_path(const char * dir_path) { + #ifdef NDEBUG + bool silent = true; +@@ -567,18 +544,18 @@ void ggml_backend_load_all_from_path(const char * dir_path) { + bool silent = false; + #endif + +- ggml_backend_load_best("blas", silent, dir_path); +- ggml_backend_load_best("cann", silent, dir_path); +- ggml_backend_load_best("cuda", silent, dir_path); +- ggml_backend_load_best("hip", silent, dir_path); +- ggml_backend_load_best("kompute", silent, dir_path); +- ggml_backend_load_best("metal", silent, dir_path); +- ggml_backend_load_best("rpc", silent, dir_path); +- ggml_backend_load_best("sycl", silent, dir_path); +- ggml_backend_load_best("vulkan", silent, dir_path); +- ggml_backend_load_best("opencl", silent, dir_path); +- ggml_backend_load_best("musa", silent, dir_path); +- ggml_backend_load_best("cpu", silent, dir_path); ++ ggml_backend_try_load_best("blas", silent, dir_path); ++ ggml_backend_try_load_best("cann", silent, dir_path); ++ ggml_backend_try_load_best("cuda", silent, dir_path); ++ ggml_backend_try_load_best("hip", silent, dir_path); ++ ggml_backend_try_load_best("kompute", silent, dir_path); ++ ggml_backend_try_load_best("metal", silent, dir_path); ++ ggml_backend_try_load_best("rpc", silent, dir_path); ++ ggml_backend_try_load_best("sycl", silent, dir_path); ++ ggml_backend_try_load_best("vulkan", silent, dir_path); ++ ggml_backend_try_load_best("opencl", silent, dir_path); ++ ggml_backend_try_load_best("musa", silent, dir_path); ++ ggml_backend_try_load_best("cpu", silent, dir_path); + // check the environment variable GGML_BACKEND_PATH to load an 
out-of-tree backend + const char * backend_path = std::getenv("GGML_BACKEND_PATH"); + if (backend_path) { diff --git a/llama/patches/0017-remove-amx.patch b/llama/patches/0016-remove-amx.patch similarity index 100% rename from llama/patches/0017-remove-amx.patch rename to llama/patches/0016-remove-amx.patch diff --git a/llama/patches/0018-fix-clip-compiler-error.patch b/llama/patches/0017-fix-clip-compiler-error.patch similarity index 100% rename from llama/patches/0018-fix-clip-compiler-error.patch rename to llama/patches/0017-fix-clip-compiler-error.patch diff --git a/llama/patches/0019-add-phi4-support.patch b/llama/patches/0018-add-phi4-support.patch similarity index 100% rename from llama/patches/0019-add-phi4-support.patch rename to llama/patches/0018-add-phi4-support.patch diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp index c854e6bb..799af5f3 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp @@ -484,33 +484,29 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, } fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); for (const auto & entry : dir_it) { - try { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - std::string ext = entry.path().extension().string(); - if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { - dl_handle_ptr handle { dl_load_library(entry.path()) }; - if (!handle) { - GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str()); - continue; - } + if (entry.is_regular_file()) { + std::string filename = entry.path().filename().string(); + std::string ext = entry.path().extension().string(); + if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { + dl_handle_ptr handle { dl_load_library(entry.path()) }; + if (!handle) { + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str()); + continue; + } - auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); - if (!score_fn) { - GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str()); - continue; - } + auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); + if (!score_fn) { + GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str()); + continue; + } - int s = score_fn(); - GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s); - if (s > best_score) { - best_score = s; - best_path = entry.path(); - } + int s = score_fn(); + GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s); + if (s > best_score) { + best_score = s; + best_path = entry.path(); } } - } catch (const std::exception & e) { - GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_to_string(entry.path()).c_str(), e.what()); } } } @@ -533,6 +529,14 @@ void ggml_backend_load_all() { ggml_backend_load_all_from_path(nullptr); } +static void ggml_backend_try_load_best(const char * name, bool silent, const char * user_search_path) { + try { + ggml_backend_load_best(name, silent, user_search_path); + } catch (const std::exception & e) { + GGML_LOG_DEBUG("%s: failed to load %s: %s\n", __func__, name, e.what()); + } +} + void ggml_backend_load_all_from_path(const char * dir_path) { #ifdef 
NDEBUG bool silent = true; @@ -540,18 +544,18 @@ void ggml_backend_load_all_from_path(const char * dir_path) { bool silent = false; #endif - ggml_backend_load_best("blas", silent, dir_path); - ggml_backend_load_best("cann", silent, dir_path); - ggml_backend_load_best("cuda", silent, dir_path); - ggml_backend_load_best("hip", silent, dir_path); - ggml_backend_load_best("kompute", silent, dir_path); - ggml_backend_load_best("metal", silent, dir_path); - ggml_backend_load_best("rpc", silent, dir_path); - ggml_backend_load_best("sycl", silent, dir_path); - ggml_backend_load_best("vulkan", silent, dir_path); - ggml_backend_load_best("opencl", silent, dir_path); - ggml_backend_load_best("musa", silent, dir_path); - ggml_backend_load_best("cpu", silent, dir_path); + ggml_backend_try_load_best("blas", silent, dir_path); + ggml_backend_try_load_best("cann", silent, dir_path); + ggml_backend_try_load_best("cuda", silent, dir_path); + ggml_backend_try_load_best("hip", silent, dir_path); + ggml_backend_try_load_best("kompute", silent, dir_path); + ggml_backend_try_load_best("metal", silent, dir_path); + ggml_backend_try_load_best("rpc", silent, dir_path); + ggml_backend_try_load_best("sycl", silent, dir_path); + ggml_backend_try_load_best("vulkan", silent, dir_path); + ggml_backend_try_load_best("opencl", silent, dir_path); + ggml_backend_try_load_best("musa", silent, dir_path); + ggml_backend_try_load_best("cpu", silent, dir_path); // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend const char * backend_path = std::getenv("GGML_BACKEND_PATH"); if (backend_path) { diff --git a/scripts/install.sh b/scripts/install.sh index 9e146e50..9c232400 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -77,11 +77,12 @@ if [ -d "$OLLAMA_INSTALL_DIR/lib/ollama" ] ; then fi status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR -$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" +$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR/lib/ollama" status "Downloading Linux ${ARCH} bundle" curl --fail --show-error --location --progress-bar \ "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then status "Making ollama accessible in the PATH in $BINDIR" $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" From b428ddd79686a0c72db51f55fe0673e47cbf094e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 3 Mar 2025 09:55:00 -0800 Subject: [PATCH 11/17] docker: use go version from go.mod --- Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 91b61362..4136fca7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -86,10 +86,11 @@ RUN --mount=type=cache,target=/root/.ccache \ && cmake --install build --component CUDA --strip --parallel 8 FROM base AS build -ARG GOVERSION=1.23.4 -RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local -ENV PATH=/usr/local/go/bin:$PATH WORKDIR /go/src/github.com/ollama/ollama +COPY go.mod go.sum . +RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local +ENV PATH=/usr/local/go/bin:$PATH +RUN go mod download COPY . . 
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1

From fefbf8f74bdedeb4a6066f0b958a52194de91240 Mon Sep 17 00:00:00 2001
From: KindBrave <47859250+sunshine0523@users.noreply.github.com>
Date: Tue, 4 Mar 2025 08:38:32 +0800
Subject: [PATCH 12/17] docs: add Ollama Android Chat community integration

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index fa16aaa2..16b0d68e 100644
--- a/README.md
+++ b/README.md
@@ -513,6 +513,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
+- [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
 
 ### Extensions & Plugins

From 55ab9f371a198a190b423763a1d48e4495fe520e Mon Sep 17 00:00:00 2001
From: Blake Mizerany
Date: Mon, 3 Mar 2025 16:45:40 -0800
Subject: [PATCH 13/17] server/.../backoff,syncs: don't break builds without synctest (#9484)

Previously, developers without the synctest experiment enabled would see
build failures when running tests in some server/internal/internal
packages using the synctest package.

This change makes the transition to use of the package less painful but
guards the use of the synctest package with build tags.

synctest is enabled in CI. If a new change will break a synctest
package, it will break in CI, even if it does not break locally.

The developer docs have been updated to help with any confusion about
why package tests pass locally but fail in CI.
---
 docs/development.md                           | 29 +++++++++++++++++++
 .../internal/internal/backoff/backoff_test.go |  2 ++
 server/internal/internal/syncs/line_test.go   |  2 ++
 3 files changed, 33 insertions(+)

diff --git a/docs/development.md b/docs/development.md
index eb67dbfa..cf6d91e2 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,6 +118,35 @@ To run tests, use `go test`:
 go test ./...
 ```
 
+> NOTE: In rare circumstances, you may need to change a package that uses the
+> new "synctest" package in go1.24.
+>
+> If you do not have the "synctest" package enabled, you will not see build or
+> test failures resulting from your changes locally, but CI will
+> break.
+>
+> If you see failures in CI, you can either keep pushing changes to see if the
+> CI build passes, or you can enable the "synctest" package locally to see the
+> failures before pushing.
+>
+> To enable the "synctest" package for testing, run the following command:
+>
+> ```shell
+> GOEXPERIMENT=synctest go test ./...
+> ```
+>
+> If you wish to enable synctest for all go commands, you can set the
+> `GOEXPERIMENT` environment variable in your shell profile or by using:
+>
+> ```shell
+> go env -w GOEXPERIMENT=synctest
+> ```
+>
+> This enables the "synctest" package for all go commands without needing to
+> set it in every shell session.
+>
+> The synctest package is not required for production builds.
+ ## Library detection Ollama looks for acceleration libraries in the following paths relative to the `ollama` executable: diff --git a/server/internal/internal/backoff/backoff_test.go b/server/internal/internal/backoff/backoff_test.go index bb8438a7..11ace22a 100644 --- a/server/internal/internal/backoff/backoff_test.go +++ b/server/internal/internal/backoff/backoff_test.go @@ -1,3 +1,5 @@ +//go:build goexperiment.synctest + package backoff import ( diff --git a/server/internal/internal/syncs/line_test.go b/server/internal/internal/syncs/line_test.go index d5216026..94114a56 100644 --- a/server/internal/internal/syncs/line_test.go +++ b/server/internal/internal/syncs/line_test.go @@ -1,3 +1,5 @@ +//go:build goexperiment.synctest + package syncs import ( From 7a01ad76143973199bd6965c13476d2d04f10f75 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Mon, 3 Mar 2025 19:11:16 -0800 Subject: [PATCH 14/17] server/internal/registry: reintroduce pruning on model deletion (#9489) This reintroduces aggressive pruning on model deletion as a temporary measure until a more controlled garbage collection (GC) mechanism is implemented. Issues with the current approach: 1. Users may accidentally delete a model (`ollama rm llama3.3` instead of `ollama rm llama3.2`), requiring a full re-download unless another model references the same blobs. 2. Users may assume a deleted model is still referenced elsewhere, but due to prior updates or deletions, the references no longer exist, leading to unnecessary re-downloads. Soon, we should implement a structured GC mechanism to retain unreferenced blobs for a configurable period before removal, which will run on "ollama rm" and other commands we deem appropriate. Users that want to immediately remove unreferenced blobs can use a new prune command that will allow them to specify the age and class of blobs to remove. Example usage: # Run basic blob GC $ ollama prune # Remove unreferenced blobs older than 7 days $ ollama prune --age 7d # Remove all blobs, referenced or not, older than 7 days (and their manifests?) $ ollama prune --age 7d --all # Remove all unreferenced blobs immediately $ ollama prune --age 0 --all # Remove all blobs $ ollama prune --age 0 --all This should provide a safer and more predictable cleanup process. --- server/internal/registry/server.go | 9 ++++++++- server/routes.go | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/server/internal/registry/server.go b/server/internal/registry/server.go index 81085357..4d44aa8d 100644 --- a/server/internal/registry/server.go +++ b/server/internal/registry/server.go @@ -31,6 +31,10 @@ type Local struct { // Fallback, if set, is used to handle requests that are not handled by // this handler. Fallback http.Handler + + // Prune, if set, is called to prune the local disk cache after a model + // is deleted. 
+ Prune func() error // optional } // serverError is like ollama.Error, but with a Status field for the HTTP @@ -204,7 +208,10 @@ func (s *Local) handleDelete(_ http.ResponseWriter, r *http.Request) error { if !ok { return &serverError{404, "not_found", "model not found"} } - return nil + if s.Prune == nil { + return nil + } + return s.Prune() } func decodeUserJSON[T any](r io.Reader) (T, error) { diff --git a/server/routes.go b/server/routes.go index 519f0473..f1afad6e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1199,6 +1199,8 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { Client: rc, Logger: slog.Default(), // TODO(bmizerany): Take a logger, do not use slog.Default() Fallback: r, + + Prune: PruneLayers, } return rs, nil From 1fdb351c37a445fb2e8fdad19fff88f6d85b2912 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 4 Mar 2025 09:03:46 -0800 Subject: [PATCH 15/17] New engine: vision models and auto-fallback (#9113) * Include unified vision layers in memory prediction For newer vision models with a single gguf, include the projection estimates. * Adjust CLI to handle both styles of vision model metadata * Wire up new tokenizers for new engine If we're loading the new engine, utilize the new model text processor instead of calling into cgo wrappers for llama.cpp. This also cleans up some tech debt from the older tokenization flow for the C++ server which was no longer used. This also adjusts the grammar handling logic to pass through to the new engine instead of utilizing the cgo schema to grammar call. * Lay foundation for auto selection of new engine --- cmd/cmd.go | 14 +- fs/ggml/ggml.go | 37 +++++ llm/memory.go | 3 + llm/server.go | 278 +++++++++++++++-------------------- model/model.go | 31 ++++ model/model_test.go | 39 +++++ model/models/llama/model.go | 6 + model/models/mllama/model.go | 6 + server/prompt.go | 3 +- server/routes.go | 2 +- 10 files changed, 249 insertions(+), 170 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index b4001a7f..159de9a6 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -339,10 +339,16 @@ func RunHandler(cmd *cobra.Command, args []string) error { return err } - // TODO(jessegross): We should either find another way to know if this is - // a vision model or remove the logic. Also consider that other modalities will - // need different behavior anyways. 
- opts.MultiModal = len(info.ProjectorInfo) != 0 || envconfig.NewEngine() + if len(info.ProjectorInfo) != 0 { + opts.MultiModal = true + } + for k := range info.ModelInfo { + if strings.Contains(k, ".vision.") { + opts.MultiModal = true + break + } + } + opts.ParentModel = info.Details.ParentModel if interactive { diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index b9f9cc17..8662c3b0 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -565,6 +565,43 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO return } +func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { + switch llm.KV().Architecture() { + case "mllama": + for _, layer := range llm.Tensors().GroupLayers()["v"] { + weights += layer.Size() + } + + kv := func(n string) uint64 { + if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok { + return uint64(v) + } + + return 0 + } + + imageSize := kv("image_size") + + maxNumTiles := kv("max_num_tiles") + embeddingLength := kv("embedding_length") + headCount := kv("attention.head_count") + + numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) + if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { + numPatches++ + } + + numPaddedPatches := numPatches + 8 - (numPatches%8)%8 + + graphSize = 4 * (8 + + imageSize*imageSize*kv("num_channels")*maxNumTiles + + embeddingLength*numPatches*maxNumTiles + + 9*embeddingLength*numPaddedPatches*maxNumTiles + + numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) + } + return weights, graphSize +} + // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType) diff --git a/llm/memory.go b/llm/memory.go index 1da4d2c0..40104eca 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -115,6 +115,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) } + if projectorWeights == 0 && projectorGraph == 0 { + projectorWeights, projectorGraph = f.VisionGraphSize() + } layers := f.Tensors().GroupLayers() // add one layer worth of memory as a buffer diff --git a/llm/server.go b/llm/server.go index fd027a53..09690a5f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -30,6 +30,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llama" + "github.com/ollama/ollama/model" ) type LlamaServer interface { @@ -54,8 +55,15 @@ type llmServer struct { options api.Options numParallel int modelPath string - modelLock sync.Mutex // Temporary until we switch fully to Go server - model *llama.Model // If non-nil, the runner is a new Go server + + // llamaModel is an instance of the cgo llama.cpp model definition + // nil if this server is running the new engine + llamaModel *llama.Model + llamaModelLock sync.Mutex + + // textProcessor handles text encoding/decoding for the model in the Ollama engine + // nil if this server is running the llama.cpp based engine + textProcessor model.TextProcessor estimate MemoryEstimate totalLayers uint64 @@ -89,7 +97,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { // NewLlamaServer will run a server for the given GPUs // The gpu list must be a single family. 
-func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { +func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { systemInfo := discover.GetSystemInfo() systemTotalMemory := systemInfo.System.TotalMemory systemFreeMemory := systemInfo.System.FreeMemory @@ -130,7 +138,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt slog.Info("offload", "", estimate) params := []string{ - "--model", model, + "--model", modelPath, "--ctx-size", strconv.Itoa(opts.NumCtx), "--batch-size", strconv.Itoa(opts.NumBatch), } @@ -153,11 +161,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt } } - if len(projectors) > 0 { - // TODO: applying multiple projectors is not supported by the llama.cpp server yet - params = append(params, "--mmproj", projectors[0]) - } - defaultThreads := systemInfo.GetOptimalThreadCount() if opts.NumThread > 0 { params = append(params, "--threads", strconv.Itoa(opts.NumThread)) @@ -257,6 +260,34 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt } } slog.Debug("compatible gpu libraries", "compatible", compatible) + exe, err := os.Executable() + if err != nil { + return nil, fmt.Errorf("unable to lookup executable path: %w", err) + } + + if eval, err := filepath.EvalSymlinks(exe); err == nil { + exe = eval + } + + var llamaModel *llama.Model + var textProcessor model.TextProcessor + if envconfig.NewEngine() { + textProcessor, err = model.NewTextProcessor(modelPath) + if err != nil { + // To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner + slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err) + } + } + if textProcessor == nil { + llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true}) + if err != nil { + return nil, err + } + } + + if len(projectors) > 0 && llamaModel != nil { + params = append(params, "--mmproj", projectors[0]) + } // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc. // adding each library's respective path to the LD_LIBRARY_PATH, until finally running @@ -275,7 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range } finalParams := []string{"runner"} - if envconfig.NewEngine() { + if textProcessor != nil { + // New engine + // TODO - if we have failure to load scenarios, add logic to retry with the old runner finalParams = append(finalParams, "--ollama-engine") } finalParams = append(finalParams, params...) 
@@ -315,28 +348,20 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt // finally, add the root library path libraryPaths = append(libraryPaths, discover.LibOllamaPath) - exe, err := os.Executable() - if err != nil { - return nil, fmt.Errorf("unable to lookup executable path: %w", err) - } - - if eval, err := filepath.EvalSymlinks(exe); err == nil { - exe = eval - } - - // TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access s := &llmServer{ - port: port, - cmd: exec.Command(exe, finalParams...), - status: NewStatusWriter(os.Stderr), - options: opts, - modelPath: model, - estimate: estimate, - numParallel: numParallel, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: f.KV().BlockCount() + 1, - gpus: gpus, - done: make(chan error, 1), + port: port, + cmd: exec.Command(exe, finalParams...), + status: NewStatusWriter(os.Stderr), + options: opts, + modelPath: modelPath, + llamaModel: llamaModel, + textProcessor: textProcessor, + estimate: estimate, + numParallel: numParallel, + sem: semaphore.NewWeighted(int64(numParallel)), + totalLayers: f.KV().BlockCount() + 1, + gpus: gpus, + done: make(chan error, 1), } s.cmd.Env = os.Environ() @@ -405,6 +430,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt } err := fmt.Errorf("error starting runner: %v %s", err, msg) if len(compatible) == 0 { + if llamaModel != nil { + llama.FreeModel(llamaModel) + } return nil, err } @@ -701,24 +729,29 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } if len(req.Format) > 0 { - switch string(req.Format) { - case `null`, `""`: - // Field was set, but "missing" a value. We accept - // these as "not set". - break - case `"json"`: - request["grammar"] = grammarJSON - default: - if req.Format[0] != '{' { - return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format) - } + format := string(req.Format) + if format != `null` && format != `""` { + if s.textProcessor != nil { + // New engine handles this on the backend + request["format"] = req.Format + } else { + // old engine + switch format { + case `"json"`: + request["grammar"] = grammarJSON + default: + if req.Format[0] != '{' { + return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format) + } - // User provided a JSON schema - g := llama.SchemaToGrammar(req.Format) - if g == nil { - return fmt.Errorf("invalid JSON schema in format") + // User provided a JSON schema + g := llama.SchemaToGrammar(req.Format) + if g == nil { + return fmt.Errorf("invalid JSON schema in format") + } + request["grammar"] = string(g) + } } - request["grammar"] = string(g) } } @@ -933,64 +966,25 @@ type TokenizeResponse struct { } func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) { - s.modelLock.Lock() - defer s.modelLock.Unlock() - if s.model != nil { - return s.model.Tokenize(content, false, true) - } + s.llamaModelLock.Lock() + defer s.llamaModelLock.Unlock() - // Make sure the server is ready - status, err := s.getServerStatus(ctx) - if err != nil { - return nil, err - } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { - return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) + if s.llamaModel != nil { + return s.llamaModel.Tokenize(content, false, true) } - - data, err := json.Marshal(TokenizeRequest{Content: content}) - if err != nil { - return nil, 
fmt.Errorf("marshaling encode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data)) - if err != nil { - return nil, fmt.Errorf("encode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return nil, fmt.Errorf("do encode request: %w", err) - } - defer resp.Body.Close() - if resp.StatusCode == http.StatusNotFound { - if s.model == nil { - slog.Debug("new runner detected, loading model for cgo tokenization") - m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true}) - if err != nil { - return nil, err - } - s.model = m + if s.textProcessor != nil { + tokens, err := s.textProcessor.Encode(content) + if err != nil { + return nil, err } - return s.model.Tokenize(content, false, true) + toks := make([]int, len(tokens)) + for i, t := range tokens { + toks[i] = int(t) + } + return toks, nil } - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("read encode request: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm encode error: %s", body) - return nil, fmt.Errorf("%s", body) - } - - var encoded TokenizeResponse - if err := json.Unmarshal(body, &encoded); err != nil { - return nil, fmt.Errorf("unmarshal encode response: %w", err) - } - - return encoded.Tokens, nil + // not reached + return nil, fmt.Errorf("no tokenizer configured") } type DetokenizeRequest struct { @@ -1002,80 +996,38 @@ type DetokenizeResponse struct { } func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) { - s.modelLock.Lock() - defer s.modelLock.Unlock() - if s.model != nil { + s.llamaModelLock.Lock() + defer s.llamaModelLock.Unlock() + + if s.llamaModel != nil { var resp string for _, token := range tokens { - resp += s.model.TokenToPiece(token) + resp += s.llamaModel.TokenToPiece(token) } return resp, nil } - // Make sure the server is ready - status, err := s.getServerStatus(ctx) - if err != nil { - return "", err - } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { - return "", fmt.Errorf("unexpected server status: %s", status.ToString()) - } - - data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) - if err != nil { - return "", fmt.Errorf("marshaling decode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data)) - if err != nil { - return "", fmt.Errorf("decode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return "", fmt.Errorf("do decode request: %w", err) - } - defer resp.Body.Close() - if resp.StatusCode == http.StatusNotFound { - if s.model == nil { - slog.Debug("new runner detected, loading model for cgo tokenization") - m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true}) - if err != nil { - return "", err - } - s.model = m + if s.textProcessor != nil { + toks := make([]int32, len(tokens)) + for i, t := range tokens { + toks[i] = int32(t) } - var resp string - for _, token := range tokens { - resp += s.model.TokenToPiece(token) + content, err := s.textProcessor.Decode(toks) + if err != nil { + return "", err } - return resp, nil + return content, nil } - - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", fmt.Errorf("read decode request: %w", err) 
- } - - if resp.StatusCode >= 400 { - log.Printf("llm decode error: %s", body) - return "", fmt.Errorf("%s", body) - } - - var decoded DetokenizeResponse - if err := json.Unmarshal(body, &decoded); err != nil { - return "", fmt.Errorf("unmarshal encode response: %w", err) - } - - return decoded.Content, nil + // not reached + return "", fmt.Errorf("no tokenizer configured") } func (s *llmServer) Close() error { - s.modelLock.Lock() - if s.model != nil { - llama.FreeModel(s.model) - s.model = nil + s.llamaModelLock.Lock() + if s.llamaModel != nil { + llama.FreeModel(s.llamaModel) + s.llamaModel = nil } - s.modelLock.Unlock() + s.llamaModelLock.Unlock() if s.cmd != nil { slog.Debug("stopping llama server") diff --git a/model/model.go b/model/model.go index 16020b35..f8ed8741 100644 --- a/model/model.go +++ b/model/model.go @@ -16,6 +16,7 @@ import ( _ "golang.org/x/image/tiff" _ "golang.org/x/image/webp" + fs "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" _ "github.com/ollama/ollama/ml/backend" @@ -100,6 +101,36 @@ func New(modelPath string, params ml.BackendParams) (Model, error) { return m, nil } +func NewTextProcessor(s string) (TextProcessor, error) { + r, err := os.Open(s) + if err != nil { + return nil, err + } + defer r.Close() + meta, _, err := fs.Decode(r, -1) + if err != nil { + return nil, err + } + return getTextProcessor(meta.KV()) +} + +func getTextProcessor(kv fs.KV) (TextProcessor, error) { + arch := kv.Architecture() + f, ok := models[arch] + if !ok { + return nil, fmt.Errorf("unsupported model architecture %q", arch) + } + m, err := f(kv) + if err != nil { + return nil, err + } + tp, ok := m.(TextProcessor) + if !ok { + return nil, fmt.Errorf("%v is not a TextProcessor", m) + } + return tp, nil +} + func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value { t := v.Type() diff --git a/model/model_test.go b/model/model_test.go index 02b8aa3c..8761817e 100644 --- a/model/model_test.go +++ b/model/model_test.go @@ -3,9 +3,11 @@ package model import ( "reflect" "slices" + "strings" "testing" "github.com/google/go-cmp/cmp" + fs "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/backend/ggml" "github.com/ollama/ollama/ml/nn" @@ -134,3 +136,40 @@ func TestPopulateFieldsAlternateName(t *testing.T) { t.Errorf("populateFields() set incorrect values (-want +got):\n%s", diff) } } + +func TestGetTextProcessor(t *testing.T) { + tp, err := getTextProcessor(fs.KV{}) + if err == nil { + t.Error("expected error") + } else if !strings.Contains(err.Error(), "unsupported model architecture") { + t.Errorf("unexpected error: %v", err) + } else if tp != nil { + t.Error("expected nil tp") + } + + models["dummy"] = func(ml.Config) (Model, error) { + return notTextProcessorModel{}, nil + } + tp, err = getTextProcessor(fs.KV{"general.architecture": "dummy"}) + if err == nil { + t.Error("expected error") + } else if !strings.Contains(err.Error(), "not a TextProcessor") { + t.Errorf("unexpected error: %v", err) + } else if tp != nil { + t.Error("expected nil tp") + } +} + +type notTextProcessorModel struct{} + +func (notTextProcessorModel) Forward(ml.Context, Options) (ml.Tensor, error) { + panic("unimplemented") +} + +func (notTextProcessorModel) Backend() ml.Backend { + panic("unimplemented") +} + +func (notTextProcessorModel) Config() config { + panic("unimplemented") +} diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 9bf6f497..2f254a28 100644 --- 
a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -1,7 +1,9 @@ package llama import ( + "fmt" "math" + "strings" "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" @@ -29,6 +31,10 @@ type Model struct { } func New(c ml.Config) (model.Model, error) { + if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") { + return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model")) + } + m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 743f4c32..8fee0cdb 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -1,6 +1,8 @@ package mllama import ( + "fmt" + "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" @@ -25,6 +27,10 @@ const ( ) func New(c ml.Config) (model.Model, error) { + // Verify unified config + if c.Uint("vision.block_count") == 0 { + return nil, fmt.Errorf("non-unified vision model not supported") + } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), diff --git a/server/prompt.go b/server/prompt.go index 233dffd6..5b5b958f 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -10,7 +10,6 @@ import ( "strings" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/template" @@ -93,7 +92,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. var imgData llm.ImageData if isMllama { - if envconfig.NewEngine() { + if len(m.ProjectorPaths) == 0 { imgData = llm.ImageData{ ID: len(images), Data: i, diff --git a/server/routes.go b/server/routes.go index f1afad6e..73e94dc6 100644 --- a/server/routes.go +++ b/server/routes.go @@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { - if isMllama && !envconfig.NewEngine() { + if isMllama && len(model.ProjectorPaths) > 0 { data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) if err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) From 8fe6f69f2828d66df7fafa02cf6ee5cb030ef87d Mon Sep 17 00:00:00 2001 From: aritra saha <162728301+olumolu@users.noreply.github.com> Date: Wed, 5 Mar 2025 00:40:56 +0530 Subject: [PATCH 16/17] docs: add granite-3.2 to the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 16b0d68e..5aa4801e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Here are some example models that can be downloaded: | Code Llama | 7B | 3.8GB | `ollama run codellama` | | Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` | | LLaVA | 7B | 4.5GB | `ollama run llava` | -| Solar | 10.7B | 6.1GB | `ollama run solar` | +| Granite-3.2 | 8B | 4.9GB | `ollama run granite3.2` | > [!NOTE] > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. 
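
The runner-selection change in PATCH 15/17 above reduces to one decision inside NewLlamaServer: when envconfig.NewEngine() reports that the new engine was requested, try to build a Go text processor for the model, and quietly fall back to a vocab-only llama.cpp model when the architecture is not yet supported. The Go sketch below condenses that logic from the diff; pickTokenizer is a hypothetical helper name used only for illustration, and the model path in main is a placeholder.

    package main

    import (
        "fmt"
        "log/slog"

        "github.com/ollama/ollama/envconfig"
        "github.com/ollama/ollama/llama"
        "github.com/ollama/ollama/model"
    )

    // pickTokenizer mirrors the selection done in NewLlamaServer: exactly one
    // of the first two return values is non-nil on success.
    func pickTokenizer(modelPath string) (model.TextProcessor, *llama.Model, error) {
        if envconfig.NewEngine() {
            tp, err := model.NewTextProcessor(modelPath)
            if err == nil {
                return tp, nil, nil
            }
            // Not fatal: fall back to the llama.cpp runner, as the patch does.
            slog.Debug("model not yet supported by the Ollama engine", "model", modelPath, "error", err)
        }
        lm, err := llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
        if err != nil {
            return nil, nil, err
        }
        return nil, lm, nil
    }

    func main() {
        tp, lm, err := pickTokenizer("/path/to/model.gguf") // placeholder path
        if err != nil {
            panic(err)
        }
        fmt.Println("new engine:", tp != nil, "llama.cpp fallback:", lm != nil)
    }

Tokenize and Detokenize in llm/server.go then route to whichever of the two handles is non-nil, which is why the HTTP /tokenize and /detokenize round-trips could be removed in this patch.
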
From 05a01fdecbf9077613c57874b3f8eb7919f76527 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 28 Feb 2025 18:06:47 -0800 Subject: [PATCH 17/17] ml/backend/ggml: consolidate system info logging - output backend system info when initializing the backend. this ensures this information is always present without needing to be called explicitly - convert to structured logging - enumerate devices rather than backends since devices are ordered - track device indices grouped by device name --- cmd/cmd.go | 2 -- llama/llama.go | 25 -------------- ml/backend.go | 1 - ml/backend/ggml/ggml.go | 59 ++++---------------------------- ml/backend/ggml/ggml/src/ggml.go | 54 +++++++++++++++++++++++++++++ runner/llamarunner/runner.go | 1 - runner/ollamarunner/runner.go | 2 -- 7 files changed, 60 insertions(+), 84 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 159de9a6..c22a08f4 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -34,7 +34,6 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" - "github.com/ollama/ollama/llama" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/runner" @@ -1281,7 +1280,6 @@ func NewCLI() *cobra.Command { runnerCmd := &cobra.Command{ Use: "runner", - Short: llama.PrintSystemInfo(), Hidden: true, RunE: func(cmd *cobra.Command, args []string) error { return runner.Execute(os.Args[1:]) diff --git a/llama/llama.go b/llama/llama.go index 0c4fca43..bb5028bd 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -21,18 +21,6 @@ package llama extern bool llamaProgressCallback(float progress, void *user_data); extern void llamaLog(int level, char* text, void* user_data); - -typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER; -COMPILER inline get_compiler() { -#if defined(__clang__) - return COMP_CLANG; -#elif defined(__GNUC__) - return COMP_GCC; -#else - return UNKNOWN_COMPILER; -#endif -} - */ import "C" @@ -72,19 +60,6 @@ func BackendInit() { C.llama_backend_init() } -func PrintSystemInfo() string { - var compiler string - switch C.get_compiler() { - case C.COMP_UNKNOWN: - compiler = "cgo(unknown_compiler)" - case C.COMP_GCC: - compiler = "cgo(gcc)" - case C.COMP_CLANG: - compiler = "cgo(clang)" - } - return C.GoString(C.llama_print_system_info()) + compiler -} - func GetModelArch(modelPath string) (string, error) { mp := C.CString(modelPath) defer C.free(unsafe.Pointer(mp)) diff --git a/ml/backend.go b/ml/backend.go index 83b7a8c9..3ef8a1ac 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -24,7 +24,6 @@ type Backend interface { Config() Config Get(name string) Tensor NewContext() Context - SystemInfo() string } // BackendCacheConfig should be implemented by backends that need special output diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index f4948fca..2d8ddf99 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -1,27 +1,11 @@ package ggml -/* -#cgo CPPFLAGS: -I${SRCDIR}/ggml/include -#include -#include -#include "ggml.h" -#include "ggml-cpu.h" -#include "ggml-backend.h" -static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);} -static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];} - -typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER; -COMPILER inline get_compiler() { -#if defined(__clang__) - return COMP_CLANG; -#elif defined(__GNUC__) - return COMP_GCC; -#else - return 
UNKNOWN_COMPILER; -#endif -} - -*/ +// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include +// #include +// #include +// #include "ggml.h" +// #include "ggml-cpu.h" +// #include "ggml-backend.h" import "C" import ( @@ -729,34 +713,3 @@ func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.T return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) } } - -func (b *Backend) SystemInfo() string { - var compiler string - switch C.get_compiler() { - case C.COMP_UNKNOWN: - compiler = "cgo(unknown_compiler)" - case C.COMP_GCC: - compiler = "cgo(gcc)" - case C.COMP_CLANG: - compiler = "cgo(clang)" - } - - var s string - for i := range C.ggml_backend_reg_count() { - reg := C.ggml_backend_reg_get(i) - fName := C.CString("ggml_backend_get_features") - defer C.free(unsafe.Pointer(fName)) - get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName) - if get_features_fn != nil { - s += C.GoString(C.ggml_backend_reg_name(reg)) - s += " : " - for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) { - s += C.GoString(features.name) - s += " = " - s += C.GoString(features.value) - s += " | " - } - } - } - return s + compiler -} diff --git a/ml/backend/ggml/ggml/src/ggml.go b/ml/backend/ggml/ggml/src/ggml.go index 85c693eb..afc1e1ed 100644 --- a/ml/backend/ggml/ggml/src/ggml.go +++ b/ml/backend/ggml/ggml/src/ggml.go @@ -7,6 +7,20 @@ package ggml // #include // #include "ggml-backend.h" // extern void sink(int level, char *text, void *user_data); +// static struct ggml_backend_feature * first_feature(ggml_backend_get_features_t fp, ggml_backend_reg_t reg) { return fp(reg); } +// static struct ggml_backend_feature * next_feature(struct ggml_backend_feature * feature) { return &feature[1]; } +/* +typedef enum { COMPILER_CLANG, COMPILER_GNUC, COMPILER_UNKNOWN } COMPILER; +static COMPILER compiler_name(void) { +#if defined(__clang__) + return COMPILER_CLANG; +#elif defined(__GNUC__) + return COMPILER_GNUC; +#else + return COMPILER_UNKNOWN; +#endif +} +*/ import "C" import ( @@ -16,6 +30,7 @@ import ( "os" "path/filepath" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -90,4 +105,43 @@ var OnceLoad = sync.OnceFunc(func() { visited[abspath] = struct{}{} } } + + slog.Info("system", "", system{}) }) + +type system struct{} + +func (system) LogValue() slog.Value { + var attrs []slog.Attr + names := make(map[string]int) + for i := range C.ggml_backend_dev_count() { + r := C.ggml_backend_dev_backend_reg(C.ggml_backend_dev_get(i)) + + func() { + fName := C.CString("ggml_backend_get_features") + defer C.free(unsafe.Pointer(fName)) + + if fn := C.ggml_backend_reg_get_proc_address(r, fName); fn != nil { + var features []any + for f := C.first_feature(C.ggml_backend_get_features_t(fn), r); f.name != nil; f = C.next_feature(f) { + features = append(features, C.GoString(f.name), C.GoString(f.value)) + } + + name := C.GoString(C.ggml_backend_reg_name(r)) + attrs = append(attrs, slog.Group(name+"."+strconv.Itoa(names[name]), features...)) + names[name] += 1 + } + }() + } + + switch C.compiler_name() { + case C.COMPILER_CLANG: + attrs = append(attrs, slog.String("compiler", "cgo(clang)")) + case C.COMPILER_GNUC: + attrs = append(attrs, slog.String("compiler", "cgo(gcc)")) + default: + attrs = append(attrs, slog.String("compiler", "cgo(unknown)")) + } + + return slog.GroupValue(attrs...) 
+} diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index 82880c98..8662afc1 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -931,7 +931,6 @@ func Execute(args []string) error { slog.Info("starting go runner") llama.BackendInit() - slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads) server := &Server{ batchSize: *batchSize, diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 5705931a..1a4bbf19 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -786,8 +786,6 @@ func (s *Server) loadModel( panic(err) } - slog.Info("system", "info", s.model.Backend().SystemInfo(), "threads", params.NumThreads) - // TODO(jessegross): LoRA loading if lpath.String() != "" { panic("loras are not yet implemented")
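
The structured logging pattern that PATCH 17/17 introduces is easiest to see in isolation: the new system type implements slog.LogValuer, so the device and feature attributes are only assembled when the "system" record is actually emitted, and the empty group key in slog.Info("system", "", system{}) lets slog inline the grouped attributes. The standalone sketch below uses hard-coded attribute values purely as stand-ins for the cgo-based ggml device and feature enumeration in ggml.go.

    package main

    import "log/slog"

    // system mimics the LogValuer type added in ml/backend/ggml/ggml/src/ggml.go.
    type system struct{}

    func (system) LogValue() slog.Value {
        // Hard-coded stand-ins; the real code walks ggml devices and backend
        // features via cgo and appends one slog.Group per device.
        return slog.GroupValue(
            slog.Group("CPU.0", "LLAMAFILE", "1", "AVX2", "1"),
            slog.String("compiler", "cgo(gcc)"),
        )
    }

    func main() {
        // Mirrors the call added to OnceLoad in the patch.
        slog.Info("system", "", system{})
    }

With a text-style handler this renders the attributes as something like CPU.0.LLAMAFILE=1 CPU.0.AVX2=1 compiler=cgo(gcc), the structured replacement for the string previously produced by llama.PrintSystemInfo().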