diff --git a/README.md b/README.md index e773236b..eb11483c 100644 --- a/README.md +++ b/README.md @@ -483,6 +483,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples - [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/)) - [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama. +- [VT Code](https://github.com/vinhnx/vtcode) - VT Code is a Rust-based terminal coding agent with semantic code intelligence via Tree-sitter. Ollama integration for running local/cloud models with configurable endpoints. ### Apple Vision Pro diff --git a/convert/convert.go b/convert/convert.go index bed59a57..3e98eee1 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -198,6 +198,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error { conv = &qwen2Model{} case "Qwen2_5_VLForConditionalGeneration": conv = &qwen25VLModel{} + case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration": + conv = &qwen3VLModel{} case "BertModel": conv = &bertModel{} case "CohereForCausalLM": diff --git a/convert/convert_qwen3.go b/convert/convert_qwen3.go new file mode 100644 index 00000000..f54418a9 --- /dev/null +++ b/convert/convert_qwen3.go @@ -0,0 +1,157 @@ +package convert + +import ( + "slices" + "strings" + + "github.com/ollama/ollama/fs/ggml" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" +) + +type qwen3Model struct { + ModelParameters + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + HiddenSize uint32 `json:"hidden_size"` + HiddenLayers uint32 `json:"num_hidden_layers"` + IntermediateSize uint32 `json:"intermediate_size"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + HeadDim uint32 `json:"head_dim"` + NumExperts uint32 `json:"num_experts"` + NumExpertsPerToken uint32 `json:"num_experts_per_tok"` + NormTopkProb bool `json:"norm_topk_prob"` + RopeTheta float32 `json:"rope_theta"` + RopeScaling struct { + Type string `json:"type"` + Factor ropeFactor `json:"factor"` + OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"` + MropeSection []int32 `json:"mrope_section"` + } `json:"rope_scaling"` + RMSNormEPS float32 `json:"rms_norm_eps"` +} + +// KV implements ModelConverter. 
+func (q *qwen3Model) KV(t *Tokenizer) ggml.KV { + arch := "qwen3" + if q.NumExperts > 0 { + arch += "moe" + } + + kv := q.ModelParameters.KV(t) + kv["general.architecture"] = arch + kv["block_count"] = q.HiddenLayers + kv["context_length"] = q.MaxPositionEmbeddings + kv["embedding_length"] = q.HiddenSize + kv["feed_forward_length"] = q.IntermediateSize + kv["attention.head_count"] = q.NumAttentionHeads + kv["attention.head_count_kv"] = q.NumKeyValueHeads + kv["attention.key_length"] = q.HeadDim + kv["attention.value_length"] = q.HeadDim + + if q.NumExperts > 0 { + kv["expert_count"] = q.NumExperts + kv["expert_used_count"] = q.NumExpertsPerToken + kv["norm_top_k_prob"] = q.NormTopkProb + } + + kv["rope.freq_base"] = q.RopeTheta + kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS + + switch q.RopeScaling.Type { + case "": + // no scaling + case "yarn": + kv["rope.scaling.type"] = q.RopeScaling.Type + kv["rope.scaling.factor"] = q.RopeScaling.Factor + case "mrope", "default": + kv["rope.mrope_section"] = q.RopeScaling.MropeSection + default: + panic("unknown rope scaling type") + } + return kv +} + +// Tensors implements ModelConverter. +func (q *qwen3Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor + + // TODO: handle split experts + + for _, t := range ts { + switch { + case strings.Contains(t.Name(), "ffn_gate_up_exps"): + afterFunc := func(t tensor.Tensor) (tensor.Tensor, error) { return tensor.Transpose(t, 0, 2, 1) } + for t := range splitDim(t, 2, + split{Replacer: strings.NewReplacer("gate_up", "gate"), afterFunc: afterFunc}, + split{Replacer: strings.NewReplacer("gate_up", "up"), afterFunc: afterFunc}, + ) { + t.Shape[1], t.Shape[2] = t.Shape[2], t.Shape[1] + out = append(out, t) + } + case strings.Contains(t.Name(), "ffn_down_exps"): + shape := slices.Clone(t.Shape()) + shape[1], shape[2] = shape[2], shape[1] + t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) { + dims := make([]int, len(shape)) + for i := range shape { + dims[i] = int(shape[i]) + } + + var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + tt, err := tensor.Transpose(tt, 0, 2, 1) + if err != nil { + return nil, err + } + + // flatten tensor so it can be written as a vector + if err := tt.Reshape(tt.Shape().TotalSize()); err != nil { + return nil, err + } + + return native.VectorF32(tt.(*tensor.Dense)) + }) + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: shape, + WriterTo: t, + }) + default: + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + } + + return out +} + +// Replacements implements ModelConverter. 
+func (q *qwen3Model) Replacements() []string { + return []string{ + "lm_head", "output", + "model.embed_tokens", "token_embd", + "model.layers", "blk", + "input_layernorm", "attn_norm", + "self_attn.k_proj", "attn_k", + "self_attn.k_norm", "attn_k_norm", + "self_attn.v_proj", "attn_v", + "self_attn.q_proj", "attn_q", + "self_attn.q_norm", "attn_q_norm", + "self_attn.o_proj", "attn_output", + "mlp.down_proj", "ffn_down", + "mlp.gate_proj", "ffn_gate", + "mlp.up_proj", "ffn_up", + "mlp.gate.weight", "ffn_gate_inp.weight", + "mlp.experts.down_proj", "ffn_down_exps.weight", + "mlp.experts.gate_up_proj", "ffn_gate_up_exps.weight", + "post_attention_layernorm", "ffn_norm", + "model.norm", "output_norm", + } +} + +var _ ModelConverter = (*qwen3Model)(nil) diff --git a/convert/convert_qwen3vl.go b/convert/convert_qwen3vl.go new file mode 100644 index 00000000..e0ccb805 --- /dev/null +++ b/convert/convert_qwen3vl.go @@ -0,0 +1,116 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "slices" + "strings" + + "github.com/ollama/ollama/fs/ggml" +) + +type qwen3VLModel struct { + qwen3Model `json:"text_config"` + + VisionModel struct { + Depth uint32 `json:"depth"` + HiddenSize uint32 `json:"hidden_size"` + NumHeads uint32 `json:"num_heads"` + InChannels uint32 `json:"in_channels"` + PatchSize uint32 `json:"patch_size"` + SpatialMergeSize uint32 `json:"spatial_merge_size"` + WindowSize uint32 `json:"window_size"` + RMSNormEps float32 `json:"layer_norm_epsilon"` + RopeTheta float32 `json:"rope_theta"` + TemporalPatchSize uint32 `json:"temporal_patch_size"` + DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"` + + Size struct { + ShortestEdge uint32 `json:"shortest_edge"` + LongestEdge uint32 `json:"longest_edge"` + } `json:"size"` + + ImageMean []float32 `json:"image_mean"` + ImageStd []float32 `json:"image_std"` + } `json:"vision_config"` +} + +func (m *qwen3VLModel) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "preprocessor_config.json") + if err != nil { + return err + } + + return json.Unmarshal(bts, &m.VisionModel) +} + +func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV { + kv := m.qwen3Model.KV(t) + + arch := "qwen3vl" + if m.NumExperts > 0 { + arch += "moe" + } + // override architecture + kv["general.architecture"] = arch + + kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32) + kv["vision.embedding_length"] = m.VisionModel.HiddenSize + kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16) + kv["vision.num_channels"] = m.VisionModel.InChannels + kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14) + kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2) + kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6) + kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4) + kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2) + kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes + + kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge + kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge + + kv["vision.image_mean"] = m.VisionModel.ImageMean + kv["vision.image_std"] = m.VisionModel.ImageStd + + return kv +} + +func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor { + var rest []Tensor + var out []*ggml.Tensor + for _, t := range ts { + switch { + case strings.Contains(t.Name(), "attn_qkv"): + out = append(out, slices.Collect(splitDim(t, 0, + split{Replacer: strings.NewReplacer("attn_qkv", 
"attn_q")}, + split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")}, + split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")}, + ))...) + case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"): + shape := t.Shape() + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...), + WriterTo: t, + }) + default: + rest = append(rest, t) + } + } + + return append(m.qwen3Model.Tensors(rest), out...) +} + +func (m *qwen3VLModel) Replacements() []string { + return append( + m.qwen3Model.Replacements(), + "model.language_", "", + "model.visual", "v", + "patch_embed.proj", "patch_embed", + "blocks", "blk", + "attn.qkv", "attn_qkv", + "attn.proj", "attn_out", + "deepstack_merger_list", "deepstack_merger", + ) +} diff --git a/convert/tensor.go b/convert/tensor.go index 9b8517f1..27bdd13f 100644 --- a/convert/tensor.go +++ b/convert/tensor.go @@ -19,8 +19,8 @@ type split struct { dim int slices []tensor.Slice - // fn is an optional function to apply to the tensor after slicing - fn func(tensor.Tensor) (tensor.Tensor, error) + // afterFunc is an optional function to apply to the tensor after slicing + afterFunc func(tensor.Tensor) (tensor.Tensor, error) } // splitDim splits a tensor along a specified dimension into multiple tensors. The dimension @@ -54,8 +54,8 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] { tt = tensor.Materialize(tt) - if split.fn != nil { - tt, err = split.fn(tt) + if split.afterFunc != nil { + tt, err = split.afterFunc(tt) if err != nil { return nil, err } diff --git a/convert/tensor_test.go b/convert/tensor_test.go index 3a34bbff..c1f58da6 100644 --- a/convert/tensor_test.go +++ b/convert/tensor_test.go @@ -432,7 +432,7 @@ func TestSplitDim(t *testing.T) { t.Run("split with transpose", func(t *testing.T) { next, stop := iter.Pull(splitDim(&r, 1, split{Replacer: strings.NewReplacer("a", "x")}, - split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) { + split{Replacer: strings.NewReplacer("b", "y"), afterFunc: func(tt tensor.Tensor) (tensor.Tensor, error) { return tensor.Transpose(tt, 1, 0) }}, )) diff --git a/discover/cpu_linux_test.go b/discover/cpu_linux_test.go index 3a514478..7ff34df0 100644 --- a/discover/cpu_linux_test.go +++ b/discover/cpu_linux_test.go @@ -2065,12 +2065,6 @@ power management: cpus := linuxCPUDetails(buf) slog.Info("example", "scenario", k, "cpus", cpus) - si := SystemInfo{ - System: CPUInfo{ - CPUs: cpus, - }, - } - threadCount := si.GetOptimalThreadCount() if len(v.expCPUs) != len(cpus) { t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus) } @@ -2085,10 +2079,6 @@ power management: t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c) } } - - if threadCount != v.expThreadCount { - t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount) - } }) } } diff --git a/discover/gpu.go b/discover/gpu.go index 2f394fdf..927aed2a 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -1,16 +1,13 @@ package discover import ( - "context" "log/slog" "os" - "path/filepath" "regexp" "runtime" "strconv" "strings" - "github.com/ollama/ollama/format" "github.com/ollama/ollama/ml" ) @@ -18,159 +15,28 @@ import ( // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. 
var CudaTegra string = os.Getenv("JETSON_JETPACK") -func GetCPUInfo() GpuInfo { - mem, err := GetCPUMem() +// GetSystemInfo returns the last cached state of the GPUs on the system +func GetSystemInfo() ml.SystemInfo { + memInfo, err := GetCPUMem() if err != nil { slog.Warn("error looking up system memory", "error", err) } - - return GpuInfo{ - memInfo: mem, - DeviceID: ml.DeviceID{ - Library: "cpu", - ID: "0", - }, - } -} - -func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList { - devs := GPUDevices(ctx, runners) - return devInfoToInfoList(devs) -} - -func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList { - resp := []GpuInfo{} - // Our current packaging model places ggml-hip in the main directory - // but keeps rocm in an isolated directory. We have to add it to - // the [LD_LIBRARY_]PATH so ggml-hip will load properly - rocmDir := filepath.Join(LibOllamaPath, "rocm") - if _, err := os.Stat(rocmDir); err != nil { - rocmDir = "" + var threadCount int + cpus := GetCPUDetails() + for _, c := range cpus { + threadCount += c.CoreCount - c.EfficiencyCoreCount } - for _, dev := range devs { - info := GpuInfo{ - DeviceID: dev.DeviceID, - filterID: dev.FilteredID, - Name: dev.Description, - memInfo: memInfo{ - TotalMemory: dev.TotalMemory, - FreeMemory: dev.FreeMemory, - }, - // TODO can we avoid variant - DependencyPath: dev.LibraryPath, - DriverMajor: dev.DriverMajor, - DriverMinor: dev.DriverMinor, - ComputeMajor: dev.ComputeMajor, - ComputeMinor: dev.ComputeMinor, - } - if dev.Library == "CUDA" || dev.Library == "ROCm" { - info.MinimumMemory = 457 * format.MebiByte - } - if dev.Library == "ROCm" && rocmDir != "" { - info.DependencyPath = append(info.DependencyPath, rocmDir) - } - // TODO any special processing of Vulkan devices? - resp = append(resp, info) - } - if len(resp) == 0 { - mem, err := GetCPUMem() - if err != nil { - slog.Warn("error looking up system memory", "error", err) - } - - resp = append(resp, GpuInfo{ - memInfo: mem, - DeviceID: ml.DeviceID{ - Library: "cpu", - ID: "0", - }, - }) - } - return resp -} - -// Given the list of GPUs this instantiation is targeted for, -// figure out the visible devices environment variable -// -// If different libraries are detected, the first one is what we use -func (l GpuInfoList) GetVisibleDevicesEnv() []string { - if len(l) == 0 { - return nil - } - res := []string{} - envVar := rocmGetVisibleDevicesEnv(l) - if envVar != "" { - res = append(res, envVar) - } - envVar = vkGetVisibleDevicesEnv(l) - if envVar != "" { - res = append(res, envVar) - } - return res -} - -func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string { - ids := []string{} - for _, info := range gpuInfo { - if info.Library != "ROCm" { - continue - } - // If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number - if info.filterID != "" { - ids = append(ids, info.filterID) - } else { - ids = append(ids, info.ID) - } - } - if len(ids) == 0 { - return "" - } - envVar := "ROCR_VISIBLE_DEVICES=" - if runtime.GOOS != "linux" { - envVar = "HIP_VISIBLE_DEVICES=" - } - // There are 3 potential env vars to use to select GPUs. 
- // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows - // HIP_VISIBLE_DEVICES supports numeric IDs only - // GPU_DEVICE_ORDINAL supports numeric IDs only - return envVar + strings.Join(ids, ",") -} - -func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string { - ids := []string{} - for _, info := range gpuInfo { - if info.Library != "Vulkan" { - continue - } - if info.filterID != "" { - ids = append(ids, info.filterID) - } else { - ids = append(ids, info.ID) - } - } - if len(ids) == 0 { - return "" - } - envVar := "GGML_VK_VISIBLE_DEVICES=" - return envVar + strings.Join(ids, ",") -} - -// GetSystemInfo returns the last cached state of the GPUs on the system -func GetSystemInfo() SystemInfo { - deviceMu.Lock() - defer deviceMu.Unlock() - gpus := devInfoToInfoList(devices) - if len(gpus) == 1 && gpus[0].Library == "cpu" { - gpus = []GpuInfo{} + if threadCount == 0 { + // Fall back to Go's num CPU + threadCount = runtime.NumCPU() } - return SystemInfo{ - System: CPUInfo{ - CPUs: GetCPUDetails(), - GpuInfo: GetCPUInfo(), - }, - GPUs: gpus, + return ml.SystemInfo{ + ThreadCount: threadCount, + TotalMemory: memInfo.TotalMemory, + FreeMemory: memInfo.FreeMemory, + FreeSwap: memInfo.FreeSwap, } } diff --git a/discover/runner.go b/discover/runner.go index 66c3e3e6..e74050d0 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -4,13 +4,8 @@ package discover import ( "context" - "encoding/json" - "fmt" "io" "log/slog" - "math/rand" - "net" - "net/http" "os" "os/exec" "path/filepath" @@ -23,6 +18,7 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" ) @@ -36,7 +32,7 @@ var ( bootstrapped bool ) -func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo { +func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo { deviceMu.Lock() defer deviceMu.Unlock() startDiscovery := time.Now() @@ -121,7 +117,7 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev // In the second pass, we more deeply initialize the GPUs to weed out devices that // aren't supported by a given library. We run this phase in parallel to speed up discovery. 
- slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices)) + slog.Debug("evaluating which devices, if any, to filter out", "initial_count", len(devices)) ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() var wg sync.WaitGroup @@ -133,7 +129,7 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev if devices[i].Library == "Metal" { continue } - slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID) + slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID) wg.Add(1) go func(i int) { defer wg.Done() @@ -154,11 +150,17 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev slog.Error("Unknown Library:" + devices[i].Library) } - extraEnvs := []string{ - "GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs - envVar + "=" + id, // Filter to just this one GPU + extraEnvs := map[string]string{ + "GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs + envVar: id, // Filter to just this one GPU } if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { + slog.Debug("filtering device which didn't fully initialize", + "id", devices[i].ID, + "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], + "pci_id", devices[i].PCIID, + "library", devices[i].Library, + ) needsDelete[i] = true } else { supportedMu.Lock() @@ -174,7 +176,7 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev }(i) } wg.Wait() - logutil.Trace("supported GPU library combinations", "supported", supported) + logutil.Trace("supported GPU library combinations before filtering", "supported", supported) filterOutVulkanThatAreSupportedByOtherGPU(needsDelete) @@ -376,12 +378,13 @@ func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) { } if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] { needsDelete[i] = true - slog.Debug("dropping Vulkan duplicate by PCI ID", - "vulkan_id", devices[i].ID, - "vulkan_libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], + slog.Debug("filtering device with duplicate PCI ID", + "id", devices[i].ID, + "library", devices[i].Library, + "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "pci_id", devices[i].PCIID, - "kept_library", devices[j].Library, "kept_id", devices[j].ID, + "kept_library", devices[j].Library, ) break } @@ -426,6 +429,12 @@ func filterOverlapByLibrary(supported map[string]map[string]map[string]int, need } for dev, i := range byLibDirs[libDir] { if _, found := byLibDirs[newest][dev]; found { + slog.Debug("filtering device with overlapping libraries", + "id", dev, + "library", libDir, + "delete_index", i, + "kept_library", newest, + ) needsDelete[i] = true } } @@ -449,100 +458,35 @@ func (r *bootstrapRunner) HasExited() bool { return false } -func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo { - // TODO DRY out with llm/server.go - slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs) +func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo { + var out io.Writer + if envconfig.LogLevel()
== logutil.LevelTrace { + out = os.Stderr + } start := time.Now() defer func() { slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs) }() - port := 0 - if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { - var l *net.TCPListener - if l, err = net.ListenTCP("tcp", a); err == nil { - port = l.Addr().(*net.TCPAddr).Port - l.Close() - } - } - if port == 0 { - slog.Debug("ResolveTCPAddr failed, using random port") - port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range - } - params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)} - var pathEnv string - switch runtime.GOOS { - case "windows": - pathEnv = "PATH" - case "darwin": - pathEnv = "DYLD_LIBRARY_PATH" - default: - pathEnv = "LD_LIBRARY_PATH" - } - libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...) - if rocmDir != "" { - libraryPaths = append(libraryPaths, rocmDir) - } - // Note: we always put our dependency paths first - // since these are the exact version we compiled/linked against - if libraryPath, ok := os.LookupEnv(pathEnv); ok { - libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) - } - cmd := exec.Command(exe, params...) - cmd.Env = os.Environ() - if envconfig.LogLevel() == logutil.LevelTrace { - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - } - - // cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored - pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) - pathNeeded := true - ollamaPathNeeded := true - extraDone := make([]bool, len(extraEnvs)) - for i := range cmd.Env { - cmp := strings.SplitN(cmd.Env[i], "=", 2) - if strings.EqualFold(cmp[0], pathEnv) { - cmd.Env[i] = pathEnv + "=" + pathEnvVal - pathNeeded = false - } else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") { - cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator)) - ollamaPathNeeded = false - } else { - for j := range extraEnvs { - if extraDone[j] { - continue - } - extra := strings.SplitN(extraEnvs[j], "=", 2) - if cmp[0] == extra[0] { - cmd.Env[i] = extraEnvs[j] - extraDone[j] = true - } - } - } - } - if pathNeeded { - cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal) - } - if ollamaPathNeeded { - cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator))) - } - for i := range extraDone { - if !extraDone[i] { - cmd.Env = append(cmd.Env, extraEnvs[i]) - } - } - logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd) - if err := cmd.Start(); err != nil { - slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err) + logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs) + cmd, port, err := llm.StartRunner( + true, // ollama engine + "", // no model + ollamaLibDirs, + out, + extraEnvs, + ) + if err != nil { + slog.Debug("failed to start runner to discovery GPUs", "error", err) return nil } + go func() { cmd.Wait() // exit status ignored }() defer cmd.Process.Kill() - devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd}) + devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd}) if err != nil { if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 { // Expected during bootstrapping while we filter out unsupported AMD GPUs @@ -555,52 +499,3 @@ func bootstrapDevices(ctx 
context.Context, ollamaLibDirs []string, extraEnvs []s return devices } - -func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) { - var moreDevices []ml.DeviceInfo - port := runner.GetPort() - tick := time.Tick(10 * time.Millisecond) - for { - select { - case <-ctx.Done(): - return nil, fmt.Errorf("failed to finish discovery before timeout") - case <-tick: - r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil) - if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) - } - r.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(r) - if err != nil { - // slog.Warn("failed to send request", "error", err) - if runner.HasExited() { - return nil, fmt.Errorf("runner crashed") - } - continue - } - defer resp.Body.Close() - - if resp.StatusCode == http.StatusNotFound { - // old runner, fall back to bootstrapping model - return nil, fmt.Errorf("llamarunner free vram reporting not supported") - } - - body, err := io.ReadAll(resp.Body) - if err != nil { - slog.Warn("failed to read response", "error", err) - continue - } - if resp.StatusCode != 200 { - logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body) - return nil, fmt.Errorf("runner error: %s", string(body)) - } - - if err := json.Unmarshal(body, &moreDevices); err != nil { - slog.Warn("unmarshal encode response", "error", err) - continue - } - return moreDevices, nil - } - } -} diff --git a/discover/types.go b/discover/types.go index adb2f43a..b1f622f4 100644 --- a/discover/types.go +++ b/discover/types.go @@ -1,10 +1,9 @@ package discover import ( - "context" "log/slog" "path/filepath" - "runtime" + "sort" "strings" "github.com/ollama/ollama/format" @@ -17,50 +16,6 @@ type memInfo struct { FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only } -// Beginning of an `ollama info` command -type GpuInfo struct { // TODO better name maybe "InferenceProcessor"? - ml.DeviceID - memInfo - - // Optional variant to select (e.g. versions, cpu feature flags) - Variant string `json:"variant"` - - // MinimumMemory represents the minimum memory required to use the GPU - MinimumMemory uint64 `json:"-"` - - // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly - DependencyPath []string `json:"lib_path,omitempty"` - - // Set to true if we can NOT reliably discover FreeMemory. 
A value of true indicates - // the FreeMemory is best effort, and may over or under report actual memory usage - // False indicates FreeMemory can generally be trusted on this GPU - UnreliableFreeMemory bool - - // GPU information - filterID string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices - Name string `json:"name"` // user friendly name if available - ComputeMajor int `json:"compute_major"` // Compute Capability or gfx - ComputeMinor int `json:"compute_minor"` - - // Driver Information - TODO no need to put this on each GPU - DriverMajor int `json:"driver_major,omitempty"` - DriverMinor int `json:"driver_minor,omitempty"` - - // TODO other performance capability info to help in scheduling decisions -} - -func (gpu GpuInfo) RunnerName() string { - if gpu.Variant != "" { - return gpu.Library + "_" + gpu.Variant - } - return gpu.Library -} - -type CPUInfo struct { - GpuInfo - CPUs []CPU -} - // CPU type represents a CPU Package occupying a socket type CPU struct { ID string `cpuinfo:"processor"` @@ -71,33 +26,8 @@ type CPU struct { ThreadCount int } -type GpuInfoList []GpuInfo - -func (l GpuInfoList) ByLibrary() []GpuInfoList { - resp := []GpuInfoList{} - libs := []string{} - for _, info := range l { - found := false - requested := info.Library - if info.Variant != "" { - requested += "_" + info.Variant - } - for i, lib := range libs { - if lib == requested { - resp[i] = append(resp[i], info) - found = true - break - } - } - if !found { - libs = append(libs, requested) - resp = append(resp, []GpuInfo{info}) - } - } - return resp -} - func LogDetails(devices []ml.DeviceInfo) { + sort.Sort(sort.Reverse(ml.ByFreeMemory(devices))) // Report devices in order of scheduling preference for _, dev := range devices { var libs []string for _, dir := range dev.LibraryPath { @@ -111,6 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) { } slog.Info("inference compute", "id", dev.ID, + "filtered_id", dev.FilteredID, "library", dev.Library, "compute", dev.Compute(), "name", dev.Name, @@ -141,74 +72,3 @@ func LogDetails(devices []ml.DeviceInfo) { ) } } - -// Sort by Free Space -type ByFreeMemory []GpuInfo - -func (a ByFreeMemory) Len() int { return len(a) } -func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory } - -type SystemInfo struct { - System CPUInfo `json:"system"` - GPUs []GpuInfo `json:"gpus"` -} - -// Return the optimal number of threads to use for inference -func (si SystemInfo) GetOptimalThreadCount() int { - if len(si.System.CPUs) == 0 { - // Fall back to Go's num CPU - return runtime.NumCPU() - } - - coreCount := 0 - for _, c := range si.System.CPUs { - coreCount += c.CoreCount - c.EfficiencyCoreCount - } - - return coreCount -} - -// For each GPU, check if it does NOT support flash attention -func (l GpuInfoList) FlashAttentionSupported() bool { - for _, gpu := range l { - supportsFA := gpu.Library == "cpu" || - gpu.Name == "Metal" || gpu.Library == "Metal" || - (gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier - gpu.Library == "ROCm" || - gpu.Library == "Vulkan" - - if !supportsFA { - return false - } - } - return true -} - -type BaseRunner interface { - // GetPort returns the localhost port number the runner is running on - GetPort() int - - // HasExited indicates if the runner is no longer running. 
This can be used during - // bootstrap to detect if a given filtered device is incompatible and triggered an assert - HasExited() bool -} - -type RunnerDiscovery interface { - BaseRunner - - // GetDeviceInfos will perform a query of the underlying device libraries - // for device identification and free VRAM information - // During bootstrap scenarios, this routine may take seconds to complete - GetDeviceInfos(ctx context.Context) []ml.DeviceInfo -} - -type FilteredRunnerDiscovery interface { - RunnerDiscovery - - // GetActiveDeviceIDs returns the filtered set of devices actively in - // use by this runner for running models. If the runner is a bootstrap runner, no devices - // will be active yet so no device IDs are returned. - // This routine will not query the underlying device and will return immediately - GetActiveDeviceIDs() []ml.DeviceID -} diff --git a/docs/README.md b/docs/README.md index 310a4399..b7ba9b96 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,22 +1,22 @@ # Documentation ### Getting Started -* [Quickstart](../README.md#quickstart) +* [Quickstart](https://docs.ollama.com/quickstart) * [Examples](./examples.md) -* [Importing models](./import.md) -* [MacOS Documentation](./macos.md) -* [Linux Documentation](./linux.md) -* [Windows Documentation](./windows.md) -* [Docker Documentation](./docker.md) +* [Importing models](https://docs.ollama.com/import) +* [MacOS Documentation](https://docs.ollama.com/macos) +* [Linux Documentation](https://docs.ollama.com/linux) +* [Windows Documentation](https://docs.ollama.com/windows) +* [Docker Documentation](https://docs.ollama.com/docker) ### Reference -* [API Reference](./api.md) +* [API Reference](https://docs.ollama.com/api) * [Modelfile Reference](./modelfile.md) -* [OpenAI Compatibility](./openai.md) +* [OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility) ### Resources -* [Troubleshooting Guide](./troubleshooting.md) -* [FAQ](./faq.md) +* [Troubleshooting Guide](https://docs.ollama.com/troubleshooting) +* [FAQ](https://docs.ollama.com/faq#faq) * [Development guide](./development.md) diff --git a/docs/api.md b/docs/api.md index f47af63c..99ceaa11 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1,5 +1,7 @@ # API +> Note: Ollama's API docs are moving to https://docs.ollama.com/api + ## Endpoints - [Generate a completion](#generate-a-completion) @@ -104,7 +106,7 @@ The final response in the stream also includes additional data about the generat - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory - `response`: empty if the response was streamed, if not streamed, this will contain the full response -To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`. +To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` \* `10^9`. 
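For example, with `eval_count` of 18 and `eval_duration` of 52479709 (the sample figures that appear in the usage metrics example below), this works out to 18 / 52479709 \* 10^9 ≈ 343 tokens per second.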
```json { @@ -617,25 +619,26 @@ curl http://localhost:11434/api/chat -d '{ ##### Response A stream of JSON objects is returned: + ```json { - "model": "llama3.2", - "created_at": "2025-07-07T20:22:19.184789Z", - "message": { - "role": "assistant", - "content": "", - "tool_calls": [ - { - "function": { - "name": "get_weather", - "arguments": { - "city": "Tokyo" - } - }, - } - ] - }, - "done": false + "model": "llama3.2", + "created_at": "2025-07-07T20:22:19.184789Z", + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": { + "city": "Tokyo" + } + } + } + ] + }, + "done": false } ``` @@ -643,8 +646,8 @@ Final response: ```json { - "model":"llama3.2", - "created_at":"2025-07-07T20:22:19.19314Z", + "model": "llama3.2", + "created_at": "2025-07-07T20:22:19.19314Z", "message": { "role": "assistant", "content": "" @@ -701,7 +704,6 @@ curl http://localhost:11434/api/chat -d '{ ##### Request - ```shell curl http://localhost:11434/api/chat -d '{ "model": "llama3.2", @@ -730,7 +732,7 @@ curl http://localhost:11434/api/chat -d '{ } } ], - "stream": false + "stream": false }' ``` @@ -750,7 +752,7 @@ curl http://localhost:11434/api/chat -d '{ "arguments": { "city": "Tokyo" } - }, + } } ] }, @@ -801,7 +803,10 @@ curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" { "model": "llama3.1", "created_at": "2024-12-06T00:46:58.265747Z", - "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" }, + "message": { + "role": "assistant", + "content": "{\"age\": 22, \"available\": false}" + }, "done_reason": "stop", "done": true, "total_duration": 2254970291, @@ -871,7 +876,6 @@ Final response: } ``` - #### Chat request (With history, with tools) ##### Request @@ -948,10 +952,8 @@ curl http://localhost:11434/api/chat -d '{ "eval_count": 11, "eval_duration": 90282125 } - ``` - #### Chat request (with images) ##### Request @@ -1123,7 +1125,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { "model": "llama3.2", - "created_at":"2024-09-12T21:17:29.110811Z", + "created_at": "2024-09-12T21:17:29.110811Z", "message": { "role": "assistant", "content": "" @@ -1154,7 +1156,7 @@ A single JSON object is returned: ```json { "model": "llama3.2", - "created_at":"2024-09-12T21:33:17.547535Z", + "created_at": "2024-09-12T21:33:17.547535Z", "message": { "role": "assistant", "content": "" @@ -1171,9 +1173,10 @@ POST /api/create ``` Create a model from: - * another model; - * a safetensors directory; or - * a GGUF file. + +- another model; +- a safetensors directory; or +- a GGUF file. If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field. @@ -1193,11 +1196,11 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo #### Quantization types -| Type | Recommended | -| --- | :-: | -| q4_K_M | * | -| q4_K_S | | -| q8_0 | * | +| Type | Recommended | +| ------ | :---------: | +| q4_K_M | \* | +| q4_K_S | | +| q8_0 | \* | ### Examples @@ -1268,7 +1271,6 @@ A stream of JSON objects is returned: Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API. 
- ##### Request ```shell @@ -1291,7 +1293,6 @@ A stream of JSON objects is returned: {"status":"success"} ``` - #### Create a model from a Safetensors directory The `files` parameter should include a dictionary of files for the safetensors model which includes the file names and SHA256 digest of each file. Use [/api/blobs/:digest](#push-a-blob) to first push each of the files to the server before calling this API. Files will remain in the cache until the Ollama server is restarted. @@ -1406,9 +1407,7 @@ A single JSON object will be returned. "parent_model": "", "format": "gguf", "family": "qwen2", - "families": [ - "qwen2" - ], + "families": ["qwen2"], "parameter_size": "7.6B", "quantization_level": "Q4_K_M" } @@ -1423,9 +1422,7 @@ A single JSON object will be returned. "parent_model": "", "format": "gguf", "family": "llama", - "families": [ - "llama" - ], + "families": ["llama"], "parameter_size": "3.2B", "quantization_level": "Q4_K_M" } @@ -1461,20 +1458,18 @@ curl http://localhost:11434/api/show -d '{ ```json5 { - "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"", - "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"", - "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>", - "details": { - "parent_model": "", - "format": "gguf", - "family": "llama", - "families": [ - "llama" - ], - "parameter_size": "8.0B", - "quantization_level": "Q4_0" + modelfile: '# Modelfile generated by "ollama show"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE """{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: """\nPARAMETER num_ctx 4096\nPARAMETER stop "\u003c/s\u003e"\nPARAMETER stop "USER:"\nPARAMETER stop "ASSISTANT:"', + parameters: 'num_keep 24\nstop "<|start_header_id|>"\nstop "<|end_header_id|>"\nstop "<|eot_id|>"', + template: "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>", + details: { + parent_model: "", + format: "gguf", + family: "llama", + families: ["llama"], + parameter_size: "8.0B", + quantization_level: "Q4_0", }, - "model_info": { + model_info: { "general.architecture": "llama", "general.file_type": 2, "general.parameter_count": 8030261248, @@ -1491,16 +1486,13 @@ curl http://localhost:11434/api/show -d '{ "llama.vocab_size": 128256, "tokenizer.ggml.bos_token_id": 128000, "tokenizer.ggml.eos_token_id": 128009, - "tokenizer.ggml.merges": [], // populates if `verbose=true` + "tokenizer.ggml.merges": [], // populates if `verbose=true` "tokenizer.ggml.model": "gpt2", "tokenizer.ggml.pre": "llama-bpe", - "tokenizer.ggml.token_type": [], // populates if 
`verbose=true` - "tokenizer.ggml.tokens": [] // populates if `verbose=true` + "tokenizer.ggml.token_type": [], // populates if `verbose=true` + "tokenizer.ggml.tokens": [], // populates if `verbose=true` }, - "capabilities": [ - "completion", - "vision" - ], + capabilities: ["completion", "vision"], } ``` @@ -1726,10 +1718,12 @@ curl http://localhost:11434/api/embed -d '{ ```json { "model": "all-minilm", - "embeddings": [[ - 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, - 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 - ]], + "embeddings": [ + [ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ] + ], "total_duration": 14143917, "load_duration": 1019500, "prompt_eval_count": 8 @@ -1750,17 +1744,21 @@ curl http://localhost:11434/api/embed -d '{ ```json { "model": "all-minilm", - "embeddings": [[ - 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, - 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 - ],[ - -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, - 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 - ]] + "embeddings": [ + [ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ], + [ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ] + ] } ``` ## List Running Models + ``` GET /api/ps ``` @@ -1791,9 +1789,7 @@ A single JSON object will be returned. "parent_model": "", "format": "gguf", "family": "llama", - "families": [ - "llama" - ], + "families": ["llama"], "parameter_size": "7.2B", "quantization_level": "Q4_0" }, @@ -1840,8 +1836,10 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, + -0.2916173040866852, -0.8924556970596313, 0.8785552978515625, + -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, + -0.137906014919281 ] } ``` @@ -1869,5 +1867,3 @@ curl http://localhost:11434/api/version "version": "0.5.1" } ``` - - diff --git a/docs/api/authentication.mdx b/docs/api/authentication.mdx new file mode 100644 index 00000000..03d802fb --- /dev/null +++ b/docs/api/authentication.mdx @@ -0,0 +1,63 @@ +--- +title: Authentication +--- + +No authentication is required when accessing Ollama's API locally via `http://localhost:11434`. 
+ +Authentication is required for the following: + +* Running cloud models via ollama.com +* Publishing models +* Downloading private models + +Ollama supports two authentication methods: + +* **Signing in**: sign in from your local installation, and Ollama will automatically take care of authenticating requests to ollama.com when running commands +* **API keys**: API keys for programmatic access to ollama.com's API + +## Signing in + +To sign in to ollama.com from your local installation of Ollama, run: + +``` +ollama signin +``` + +Once signed in, Ollama will automatically authenticate commands as required: + +``` +ollama run gpt-oss:120b-cloud +``` + +Similarly, when accessing a local API endpoint that requires cloud access, Ollama will automatically authenticate the request: + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "gpt-oss:120b-cloud", + "prompt": "Why is the sky blue?" +}' +``` + +## API keys + +For direct access to ollama.com's API served at `https://ollama.com/api`, authentication via API keys is required. + +First, create an [API key](https://ollama.com/settings/keys), then set the `OLLAMA_API_KEY` environment variable: + +```shell +export OLLAMA_API_KEY=your_api_key +``` + +Then use the API key in the Authorization header: + +```shell +curl https://ollama.com/api/generate \ + -H "Authorization: Bearer $OLLAMA_API_KEY" \ + -d '{ + "model": "gpt-oss:120b", + "prompt": "Why is the sky blue?", + "stream": false + }' +``` + +API keys don't currently expire, however you can revoke them at any time in your [API keys settings](https://ollama.com/settings/keys). diff --git a/docs/api/errors.mdx b/docs/api/errors.mdx new file mode 100644 index 00000000..15a8809e --- /dev/null +++ b/docs/api/errors.mdx @@ -0,0 +1,36 @@ +--- +title: Errors +--- + +## Status codes + +Endpoints return appropriate HTTP status codes based on the success or failure of the request in the HTTP status line (e.g. `HTTP/1.1 200 OK` or `HTTP/1.1 400 Bad Request`). Common status codes are: + +- `200`: Success +- `400`: Bad Request (missing parameters, invalid JSON, etc.) +- `404`: Not Found (model doesn't exist, etc.) +- `429`: Too Many Requests (e.g. when a rate limit is exceeded) +- `500`: Internal Server Error +- `502`: Bad Gateway (e.g. when a cloud model cannot be reached) + +## Error messages + +Errors are returned in the `application/json` format with the following structure, with the error message in the `error` property: + +```json +{ + "error": "the model failed to generate a response" +} +``` + +## Errors that occur while streaming + +If an error occurs mid-stream, the error will be returned as an object in the `application/x-ndjson` format with an `error` property. Since the response has already started, the status code of the response will not be changed. + +```json +{"model":"gemma3","created_at":"2025-10-26T17:21:21.196249Z","response":" Yes","done":false} +{"model":"gemma3","created_at":"2025-10-26T17:21:21.207235Z","response":".","done":false} +{"model":"gemma3","created_at":"2025-10-26T17:21:21.219166Z","response":"I","done":false} +{"model":"gemma3","created_at":"2025-10-26T17:21:21.231094Z","response":"can","done":false} +{"error":"an error was encountered while running the model"} +``` diff --git a/docs/api/index.mdx b/docs/api/index.mdx new file mode 100644 index 00000000..bc8a9ad7 --- /dev/null +++ b/docs/api/index.mdx @@ -0,0 +1,47 @@ +--- +title: Introduction +--- + +Ollama's API allows you to run and interact with models programatically. 
+ +## Get started + +If you're just getting started, follow the [quickstart](/quickstart) documentation to get up and running with Ollama's API. + +## Base URL + +After installation, Ollama's API is served by default at: + +``` +http://localhost:11434/api +``` + +For running cloud models on **ollama.com**, the same API is available with the following base URL: + +``` +https://ollama.com/api +``` + +## Example request + +Once Ollama is running, its API is automatically available and can be accessed via `curl`: + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "prompt": "Why is the sky blue?" +}' +``` + +## Libraries + +Ollama has official libraries for Python and JavaScript: + +- [Python](https://github.com/ollama/ollama-python) +- [JavaScript](https://github.com/ollama/ollama-js) + +Several community-maintained libraries are available for Ollama. For a full list, see the [Ollama GitHub repository](https://github.com/ollama/ollama?tab=readme-ov-file#libraries-1). + +## Versioning + +Ollama's API isn't strictly versioned, but the API is expected to be stable and backwards compatible. Deprecations are rare and will be announced in the [release notes](https://github.com/ollama/ollama/releases). diff --git a/docs/openai.md b/docs/api/openai-compatibility.mdx similarity index 73% rename from docs/openai.md rename to docs/api/openai-compatibility.mdx index 26930124..8329934a 100644 --- a/docs/openai.md +++ b/docs/api/openai-compatibility.mdx @@ -1,9 +1,8 @@ -# OpenAI compatibility +--- +title: OpenAI compatibility +--- -> [!NOTE] -> OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md). - -Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama. +Ollama provides compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama. ## Usage @@ -100,49 +99,50 @@ except Exception as e: ### OpenAI JavaScript library ```javascript -import OpenAI from 'openai' +import OpenAI from "openai"; const openai = new OpenAI({ - baseURL: 'http://localhost:11434/v1/', + baseURL: "http://localhost:11434/v1/", // required but ignored - apiKey: 'ollama', -}) + apiKey: "ollama", +}); const chatCompletion = await openai.chat.completions.create({ - messages: [{ role: 'user', content: 'Say this is a test' }], - model: 'llama3.2', -}) + messages: [{ role: "user", content: "Say this is a test" }], + model: "llama3.2", +}); const response = await openai.chat.completions.create({ - model: "llava", - messages: [ + model: "llava", + messages: [ + { + role: "user", + content: [ + { type: "text", text: "What's in this image?" }, { - role: "user", - content: [ - { type: "text", text: "What's in this image?" 
}, - { - type: "image_url", - image_url: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5x
gAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC", - }, - ], + type: "image_url", + image_url: + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSk
PeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC", }, - ], -}) + ], + }, + ], +}); const completion = await openai.completions.create({ - model: "llama3.2", - prompt: "Say this is a test.", -}) + model: "llama3.2", + prompt: "Say this is a test.", +}); -const listCompletion = await openai.models.list() +const listCompletion = await openai.models.list(); -const model = await openai.models.retrieve("llama3.2") +const model = await openai.models.retrieve("llama3.2"); const embedding = await openai.embeddings.create({ model: "all-minilm", input: ["why is the sky blue?", "why is the grass green?"], -}) +}); ``` ### `curl` @@ -306,8 +306,8 @@ curl http://localhost:11434/v1/embeddings \ - [x] array of strings - [ ] array of tokens - [ ] array of token arrays -- [ ] `encoding format` -- [ ] `dimensions` +- [x] `encoding format` +- [x] 
`dimensions`
- [ ] `user`

## Models

@@ -365,4 +365,4 @@ curl http://localhost:11434/v1/chat/completions \
    }
  ]
}'
-```
+```
\ No newline at end of file
diff --git a/docs/api/streaming.mdx b/docs/api/streaming.mdx
new file mode 100644
index 00000000..ad77f810
--- /dev/null
+++ b/docs/api/streaming.mdx
@@ -0,0 +1,35 @@
---
title: Streaming
---

Certain API endpoints stream responses by default, such as `/api/generate`. These responses are provided in the newline-delimited JSON format (i.e. the `application/x-ndjson` content type). For example:

```json
{"model":"gemma3","created_at":"2025-10-26T17:15:24.097767Z","response":"That","done":false}
{"model":"gemma3","created_at":"2025-10-26T17:15:24.109172Z","response":"'","done":false}
{"model":"gemma3","created_at":"2025-10-26T17:15:24.121485Z","response":"s","done":false}
{"model":"gemma3","created_at":"2025-10-26T17:15:24.132802Z","response":" a","done":false}
{"model":"gemma3","created_at":"2025-10-26T17:15:24.143931Z","response":" fantastic","done":false}
{"model":"gemma3","created_at":"2025-10-26T17:15:24.155176Z","response":" question","done":false}
{"model":"gemma3","created_at":"2025-10-26T17:15:24.166576Z","response":"!","done":true,"done_reason":"stop"}
```

## Disabling streaming

Streaming can be disabled by providing `{"stream": false}` in the request body for any endpoint that supports streaming. Responses are then returned in the `application/json` format instead:

```json
{"model":"gemma3","created_at":"2025-10-26T17:15:24.166576Z","response":"That's a fantastic question!","done":true}
```

## When to use streaming vs non-streaming

**Streaming (default)**:
  - Real-time response generation
  - Lower perceived latency
  - Better for long generations

**Non-streaming**:
  - Simpler to process
  - Better for short responses or structured outputs
  - Easier to handle in some applications
\ No newline at end of file
diff --git a/docs/api/usage.mdx b/docs/api/usage.mdx
new file mode 100644
index 00000000..8317ca84
--- /dev/null
+++ b/docs/api/usage.mdx
@@ -0,0 +1,36 @@
---
title: Usage
---

Ollama's API responses include metrics that can be used to measure performance and model usage:

* `total_duration`: How long the response took to generate
* `load_duration`: How long the model took to load
* `prompt_eval_count`: How many input tokens were processed
* `prompt_eval_duration`: How long it took to evaluate the prompt
* `eval_count`: How many output tokens were generated
* `eval_duration`: How long it took to generate the output tokens

All timing values are measured in nanoseconds.

## Example response

For endpoints that return usage metrics, the response body includes these fields. For example, a non-streaming call to `/api/generate` may return the following response:

```json
{
  "model": "gemma3",
  "created_at": "2025-10-17T23:14:07.414671Z",
  "response": "Hello! How can I help you today?",
  "done": true,
  "done_reason": "stop",
  "total_duration": 174560334,
  "load_duration": 101397084,
  "prompt_eval_count": 11,
  "prompt_eval_duration": 13074791,
  "eval_count": 18,
  "eval_duration": 52479709
}
```

For endpoints that return **streaming responses**, usage fields are included as part of the final chunk, where `done` is `true`.
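## Example: tokens per second

The sketch below shows one way to turn these fields into tokens-per-second figures from a non-streaming `/api/generate` call. It assumes a local server on the default port and that `gemma3` has already been pulled; only the Python standard library is used.

```python
import json
from urllib import request

req = request.Request(
    'http://localhost:11434/api/generate',
    data=json.dumps({'model': 'gemma3', 'prompt': 'Why is the sky blue?', 'stream': False}).encode(),
    headers={'Content-Type': 'application/json'},
)
with request.urlopen(req) as resp:
    data = json.load(resp)

NS_PER_S = 1e9  # all durations are reported in nanoseconds

# prompt_eval_count input tokens were processed in prompt_eval_duration nanoseconds
prefill_tps = data['prompt_eval_count'] / (data['prompt_eval_duration'] / NS_PER_S)
# eval_count output tokens were generated in eval_duration nanoseconds
decode_tps = data['eval_count'] / (data['eval_duration'] / NS_PER_S)

print(f"total time: {data['total_duration'] / NS_PER_S:.2f} s")
print(f"prompt eval: {prefill_tps:.1f} tokens/s, generation: {decode_tps:.1f} tokens/s")
```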
diff --git a/docs/capabilities/embeddings.mdx b/docs/capabilities/embeddings.mdx new file mode 100644 index 00000000..99a57748 --- /dev/null +++ b/docs/capabilities/embeddings.mdx @@ -0,0 +1,113 @@ +--- +title: Embeddings +description: Generate text embeddings for semantic search, retrieval, and RAG. +--- + +Embeddings turn text into numeric vectors you can store in a vector database, search with cosine similarity, or use in RAG pipelines. The vector length depends on the model (typically 384–1024 dimensions). + +## Recommended models + +- [embeddinggemma](https://ollama.com/library/embeddinggemma) +- [qwen3-embedding](https://ollama.com/library/qwen3-embedding) +- [all-minilm](https://ollama.com/library/all-minilm) + +## Generate embeddings + +Use `/api/embed` with a single string. + + + + ```shell + curl -X POST http://localhost:11434/api/embed \ + -H "Content-Type: application/json" \ + -d '{ + "model": "embeddinggemma", + "input": "The quick brown fox jumps over the lazy dog." + }' + ``` + + + ```python + import ollama + + single = ollama.embed( + model='embeddinggemma', + input='The quick brown fox jumps over the lazy dog.' + ) + print(len(single['embeddings'][0])) # vector length + ``` + + + ```javascript + import ollama from 'ollama' + + const single = await ollama.embed({ + model: 'embeddinggemma', + input: 'The quick brown fox jumps over the lazy dog.', + }) + console.log(single.embeddings[0].length) // vector length + ``` + + + + + The `/api/embed` endpoint returns L2‑normalized (unit‑length) vectors. + + +## Generate a batch of embeddings + +Pass an array of strings to `input`. + + + + ```shell + curl -X POST http://localhost:11434/api/embed \ + -H "Content-Type: application/json" \ + -d '{ + "model": "embeddinggemma", + "input": [ + "First sentence", + "Second sentence", + "Third sentence" + ] + }' + ``` + + + ```python + import ollama + + batch = ollama.embed( + model='embeddinggemma', + input=[ + 'The quick brown fox jumps over the lazy dog.', + 'The five boxing wizards jump quickly.', + 'Jackdaws love my big sphinx of quartz.', + ] + ) + print(len(batch['embeddings'])) # number of vectors + ``` + + + ```javascript + import ollama from 'ollama' + + const batch = await ollama.embed({ + model: 'embeddinggemma', + input: [ + 'The quick brown fox jumps over the lazy dog.', + 'The five boxing wizards jump quickly.', + 'Jackdaws love my big sphinx of quartz.', + ], + }) + console.log(batch.embeddings.length) // number of vectors + ``` + + + +## Tips + +- Use cosine similarity for most semantic search use cases. +- Use the same embedding model for both indexing and querying. + + diff --git a/docs/capabilities/streaming.mdx b/docs/capabilities/streaming.mdx new file mode 100644 index 00000000..1467afcd --- /dev/null +++ b/docs/capabilities/streaming.mdx @@ -0,0 +1,99 @@ +--- +title: Streaming +--- + +Streaming allows you to render text as it is produced by the model. + +Streaming is enabled by default through the REST API, but disabled by default in the SDKs. + +To enable streaming in the SDKs, set the `stream` parameter to `True`. + +## Key streaming concepts +1. Chatting: Stream partial assistant messages. Each chunk includes the `content` so you can render messages as they arrive. +1. Thinking: Thinking-capable models emit a `thinking` field alongside regular content in each chunk. Detect this field in streaming chunks to show or hide reasoning traces before the final answer arrives. +1. 
Tool calling: Watch for streamed `tool_calls` in each chunk, execute the requested tool, and append the tool outputs back into the conversation.

## Handling streamed chunks

It is necessary to accumulate the partial fields in order to maintain the history of the conversation. This is particularly important for tool calling, where the thinking, the tool call from the model, and the executed tool result must all be passed back to the model in the next request.

```python
from ollama import chat

stream = chat(
    model='qwen3',
    messages=[{'role': 'user', 'content': 'What is 17 × 23?'}],
    stream=True,
)

in_thinking = False
content = ''
thinking = ''
for chunk in stream:
    if chunk.message.thinking:
        if not in_thinking:
            in_thinking = True
            print('Thinking:\n', end='', flush=True)
        print(chunk.message.thinking, end='', flush=True)
        # accumulate the partial thinking
        thinking += chunk.message.thinking
    elif chunk.message.content:
        if in_thinking:
            in_thinking = False
            print('\n\nAnswer:\n', end='', flush=True)
        print(chunk.message.content, end='', flush=True)
        # accumulate the partial content
        content += chunk.message.content

# append the accumulated fields to the messages for the next request
new_messages = [{'role': 'assistant', 'thinking': thinking, 'content': content}]
```

```javascript
import ollama from 'ollama'

async function main() {
  const stream = await ollama.chat({
    model: 'qwen3',
    messages: [{ role: 'user', content: 'What is 17 × 23?' }],
    stream: true,
  })

  let inThinking = false
  let content = ''
  let thinking = ''

  for await (const chunk of stream) {
    if (chunk.message.thinking) {
      if (!inThinking) {
        inThinking = true
        process.stdout.write('Thinking:\n')
      }
      process.stdout.write(chunk.message.thinking)
      // accumulate the partial thinking
      thinking += chunk.message.thinking
    } else if (chunk.message.content) {
      if (inThinking) {
        inThinking = false
        process.stdout.write('\n\nAnswer:\n')
      }
      process.stdout.write(chunk.message.content)
      // accumulate the partial content
      content += chunk.message.content
    }
  }

  // append the accumulated fields to the messages for the next request
  const newMessages = [{ role: 'assistant', thinking, content }]
}

main().catch(console.error)
```
\ No newline at end of file
diff --git a/docs/capabilities/structured-outputs.mdx b/docs/capabilities/structured-outputs.mdx
new file mode 100644
index 00000000..da74e597
--- /dev/null
+++ b/docs/capabilities/structured-outputs.mdx
@@ -0,0 +1,194 @@
---
title: Structured Outputs
---

Structured outputs let you enforce a JSON schema on model responses so you can reliably extract structured data, describe images, or keep every reply consistent.

## Generating structured JSON

```shell
curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
  "model": "gpt-oss",
  "messages": [{"role": "user", "content": "Tell me about Canada in one line"}],
  "stream": false,
  "format": "json"
}'
```

```python
from ollama import chat

response = chat(
    model='gpt-oss',
    messages=[{'role': 'user', 'content': 'Tell me about Canada.'}],
    format='json'
)
print(response.message.content)
```

```javascript
import ollama from 'ollama'

const response = await ollama.chat({
  model: 'gpt-oss',
  messages: [{ role: 'user', content: 'Tell me about Canada.'
}], + format: 'json' + }) + console.log(response.message.content) + ``` + + + +## Generating structured JSON with a schema + +Provide a JSON schema to the `format` field. + + + It is ideal to also pass the JSON schema as a string in the prompt to ground the model's response. + + + + + ```shell + curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "gpt-oss", + "messages": [{"role": "user", "content": "Tell me about Canada."}], + "stream": false, + "format": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "capital": {"type": "string"}, + "languages": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["name", "capital", "languages"] + } + }' + ``` + + + Use Pydantic models and pass `model_json_schema()` to `format`, then validate the response: + + ```python + from ollama import chat + from pydantic import BaseModel + + class Country(BaseModel): + name: str + capital: str + languages: list[str] + + response = chat( + model='gpt-oss', + messages=[{'role': 'user', 'content': 'Tell me about Canada.'}], + format=Country.model_json_schema(), + ) + + country = Country.model_validate_json(response.message.content) + print(country) + ``` + + + Serialize a Zod schema with `zodToJsonSchema()` and parse the structured response: + + ```javascript + import ollama from 'ollama' + import { z } from 'zod' + import { zodToJsonSchema } from 'zod-to-json-schema' + + const Country = z.object({ + name: z.string(), + capital: z.string(), + languages: z.array(z.string()), + }) + + const response = await ollama.chat({ + model: 'gpt-oss', + messages: [{ role: 'user', content: 'Tell me about Canada.' }], + format: zodToJsonSchema(Country), + }) + + const country = Country.parse(JSON.parse(response.message.content)) + console.log(country) + ``` + + + +## Example: Extract structured data + +Define the objects you want returned and let the model populate the fields: + +```python +from ollama import chat +from pydantic import BaseModel + +class Pet(BaseModel): + name: str + animal: str + age: int + color: str | None + favorite_toy: str | None + +class PetList(BaseModel): + pets: list[Pet] + +response = chat( + model='gpt-oss', + messages=[{'role': 'user', 'content': 'I have two cats named Luna and Loki...'}], + format=PetList.model_json_schema(), +) + +pets = PetList.model_validate_json(response.message.content) +print(pets) +``` + +## Example: Vision with structured outputs + +Vision models accept the same `format` parameter, enabling deterministic descriptions of images: + +```python +from ollama import chat +from pydantic import BaseModel +from typing import Literal, Optional + +class Object(BaseModel): + name: str + confidence: float + attributes: str + +class ImageDescription(BaseModel): + summary: str + objects: list[Object] + scene: str + colors: list[str] + time_of_day: Literal['Morning', 'Afternoon', 'Evening', 'Night'] + setting: Literal['Indoor', 'Outdoor', 'Unknown'] + text_content: Optional[str] = None + +response = chat( + model='gemma3', + messages=[{ + 'role': 'user', + 'content': 'Describe this photo and list the objects you detect.', + 'images': ['path/to/image.jpg'], + }], + format=ImageDescription.model_json_schema(), + options={'temperature': 0}, +) + +image_description = ImageDescription.model_validate_json(response.message.content) +print(image_description) +``` + +## Tips for reliable structured outputs + +- Define schemas with Pydantic (Python) or Zod (JavaScript) so they can be reused for validation. 
+- Lower the temperature (e.g., set it to `0`) for more deterministic completions. +- Structured outputs work through the OpenAI-compatible API via `response_format` diff --git a/docs/capabilities/thinking.mdx b/docs/capabilities/thinking.mdx new file mode 100644 index 00000000..388e9858 --- /dev/null +++ b/docs/capabilities/thinking.mdx @@ -0,0 +1,153 @@ +--- +title: Thinking +--- + +Thinking-capable models emit a `thinking` field that separates their reasoning trace from the final answer. + +Use this capability to audit model steps, animate the model *thinking* in a UI, or hide the trace entirely when you only need the final response. + +## Supported models + +- [Qwen 3](https://ollama.com/library/qwen3) +- [GPT-OSS](https://ollama.com/library/gpt-oss) *(use `think` levels: `low`, `medium`, `high` — the trace cannot be fully disabled)* +- [DeepSeek-v3.1](https://ollama.com/library/deepseek-v3.1) +- [DeepSeek R1](https://ollama.com/library/deepseek-r1) +- Browse the latest additions under [thinking models](https://ollama.com/search?c=thinking) + +## Enable thinking in API calls + +Set the `think` field on chat or generate requests. Most models accept booleans (`true`/`false`). + +GPT-OSS instead expects one of `low`, `medium`, or `high` to tune the trace length. + +The `message.thinking` (chat endpoint) or `thinking` (generate endpoint) field contains the reasoning trace while `message.content` / `response` holds the final answer. + + + + ```shell + curl http://localhost:11434/api/chat -d '{ + "model": "qwen3", + "messages": [{ + "role": "user", + "content": "How many letter r are in strawberry?" + }], + "think": true, + "stream": false + }' + ``` + + + ```python + from ollama import chat + + response = chat( + model='qwen3', + messages=[{'role': 'user', 'content': 'How many letter r are in strawberry?'}], + think=True, + stream=False, + ) + + print('Thinking:\n', response.message.thinking) + print('Answer:\n', response.message.content) + ``` + + + ```javascript + import ollama from 'ollama' + + const response = await ollama.chat({ + model: 'deepseek-r1', + messages: [{ role: 'user', content: 'How many letter r are in strawberry?' }], + think: true, + stream: false, + }) + + console.log('Thinking:\n', response.message.thinking) + console.log('Answer:\n', response.message.content) + ``` + + + + + GPT-OSS requires `think` to be set to `"low"`, `"medium"`, or `"high"`. Passing `true`/`false` is ignored for that model. + + +## Stream the reasoning trace + +Thinking streams interleave reasoning tokens before answer tokens. Detect the first `thinking` chunk to render a "thinking" section, then switch to the final reply once `message.content` arrives. + + + + ```python + from ollama import chat + + stream = chat( + model='qwen3', + messages=[{'role': 'user', 'content': 'What is 17 × 23?'}], + think=True, + stream=True, + ) + + in_thinking = False + + for chunk in stream: + if chunk.message.thinking and not in_thinking: + in_thinking = True + print('Thinking:\n', end='') + + if chunk.message.thinking: + print(chunk.message.thinking, end='') + elif chunk.message.content: + if in_thinking: + print('\n\nAnswer:\n', end='') + in_thinking = False + print(chunk.message.content, end='') + + ``` + + + ```javascript + import ollama from 'ollama' + + async function main() { + const stream = await ollama.chat({ + model: 'qwen3', + messages: [{ role: 'user', content: 'What is 17 × 23?' 
}], + think: true, + stream: true, + }) + + let inThinking = false + + for await (const chunk of stream) { + if (chunk.message.thinking && !inThinking) { + inThinking = true + process.stdout.write('Thinking:\n') + } + + if (chunk.message.thinking) { + process.stdout.write(chunk.message.thinking) + } else if (chunk.message.content) { + if (inThinking) { + process.stdout.write('\n\nAnswer:\n') + inThinking = false + } + process.stdout.write(chunk.message.content) + } + } + } + + main() + ``` + + + +## CLI quick reference + +- Enable thinking for a single run: `ollama run deepseek-r1 --think "Where should I visit in Lisbon?"` +- Disable thinking: `ollama run deepseek-r1 --think=false "Summarize this article"` +- Hide the trace while still using a thinking model: `ollama run deepseek-r1 --hidethinking "Is 9.9 bigger or 9.11?"` +- Inside interactive sessions, toggle with `/set think` or `/set nothink`. +- GPT-OSS only accepts levels: `ollama run gpt-oss --think=low "Draft a headline"` (replace `low` with `medium` or `high` as needed). + +Thinking is enabled by default in the CLI and API for supported models. diff --git a/docs/capabilities/tool-calling.mdx b/docs/capabilities/tool-calling.mdx new file mode 100644 index 00000000..ae1ff959 --- /dev/null +++ b/docs/capabilities/tool-calling.mdx @@ -0,0 +1,777 @@ +--- +title: Tool calling +--- + +Ollama supports tool calling (also known as function calling) which allows a model to invoke tools and incorporate their results into its replies. + +## Calling a single tool +Invoke a single tool and include its response in a follow-up request. + +Also known as "single-shot" tool calling. + + + + + ```shell + curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "qwen3", + "messages": [{"role": "user", "content": "What's the temperature in New York?"}], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_temperature", + "description": "Get the current temperature for a city", + "parameters": { + "type": "object", + "required": ["city"], + "properties": { + "city": {"type": "string", "description": "The name of the city"} + } + } + } + } + ] + }' + ``` + + **Generate a response with a single tool result** + ```shell + curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "qwen3", + "messages": [ + {"role": "user", "content": "What's the temperature in New York?"}, + { + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "function": { + "index": 0, + "name": "get_temperature", + "arguments": {"city": "New York"} + } + } + ] + }, + {"role": "tool", "tool_name": "get_temperature", "content": "22°C"} + ], + "stream": false + }' + ``` + + + Install the Ollama Python SDK: + ```bash + # with pip + pip install ollama -U + + # with uv + uv add ollama + ``` + + ```python + from ollama import chat + + def get_temperature(city: str) -> str: + """Get the current temperature for a city + + Args: + city: The name of the city + + Returns: + The current temperature for the city + """ + temperatures = { + "New York": "22°C", + "London": "15°C", + "Tokyo": "18°C", + } + return temperatures.get(city, "Unknown") + + messages = [{"role": "user", "content": "What's the temperature in New York?"}] + + # pass functions directly as tools in the tools list or as a JSON schema + response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True) + + messages.append(response.message) + if response.message.tool_calls: + # only recommended 
for models which only return a single tool call + call = response.message.tool_calls[0] + result = get_temperature(**call.function.arguments) + # add the tool result to the messages + messages.append({"role": "tool", "tool_name": call.function.name, "content": str(result)}) + + final_response = chat(model="qwen3", messages=messages, tools=[get_temperature], think=True) + print(final_response.message.content) + ``` + + + Install the Ollama JavaScript library: + ```bash + # with npm + npm i ollama + + # with bun + bun i ollama + ``` + + ```typescript + import ollama from 'ollama' + + function getTemperature(city: string): string { + const temperatures: Record = { + 'New York': '22°C', + 'London': '15°C', + 'Tokyo': '18°C', + } + return temperatures[city] ?? 'Unknown' + } + + const tools = [ + { + type: 'function', + function: { + name: 'get_temperature', + description: 'Get the current temperature for a city', + parameters: { + type: 'object', + required: ['city'], + properties: { + city: { type: 'string', description: 'The name of the city' }, + }, + }, + }, + }, + ] + + const messages = [{ role: 'user', content: "What's the temperature in New York?" }] + + const response = await ollama.chat({ + model: 'qwen3', + messages, + tools, + think: true, + }) + + messages.push(response.message) + if (response.message.tool_calls?.length) { + // only recommended for models which only return a single tool call + const call = response.message.tool_calls[0] + const args = call.function.arguments as { city: string } + const result = getTemperature(args.city) + // add the tool result to the messages + messages.push({ role: 'tool', tool_name: call.function.name, content: result }) + + // generate the final response + const finalResponse = await ollama.chat({ model: 'qwen3', messages, tools, think: true }) + console.log(finalResponse.message.content) + } + ``` + + + +## Parallel tool calling + + + + Request multiple tool calls in parallel, then send all tool responses back to the model. 
+ + ```shell + curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "qwen3", + "messages": [{"role": "user", "content": "What are the current weather conditions and temperature in New York and London?"}], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_temperature", + "description": "Get the current temperature for a city", + "parameters": { + "type": "object", + "required": ["city"], + "properties": { + "city": {"type": "string", "description": "The name of the city"} + } + } + } + }, + { + "type": "function", + "function": { + "name": "get_conditions", + "description": "Get the current weather conditions for a city", + "parameters": { + "type": "object", + "required": ["city"], + "properties": { + "city": {"type": "string", "description": "The name of the city"} + } + } + } + } + ] + }' + ``` + + **Generate a response with multiple tool results** + ```shell + curl -s http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "qwen3", + "messages": [ + {"role": "user", "content": "What are the current weather conditions and temperature in New York and London?"}, + { + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "function": { + "index": 0, + "name": "get_temperature", + "arguments": {"city": "New York"} + } + }, + { + "type": "function", + "function": { + "index": 1, + "name": "get_conditions", + "arguments": {"city": "New York"} + } + }, + { + "type": "function", + "function": { + "index": 2, + "name": "get_temperature", + "arguments": {"city": "London"} + } + }, + { + "type": "function", + "function": { + "index": 3, + "name": "get_conditions", + "arguments": {"city": "London"} + } + } + ] + }, + {"role": "tool", "tool_name": "get_temperature", "content": "22°C"}, + {"role": "tool", "tool_name": "get_conditions", "content": "Partly cloudy"}, + {"role": "tool", "tool_name": "get_temperature", "content": "15°C"}, + {"role": "tool", "tool_name": "get_conditions", "content": "Rainy"} + ], + "stream": false + }' + ``` + + + ```python + from ollama import chat + + def get_temperature(city: str) -> str: + """Get the current temperature for a city + + Args: + city: The name of the city + + Returns: + The current temperature for the city + """ + temperatures = { + "New York": "22°C", + "London": "15°C", + "Tokyo": "18°C" + } + return temperatures.get(city, "Unknown") + + def get_conditions(city: str) -> str: + """Get the current weather conditions for a city + + Args: + city: The name of the city + + Returns: + The current weather conditions for the city + """ + conditions = { + "New York": "Partly cloudy", + "London": "Rainy", + "Tokyo": "Sunny" + } + return conditions.get(city, "Unknown") + + + messages = [{'role': 'user', 'content': 'What are the current weather conditions and temperature in New York and London?'}] + + # The python client automatically parses functions as a tool schema so we can pass them directly + # Schemas can be passed directly in the tools list as well + response = chat(model='qwen3', messages=messages, tools=[get_temperature, get_conditions], think=True) + + # add the assistant message to the messages + messages.append(response.message) + if response.message.tool_calls: + # process each tool call + for call in response.message.tool_calls: + # execute the appropriate tool + if call.function.name == 'get_temperature': + result = get_temperature(**call.function.arguments) + elif call.function.name == 'get_conditions': + result = 
get_conditions(**call.function.arguments) + else: + result = 'Unknown tool' + # add the tool result to the messages + messages.append({'role': 'tool', 'tool_name': call.function.name, 'content': str(result)}) + + # generate the final response + final_response = chat(model='qwen3', messages=messages, tools=[get_temperature, get_conditions], think=True) + print(final_response.message.content) + ``` + + + ```typescript + import ollama from 'ollama' + + function getTemperature(city: string): string { + const temperatures: { [key: string]: string } = { + "New York": "22°C", + "London": "15°C", + "Tokyo": "18°C" + } + return temperatures[city] || "Unknown" + } + + function getConditions(city: string): string { + const conditions: { [key: string]: string } = { + "New York": "Partly cloudy", + "London": "Rainy", + "Tokyo": "Sunny" + } + return conditions[city] || "Unknown" + } + + const tools = [ + { + type: 'function', + function: { + name: 'get_temperature', + description: 'Get the current temperature for a city', + parameters: { + type: 'object', + required: ['city'], + properties: { + city: { type: 'string', description: 'The name of the city' }, + }, + }, + }, + }, + { + type: 'function', + function: { + name: 'get_conditions', + description: 'Get the current weather conditions for a city', + parameters: { + type: 'object', + required: ['city'], + properties: { + city: { type: 'string', description: 'The name of the city' }, + }, + }, + }, + } + ] + + const messages = [{ role: 'user', content: 'What are the current weather conditions and temperature in New York and London?' }] + + const response = await ollama.chat({ + model: 'qwen3', + messages, + tools, + think: true + }) + + // add the assistant message to the messages + messages.push(response.message) + if (response.message.tool_calls) { + // process each tool call + for (const call of response.message.tool_calls) { + // execute the appropriate tool + let result: string + if (call.function.name === 'get_temperature') { + const args = call.function.arguments as { city: string } + result = getTemperature(args.city) + } else if (call.function.name === 'get_conditions') { + const args = call.function.arguments as { city: string } + result = getConditions(args.city) + } else { + result = 'Unknown tool' + } + // add the tool result to the messages + messages.push({ role: 'tool', tool_name: call.function.name, content: result }) + } + + // generate the final response + const finalResponse = await ollama.chat({ model: 'qwen3', messages, tools, think: true }) + console.log(finalResponse.message.content) + } + ``` + + + + +## Multi-turn tool calling (Agent loop) + +An agent loop allows the model to decide when to invoke tools and incorporate their results into its replies. + +It also might help to tell the model that it is in a loop and can make multiple tool calls. 
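For example (the exact wording here is only an illustration), a short system message can be prepended to the conversation before entering the loop:

```python
# Hypothetical system prompt; tune the wording for your own tools and model.
messages = [
    {
        'role': 'system',
        'content': (
            'You are running in an agent loop. You may call the provided tools '
            'as many times as you need; each tool result will be appended to the '
            'conversation before you are asked to continue.'
        ),
    },
    {'role': 'user', 'content': 'What is (11434+12341)*412?'},
]
```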
+ + + + ```python + from ollama import chat, ChatResponse + + + def add(a: int, b: int) -> int: + """Add two numbers""" + """ + Args: + a: The first number + b: The second number + + Returns: + The sum of the two numbers + """ + return a + b + + + def multiply(a: int, b: int) -> int: + """Multiply two numbers""" + """ + Args: + a: The first number + b: The second number + + Returns: + The product of the two numbers + """ + return a * b + + + available_functions = { + 'add': add, + 'multiply': multiply, + } + + messages = [{'role': 'user', 'content': 'What is (11434+12341)*412?'}] + while True: + response: ChatResponse = chat( + model='qwen3', + messages=messages, + tools=[add, multiply], + think=True, + ) + messages.append(response.message) + print("Thinking: ", response.message.thinking) + print("Content: ", response.message.content) + if response.message.tool_calls: + for tc in response.message.tool_calls: + if tc.function.name in available_functions: + print(f"Calling {tc.function.name} with arguments {tc.function.arguments}") + result = available_functions[tc.function.name](**tc.function.arguments) + print(f"Result: {result}") + # add the tool result to the messages + messages.append({'role': 'tool', 'tool_name': tc.function.name, 'content': str(result)}) + else: + # end the loop when there are no more tool calls + break + # continue the loop with the updated messages + ``` + + + ```typescript + import ollama from 'ollama' + + type ToolName = 'add' | 'multiply' + + function add(a: number, b: number): number { + return a + b + } + + function multiply(a: number, b: number): number { + return a * b + } + + const availableFunctions: Record number> = { + add, + multiply, + } + + const tools = [ + { + type: 'function', + function: { + name: 'add', + description: 'Add two numbers', + parameters: { + type: 'object', + required: ['a', 'b'], + properties: { + a: { type: 'integer', description: 'The first number' }, + b: { type: 'integer', description: 'The second number' }, + }, + }, + }, + }, + { + type: 'function', + function: { + name: 'multiply', + description: 'Multiply two numbers', + parameters: { + type: 'object', + required: ['a', 'b'], + properties: { + a: { type: 'integer', description: 'The first number' }, + b: { type: 'integer', description: 'The second number' }, + }, + }, + }, + }, + ] + + async function agentLoop() { + const messages = [{ role: 'user', content: 'What is (11434+12341)*412?' }] + + while (true) { + const response = await ollama.chat({ + model: 'qwen3', + messages, + tools, + think: true, + }) + + messages.push(response.message) + console.log('Thinking:', response.message.thinking) + console.log('Content:', response.message.content) + + const toolCalls = response.message.tool_calls ?? [] + if (toolCalls.length) { + for (const call of toolCalls) { + const fn = availableFunctions[call.function.name as ToolName] + if (!fn) { + continue + } + + const args = call.function.arguments as { a: number; b: number } + console.log(`Calling ${call.function.name} with arguments`, args) + const result = fn(args.a, args.b) + console.log(`Result: ${result}`) + messages.push({ role: 'tool', tool_name: call.function.name, content: String(result) }) + } + } else { + break + } + } + } + + agentLoop().catch(console.error) + ``` + + + + +## Tool calling with streaming + +When streaming, gather every chunk of `thinking`, `content`, and `tool_calls`, then return those fields together with any tool results in the follow-up request. 
+ + + +```python +from ollama import chat + + +def get_temperature(city: str) -> str: + """Get the current temperature for a city + + Args: + city: The name of the city + + Returns: + The current temperature for the city + """ + temperatures = { + 'New York': '22°C', + 'London': '15°C', + } + return temperatures.get(city, 'Unknown') + + +messages = [{'role': 'user', 'content': "What's the temperature in New York?"}] + +while True: + stream = chat( + model='qwen3', + messages=messages, + tools=[get_temperature], + stream=True, + think=True, + ) + + thinking = '' + content = '' + tool_calls = [] + + done_thinking = False + # accumulate the partial fields + for chunk in stream: + if chunk.message.thinking: + thinking += chunk.message.thinking + print(chunk.message.thinking, end='', flush=True) + if chunk.message.content: + if not done_thinking: + done_thinking = True + print('\n') + content += chunk.message.content + print(chunk.message.content, end='', flush=True) + if chunk.message.tool_calls: + tool_calls.extend(chunk.message.tool_calls) + print(chunk.message.tool_calls) + + # append accumulated fields to the messages + if thinking or content or tool_calls: + messages.append({'role': 'assistant', 'thinking': thinking, 'content': content, 'tool_calls': tool_calls}) + + if not tool_calls: + break + + for call in tool_calls: + if call.function.name == 'get_temperature': + result = get_temperature(**call.function.arguments) + else: + result = 'Unknown tool' + messages.append({'role': 'tool', 'tool_name': call.function.name, 'content': result}) +``` + + + +```typescript +import ollama from 'ollama' + +function getTemperature(city: string): string { + const temperatures: Record = { + 'New York': '22°C', + 'London': '15°C', + } + return temperatures[city] ?? 'Unknown' +} + +const getTemperatureTool = { + type: 'function', + function: { + name: 'get_temperature', + description: 'Get the current temperature for a city', + parameters: { + type: 'object', + required: ['city'], + properties: { + city: { type: 'string', description: 'The name of the city' }, + }, + }, + }, +} + +async function agentLoop() { + const messages = [{ role: 'user', content: "What's the temperature in New York?" 
}] + + while (true) { + const stream = await ollama.chat({ + model: 'qwen3', + messages, + tools: [getTemperatureTool], + stream: true, + think: true, + }) + + let thinking = '' + let content = '' + const toolCalls: any[] = [] + let doneThinking = false + + for await (const chunk of stream) { + if (chunk.message.thinking) { + thinking += chunk.message.thinking + process.stdout.write(chunk.message.thinking) + } + if (chunk.message.content) { + if (!doneThinking) { + doneThinking = true + process.stdout.write('\n') + } + content += chunk.message.content + process.stdout.write(chunk.message.content) + } + if (chunk.message.tool_calls?.length) { + toolCalls.push(...chunk.message.tool_calls) + console.log(chunk.message.tool_calls) + } + } + + if (thinking || content || toolCalls.length) { + messages.push({ role: 'assistant', thinking, content, tool_calls: toolCalls } as any) + } + + if (!toolCalls.length) { + break + } + + for (const call of toolCalls) { + if (call.function.name === 'get_temperature') { + const args = call.function.arguments as { city: string } + const result = getTemperature(args.city) + messages.push({ role: 'tool', tool_name: call.function.name, content: result } ) + } else { + messages.push({ role: 'tool', tool_name: call.function.name, content: 'Unknown tool' } ) + } + } + } +} + +agentLoop().catch(console.error) + ``` + + + +This loop streams the assistant response, accumulates partial fields, passes them back together, and appends the tool results so the model can complete its answer. + + +## Using functions as tools with Ollama Python SDK +The Python SDK automatically parses functions as a tool schema so we can pass them directly. +Schemas can still be passed if needed. + +```python +from ollama import chat + +def get_temperature(city: str) -> str: + """Get the current temperature for a city + + Args: + city: The name of the city + + Returns: + The current temperature for the city + """ + temperatures = { + 'New York': '22°C', + 'London': '15°C', + } + return temperatures.get(city, 'Unknown') + +available_functions = { + 'get_temperature': get_temperature, +} +# directly pass the function as part of the tools list +response = chat(model='qwen3', messages=messages, tools=available_functions.values(), think=True) +``` diff --git a/docs/capabilities/vision.mdx b/docs/capabilities/vision.mdx new file mode 100644 index 00000000..3342eae2 --- /dev/null +++ b/docs/capabilities/vision.mdx @@ -0,0 +1,85 @@ +--- +title: Vision +--- + +Vision models accept images alongside text so the model can describe, classify, and answer questions about what it sees. + +## Quick start + +```shell +ollama run gemma3 ./image.png whats in this image? +``` + + +## Usage with Ollama's API +Provide an `images` array. SDKs accept file paths, URLs or raw bytes while the REST API expects base64-encoded image data. + + + + + ```shell + # 1. Download a sample image + curl -L -o test.jpg "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg" + + # 2. Encode the image + IMG=$(base64 < test.jpg | tr -d '\n') + + # 3. 
Send it to Ollama + curl -X POST http://localhost:11434/api/chat \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gemma3", + "messages": [{ + "role": "user", + "content": "What is in this image?", + "images": ["'"$IMG"'"] + }], + "stream": false + }' + " + ``` + + + ```python + from ollama import chat + # from pathlib import Path + + # Pass in the path to the image + path = input('Please enter the path to the image: ') + + # You can also pass in base64 encoded image data + # img = base64.b64encode(Path(path).read_bytes()).decode() + # or the raw bytes + # img = Path(path).read_bytes() + + response = chat( + model='gemma3', + messages=[ + { + 'role': 'user', + 'content': 'What is in this image? Be concise.', + 'images': [path], + } + ], + ) + + print(response.message.content) + ``` + + + ```javascript + import ollama from 'ollama' + + const imagePath = '/absolute/path/to/image.jpg' + const response = await ollama.chat({ + model: 'gemma3', + messages: [ + { role: 'user', content: 'What is in this image?', images: [imagePath] } + ], + stream: false, + }) + + console.log(response.message.content) + ``` + + diff --git a/docs/capabilities/web-search.mdx b/docs/capabilities/web-search.mdx new file mode 100644 index 00000000..641ef381 --- /dev/null +++ b/docs/capabilities/web-search.mdx @@ -0,0 +1,360 @@ +--- +title: Web search +--- + +Ollama's web search API can be used to augment models with the latest information to reduce hallucinations and improve accuracy. + +Web search is provided as a REST API with deeper tool integrations in the Python and JavaScript libraries. This also enables models like OpenAI’s gpt-oss models to conduct long-running research tasks. + +## Authentication + +For access to Ollama's web search API, create an [API key](https://ollama.com/settings/keys). A free Ollama account is required. + +## Web search API + +Performs a web search for a single query and returns relevant results. + +### Request + +`POST https://ollama.com/api/web_search` + +- `query` (string, required): the search query string +- `max_results` (integer, optional): maximum results to return (default 5, max 10) + +### Response + +Returns an object containing: + +- `results` (array): array of search result objects, each containing: + - `title` (string): the title of the web page + - `url` (string): the URL of the web page + - `content` (string): relevant content snippet from the web page + +### Examples + + + Ensure OLLAMA_API_KEY is set or it must be passed in the Authorization header. + + +#### cURL Request + +```bash +curl https://ollama.com/api/web_search \ + --header "Authorization: Bearer $OLLAMA_API_KEY" \ + -d '{ + "query":"what is ollama?" + }' +``` + +**Response** + +```json +{ + "results": [ + { + "title": "Ollama", + "url": "https://ollama.com/", + "content": "Cloud models are now available..." + }, + { + "title": "What is Ollama? Introduction to the AI model management tool", + "url": "https://www.hostinger.com/tutorials/what-is-ollama", + "content": "Ariffud M. 6min Read..." + }, + { + "title": "Ollama Explained: Transforming AI Accessibility and Language ...", + "url": "https://www.geeksforgeeks.org/artificial-intelligence/ollama-explained-transforming-ai-accessibility-and-language-processing/", + "content": "Data Science Data Science Projects Data Analysis..." 
+ } + ] +} +``` + +#### Python library + +```python +import ollama +response = ollama.web_search("What is Ollama?") +print(response) +``` + +**Example output** + +```python + +results = [ + { + "title": "Ollama", + "url": "https://ollama.com/", + "content": "Cloud models are now available in Ollama..." + }, + { + "title": "What is Ollama? Features, Pricing, and Use Cases - Walturn", + "url": "https://www.walturn.com/insights/what-is-ollama-features-pricing-and-use-cases", + "content": "Our services..." + }, + { + "title": "Complete Ollama Guide: Installation, Usage & Code Examples", + "url": "https://collabnix.com/complete-ollama-guide-installation-usage-code-examples", + "content": "Join our Discord Server..." + } +] + +``` + +More Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/examples/web-search.py) + +#### JavaScript Library + +```tsx +import { Ollama } from "ollama"; + +const client = new Ollama(); +const results = await client.webSearch({ query: "what is ollama?" }); +console.log(JSON.stringify(results, null, 2)); +``` + +**Example output** + +```json +{ + "results": [ + { + "title": "Ollama", + "url": "https://ollama.com/", + "content": "Cloud models are now available..." + }, + { + "title": "What is Ollama? Introduction to the AI model management tool", + "url": "https://www.hostinger.com/tutorials/what-is-ollama", + "content": "Ollama is an open-source tool..." + }, + { + "title": "Ollama Explained: Transforming AI Accessibility and Language Processing", + "url": "https://www.geeksforgeeks.org/artificial-intelligence/ollama-explained-transforming-ai-accessibility-and-language-processing/", + "content": "Ollama is a groundbreaking..." + } + ] +} +``` + +More Ollama [JavaScript example](https://github.com/ollama/ollama-js/blob/main/examples/websearch/websearch-tools.ts) + +## Web fetch API + +Fetches a single web page by URL and returns its content. 
+ +### Request + +`POST https://ollama.com/api/web_fetch` + +- `url` (string, required): the URL to fetch + +### Response + +Returns an object containing: + +- `title` (string): the title of the web page +- `content` (string): the main content of the web page +- `links` (array): array of links found on the page + +### Examples + +#### cURL Request + +```python +curl --request POST \ + --url https://ollama.com/api/web_fetch \ + --header "Authorization: Bearer $OLLAMA_API_KEY" \ + --header 'Content-Type: application/json' \ + --data '{ + "url": "ollama.com" + }' +``` + +**Response** + +```json +{ + "title": "Ollama", + "content": "[Cloud models](https://ollama.com/blog/cloud-models) are now available in Ollama...", + "links": [ + "http://ollama.com/", + "http://ollama.com/models", + "https://github.com/ollama/ollama" + ] + +``` + +#### Python SDK + +```python +from ollama import web_fetch + +result = web_fetch('https://ollama.com') +print(result) +``` + +**Result** + +```python +WebFetchResponse( + title='Ollama', + content='[Cloud models](https://ollama.com/blog/cloud-models) are now available in Ollama\n\n**Chat & build +with open models**\n\n[Download](https://ollama.com/download) [Explore +models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux', + links=['https://ollama.com/', 'https://ollama.com/models', 'https://github.com/ollama/ollama'] +) +``` + +#### JavaScript SDK + +```tsx +import { Ollama } from "ollama"; + +const client = new Ollama(); +const fetchResult = await client.webFetch({ url: "https://ollama.com" }); +console.log(JSON.stringify(fetchResult, null, 2)); +``` + +**Result** + +```json +{ + "title": "Ollama", + "content": "[Cloud models](https://ollama.com/blog/cloud-models) are now available in Ollama...", + "links": [ + "https://ollama.com/", + "https://ollama.com/models", + "https://github.com/ollama/ollama" + ] +} +``` + +## Building a search agent + +Use Ollama’s web search API as a tool to build a mini search agent. + +This example uses Alibaba’s Qwen 3 model with 4B parameters. + +```bash +ollama pull qwen3:4b +``` + +```python +from ollama import chat, web_fetch, web_search + +available_tools = {'web_search': web_search, 'web_fetch': web_fetch} + +messages = [{'role': 'user', 'content': "what is ollama's new engine"}] + +while True: + response = chat( + model='qwen3:4b', + messages=messages, + tools=[web_search, web_fetch], + think=True + ) + if response.message.thinking: + print('Thinking: ', response.message.thinking) + if response.message.content: + print('Content: ', response.message.content) + messages.append(response.message) + if response.message.tool_calls: + print('Tool calls: ', response.message.tool_calls) + for tool_call in response.message.tool_calls: + function_to_call = available_tools.get(tool_call.function.name) + if function_to_call: + args = tool_call.function.arguments + result = function_to_call(**args) + print('Result: ', str(result)[:200]+'...') + # Result is truncated for limited context lengths + messages.append({'role': 'tool', 'content': str(result)[:2000 * 4], 'tool_name': tool_call.function.name}) + else: + messages.append({'role': 'tool', 'content': f'Tool {tool_call.function.name} not found', 'tool_name': tool_call.function.name}) + else: + break +``` + +**Result** + +``` +Thinking: Okay, the user is asking about Ollama's new engine. I need to figure out what they're referring to. 
Ollama is a company that develops large language models, so maybe they've released a new model or an updated version of their existing engine.... + +Tool calls: [ToolCall(function=Function(name='web_search', arguments={'max_results': 3, 'query': 'Ollama new engine'}))] +Result: results=[WebSearchResult(content='# New model scheduling\n\n## September 23, 2025\n\nOllama now includes a significantly improved model scheduling system. Ahead of running a model, Ollama’s new engine + +Thinking: Okay, the user asked about Ollama's new engine. Let me look at the search results. + +First result is from September 23, 2025, talking about new model scheduling. It mentions improved memory management, reduced crashes, better GPU utilization, and multi-GPU performance. Examples show speed improvements and accurate memory reporting. Supported models include gemma3, llama4, qwen3, etc... + +Content: Ollama has introduced two key updates to its engine, both released in 2025: + +1. **Enhanced Model Scheduling (September 23, 2025)** + - **Precision Memory Management**: Exact memory allocation reduces out-of-memory crashes and optimizes GPU utilization. + - **Performance Gains**: Examples show significant speed improvements (e.g., 85.54 tokens/s vs 52.02 tokens/s) and full GPU layer utilization. + - **Multi-GPU Support**: Improved efficiency across multiple GPUs, with accurate memory reporting via tools like `nvidia-smi`. + - **Supported Models**: Includes `gemma3`, `llama4`, `qwen3`, `mistral-small3.2`, and more. + +2. **Multimodal Engine (May 15, 2025)** + - **Vision Support**: First-class support for vision models, including `llama4:scout` (109B parameters), `gemma3`, `qwen2.5vl`, and `mistral-small3.1`. + - **Multimodal Tasks**: Examples include identifying animals in multiple images, answering location-based questions from videos, and document scanning. + +These updates highlight Ollama's focus on efficiency, performance, and expanded capabilities for both text and vision tasks. +``` + +### Context length and agents + +Web search results can return thousands of tokens. It is recommended to increase the context length of the model to at least ~32000 tokens. Search agents work best with full context length. [Ollama's cloud models](https://docs.ollama.com/cloud) run at the full context length. + +## MCP Server + +You can enable web search in any MCP client through the [Python MCP server](https://github.com/ollama/ollama-python/blob/main/examples/web-search-mcp.py). + +### Cline + +Ollama's web search can be integrated with Cline easily using the MCP server configuration. + +`Manage MCP Servers` > `Configure MCP Servers` > Add the following configuration: + +```json +{ + "mcpServers": { + "web_search_and_fetch": { + "type": "stdio", + "command": "uv", + "args": ["run", "path/to/web-search-mcp.py"], + "env": { "OLLAMA_API_KEY": "your_api_key_here" } + } + } +} +``` + +![Cline MCP Configuration](/images/cline-mcp.png) + +### Codex + +Ollama works well with OpenAI's Codex tool. + +Add the following configuration to `~/.codex/config.toml` + +```python +[mcp_servers.web_search] +command = "uv" +args = ["run", "path/to/web-search-mcp.py"] +env = { "OLLAMA_API_KEY" = "your_api_key_here" } +``` + +![Codex MCP Configuration](/images/codex-mcp.png) + +### Goose + +Ollama can integrate with Goose via its MCP feature. 
+ +![Goose MCP Configuration 1](/images/goose-mcp-1.png) + +![Goose MCP Configuration 2](/images/goose-mcp-2.png) + +### Other integrations + +Ollama can be integrated into most of the tools available either through direct integration of Ollama's API, Python / JavaScript libraries, OpenAI compatible API, and MCP server integration. diff --git a/docs/cli.mdx b/docs/cli.mdx new file mode 100644 index 00000000..3081838f --- /dev/null +++ b/docs/cli.mdx @@ -0,0 +1,91 @@ +--- +title: CLI Reference +--- + +### Run a model + +``` +ollama run gemma3 +``` + +#### Multiline input + +For multiline input, you can wrap text with `"""`: + +``` +>>> """Hello, +... world! +... """ +I'm a basic program that prints the famous "Hello, world!" message to the console. +``` + +#### Multimodal models + +``` +ollama run gemma3 "What's in this image? /Users/jmorgan/Desktop/smile.png" +``` + +### Download a model + +``` +ollama pull gemma3 +``` + +### Remove a model + +``` +ollama rm gemma3 +``` + +### List models + +``` +ollama ls +``` + +### Sign in to Ollama + +``` +ollama signin +``` + +### Sign out of Ollama + +``` +ollama signout +``` + +### Create a customized model + +First, create a `Modelfile` + +``` +FROM gemma3 +SYSTEM """You are a happy cat.""" +``` + +Then run `ollama create`: + +``` +ollama create -f Modelfile +``` + +### List running models + +``` +ollama ps +``` + +### Stop a running model + +``` +ollama stop gemma3 +``` + +### Start Ollama + +``` +ollama serve +``` + +To view a list of environment variables that can be set run `ollama serve --help` diff --git a/docs/cloud.md b/docs/cloud.md deleted file mode 100644 index 300e6f5e..00000000 --- a/docs/cloud.md +++ /dev/null @@ -1,40 +0,0 @@ -# Cloud - -| Ollama's cloud is currently in preview. For full documentation, see [Ollama's documentation](https://docs.ollama.com/cloud). - -## Cloud Models - -[Cloud models](https://ollama.com/cloud) are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn’t fit on a personal computer. - -Ollama currently supports the following cloud models, with more coming soon: - -- `gpt-oss:20b-cloud` -- `gpt-oss:120b-cloud` -- `deepseek-v3.1:671b-cloud` -- `qwen3-coder:480b-cloud` - -### Get started - -To run a cloud model, open the terminal and run: - -``` -ollama run gpt-oss:120b-cloud -``` - -To run cloud models with integrations that work with Ollama, first download the cloud model: - -``` -ollama pull qwen3-coder:480b-cloud -``` - -Then sign in to Ollama: - -``` -ollama signin -``` - -Finally, access the model using the model name `qwen3-coder:480b-cloud` via Ollama's local API or tooling. - -## Cloud API access - -Cloud models can also be accessed directly on ollama.com's API. For more information, see the [docs](https://docs.ollama.com/cloud). diff --git a/docs/cloud.mdx b/docs/cloud.mdx new file mode 100644 index 00000000..cea27216 --- /dev/null +++ b/docs/cloud.mdx @@ -0,0 +1,236 @@ +--- +title: Cloud +sidebarTitle: Cloud +--- + +Ollama's cloud is currently in preview. + +## Cloud Models + +Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. 
Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer. + +Ollama currently supports the following cloud models, with more coming soon: + +- `deepseek-v3.1:671b-cloud` +- `gpt-oss:20b-cloud` +- `gpt-oss:120b-cloud` +- `kimi-k2:1t-cloud` +- `qwen3-coder:480b-cloud` +- `glm-4.6:cloud` +- `minimax-m2:cloud` + +### Running Cloud models + +Ollama's cloud models require an account on [ollama.com](https://ollama.com). To sign in or create an account, run: + +``` +ollama signin +``` + + + + +To run a cloud model, open the terminal and run: + +``` +ollama run gpt-oss:120b-cloud +``` + + + + +First, pull a cloud model so it can be accessed: + +``` +ollama pull gpt-oss:120b-cloud +``` + +Next, install [Ollama's Python library](https://github.com/ollama/ollama-python): + +``` +pip install ollama +``` + +Next, create and run a simple Python script: + +```python +from ollama import Client + +client = Client() + +messages = [ + { + 'role': 'user', + 'content': 'Why is the sky blue?', + }, +] + +for part in client.chat('gpt-oss:120b-cloud', messages=messages, stream=True): + print(part['message']['content'], end='', flush=True) +``` + + + + +First, pull a cloud model so it can be accessed: + +``` +ollama pull gpt-oss:120b-cloud +``` + +Next, install [Ollama's JavaScript library](https://github.com/ollama/ollama-js): + +``` +npm i ollama +``` + +Then use the library to run a cloud model: + +```typescript +import { Ollama } from "ollama"; + +const ollama = new Ollama(); + +const response = await ollama.chat({ + model: "gpt-oss:120b-cloud", + messages: [{ role: "user", content: "Explain quantum computing" }], + stream: true, +}); + +for await (const part of response) { + process.stdout.write(part.message.content); +} +``` + + + + +First, pull a cloud model so it can be accessed: + +``` +ollama pull gpt-oss:120b-cloud +``` + +Run the following cURL command to run the command via Ollama's API: + +``` +curl http://localhost:11434/api/chat -d '{ + "model": "gpt-oss:120b-cloud", + "messages": [{ + "role": "user", + "content": "Why is the sky blue?" + }], + "stream": false +}' +``` + + + + +## Cloud API access + +Cloud models can also be accessed directly on ollama.com's API. In this mode, ollama.com acts as a remote Ollama host. + +### Authentication + +For direct access to ollama.com's API, first create an [API key](https://ollama.com/settings/keys). + +Then, set the `OLLAMA_API_KEY` environment variable to your API key. 
+ +``` +export OLLAMA_API_KEY=your_api_key +``` + +### Listing models + +For models available directly via Ollama's API, models can be listed via: + +``` +curl https://ollama.com/api/tags +``` + +### Generating a response + + + + +First, install [Ollama's Python library](https://github.com/ollama/ollama-python) + +``` +pip install ollama +``` + +Then make a request + +```python +import os +from ollama import Client + +client = Client( + host="https://ollama.com", + headers={'Authorization': 'Bearer ' + os.environ.get('OLLAMA_API_KEY')} +) + +messages = [ + { + 'role': 'user', + 'content': 'Why is the sky blue?', + }, +] + +for part in client.chat('gpt-oss:120b', messages=messages, stream=True): + print(part['message']['content'], end='', flush=True) +``` + + + + +First, install [Ollama's JavaScript library](https://github.com/ollama/ollama-js): + +``` +npm i ollama +``` + +Next, make a request to the model: + +```typescript +import { Ollama } from "ollama"; + +const ollama = new Ollama({ + host: "https://ollama.com", + headers: { + Authorization: "Bearer " + process.env.OLLAMA_API_KEY, + }, +}); + +const response = await ollama.chat({ + model: "gpt-oss:120b", + messages: [{ role: "user", content: "Explain quantum computing" }], + stream: true, +}); + +for await (const part of response) { + process.stdout.write(part.message.content); +} +``` + + + + +Generate a response via Ollama's chat API: + +``` +curl https://ollama.com/api/chat \ + -H "Authorization: Bearer $OLLAMA_API_KEY" \ + -d '{ + "model": "gpt-oss:120b", + "messages": [{ + "role": "user", + "content": "Why is the sky blue?" + }], + "stream": false + }' +``` + + + diff --git a/docs/context-length.mdx b/docs/context-length.mdx new file mode 100644 index 00000000..43bcf0d3 --- /dev/null +++ b/docs/context-length.mdx @@ -0,0 +1,38 @@ +--- +title: Context length +--- + +Context length is the maximum number of tokens that the model has access to in memory. + + + The default context length in Ollama is 4096 tokens. + + +Tasks which require large context like web search, agents, and coding tools should be set to at least 32000 tokens. + +## Setting context length + +Setting a larger context length will increase the amount of memory required to run a model. Ensure you have enough VRAM available to increase the context length. + +Cloud models are set to their maximum context length by default. + +### App + +Change the slider in the Ollama app under settings to your desired context length. +![Context length in Ollama app](./images/ollama-settings.png) + +### CLI +If editing the context length for Ollama is not possible, the context length can also be updated when serving Ollama. +``` +OLLAMA_CONTEXT_LENGTH=32000 ollama serve +``` + +### Check allocated context length and model offloading +For best performance, use the maximum context length for a model, and avoid offloading the model to CPU. Verify the split under `PROCESSOR` using `ollama ps`. 
+``` +ollama ps +``` +``` +NAME ID SIZE PROCESSOR CONTEXT UNTIL +gemma3:latest a2af6cc3eb7f 6.6 GB 100% GPU 65536 2 minutes from now +``` diff --git a/docs/docker.md b/docs/docker.mdx similarity index 72% rename from docs/docker.md rename to docs/docker.mdx index dce090a2..22d2bc33 100644 --- a/docs/docker.md +++ b/docs/docker.mdx @@ -1,21 +1,21 @@ -# Ollama Docker image - -### CPU only +## CPU only ```shell docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama ``` -### Nvidia GPU +## Nvidia GPU + Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation). -#### Install with Apt +### Install with Apt + 1. Configure the repository ```shell curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list sudo apt-get update @@ -27,37 +27,40 @@ Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud- sudo apt-get install -y nvidia-container-toolkit ``` -#### Install with Yum or Dnf +### Install with Yum or Dnf + 1. Configure the repository ```shell - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo ``` -2. Install the NVIDIA Container Toolkit packages +2. Install the NVIDIA Container Toolkit packages ```shell sudo yum install -y nvidia-container-toolkit ``` -#### Configure Docker to use Nvidia driver +### Configure Docker to use Nvidia driver ```shell sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker ``` -#### Start the container +### Start the container ```shell docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama ``` -> [!NOTE] -> If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6. + + If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. + Pass the environment variable `JETSON_JETPACK=5` or `JETSON_JETPACK=6` to the container to select version 5 or 6. + -### AMD GPU +## AMD GPU To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command: @@ -65,7 +68,7 @@ To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following c docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm ``` -### Run model locally +## Run model locally Now you can run a model: @@ -73,6 +76,6 @@ Now you can run a model: docker exec -it ollama ollama run llama3.2 ``` -### Try different models +## Try different models More models can be found on the [Ollama library](https://ollama.com/library). 
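Once the container is up, you can also talk to it straight from the host over the published port. A minimal sketch, assuming the `llama3.2` model pulled in the step above (any installed model works):

```shell
# Query the containerized server through the port published with -p 11434:11434
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "stream": false
}'
```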
diff --git a/docs/docs.json b/docs/docs.json new file mode 100644 index 00000000..6cc6606b --- /dev/null +++ b/docs/docs.json @@ -0,0 +1,162 @@ +{ + "$schema": "https://mintlify.com/docs.json", + "name": "Ollama", + "colors": { + "primary": "#000", + "light": "#b5b5b5", + "dark": "#000" + }, + "favicon": "/images/favicon.png", + "logo": { + "light": "/images/logo.png", + "dark": "/images/logo-dark.png", + "href": "https://ollama.com" + }, + "theme": "maple", + "background": { + "color": { + "light": "#ffffff", + "dark": "#000000" + } + }, + "fonts": { + "family": "system-ui", + "heading": { + "family": "system-ui" + }, + "body": { + "family": "system-ui" + } + }, + "styling": { + "codeblocks": "system" + }, + "contextual": { + "options": ["copy"] + }, + "navbar": { + "links": [ + { + "label": "Sign in", + "href": "https://ollama.com/signin" + } + ], + "primary": { + "type": "button", + "label": "Download", + "href": "https://ollama.com/download" + } + }, + "api": { + "playground": { + "display": "simple" + }, + "examples": { + "languages": ["curl"] + } + }, + "redirects": [ + { + "source": "/openai", + "destination": "/api/openai-compatibility" + }, + { + "source": "/api/openai", + "destination": "/api/openai-compatibility" + } + ], + "navigation": { + "tabs": [ + { + "tab": "Documentation", + "groups": [ + { + "group": "Get started", + "pages": [ + "index", + "quickstart", + "/cloud" + ] + }, + { + "group": "Capabilities", + "pages": [ + "/capabilities/streaming", + "/capabilities/thinking", + "/capabilities/structured-outputs", + "/capabilities/vision", + "/capabilities/embeddings", + "/capabilities/tool-calling", + "/capabilities/web-search" + ] + }, + { + "group": "Integrations", + "pages": [ + "/integrations/vscode", + "/integrations/jetbrains", + "/integrations/codex", + "/integrations/cline", + "/integrations/droid", + "/integrations/goose", + "/integrations/zed", + "/integrations/roo-code", + "/integrations/n8n", + "/integrations/xcode" + ] + }, + { + "group": "More information", + "pages": [ + "/cli", + "/modelfile", + "/context-length", + "/linux", + "/macos", + "/windows", + "/docker", + "/import", + "/faq", + "/gpu", + "/troubleshooting" + ] + } + ] + }, + { + "tab": "API Reference", + "openapi": "/openapi.yaml", + "groups": [ + { + "group": "API Reference", + "pages": [ + "/api/index", + "/api/authentication", + "/api/streaming", + "/api/usage", + "/api/errors", + "/api/openai-compatibility" + ] + }, + { + "group": "Endpoints", + "pages": [ + "POST /api/generate", + "POST /api/chat", + "POST /api/embed", + "GET /api/tags", + "GET /api/ps", + "POST /api/show", + "POST /api/create", + "POST /api/copy", + "POST /api/pull", + "POST /api/push", + "DELETE /api/delete", + "GET /api/version" + ] + } + ] + } + ] + } +} diff --git a/docs/examples.md b/docs/examples.md index 25f6563a..7f349f72 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -12,9 +12,3 @@ Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/oll ## OpenAI compatibility examples Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md) - - -## Community examples - -- [LangChain Ollama Python](https://python.langchain.com/docs/integrations/chat/ollama/) -- [LangChain Ollama JS](https://js.langchain.com/docs/integrations/chat/ollama/) diff --git a/docs/faq.md b/docs/faq.mdx similarity index 69% rename from docs/faq.md rename to docs/faq.mdx index 900ffba4..18a80b70 100644 --- a/docs/faq.md +++ b/docs/faq.mdx @@ -1,4 +1,6 @@ -# FAQ +--- +title: FAQ +--- ## How can I 
upgrade Ollama? @@ -20,9 +22,9 @@ Please refer to the [GPU docs](./gpu.md). ## How can I specify the context window size? -By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens. +By default, Ollama uses a context window size of 2048 tokens. -This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: +This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: ```shell OLLAMA_CONTEXT_LENGTH=8192 ollama serve @@ -46,8 +48,6 @@ curl http://localhost:11434/api/generate -d '{ }' ``` -Setting the context length higher may cause the model to not be able to fit onto the GPU which make the model run more slowly. - ## How can I tell if my model was loaded onto the GPU? Use the `ollama ps` command to see what models are currently loaded into memory. @@ -56,17 +56,16 @@ Use the `ollama ps` command to see what models are currently loaded into memory. ollama ps ``` -> **Output**: -> -> ``` -> NAME ID SIZE PROCESSOR CONTEXT UNTIL -> gpt-oss:20b 05afbac4bad6 16 GB 100% GPU 8192 4 minutes from now -> ``` + + **Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB + 100% GPU 4 minutes from now ``` + The `Processor` column will show which memory the model was loaded in to: -* `100% GPU` means the model was loaded entirely into the GPU -* `100% CPU` means the model was loaded entirely in system memory -* `48%/52% CPU/GPU` means the model was loaded partially onto both the GPU and into system memory + +- `100% GPU` means the model was loaded entirely into the GPU +- `100% CPU` means the model was loaded entirely in system memory +- `48%/52% CPU/GPU` means the model was loaded partially onto both the GPU and into system memory ## How do I configure Ollama server? @@ -78,9 +77,9 @@ If Ollama is run as a macOS application, environment variables should be set usi 1. For each environment variable, call `launchctl setenv`. - ```bash - launchctl setenv OLLAMA_HOST "0.0.0.0:11434" - ``` + ```bash + launchctl setenv OLLAMA_HOST "0.0.0.0:11434" + ``` 2. Restart Ollama application. @@ -92,10 +91,10 @@ If Ollama is run as a systemd service, environment variables should be set using 2. For each environment variable, add a line `Environment` under section `[Service]`: - ```ini - [Service] - Environment="OLLAMA_HOST=0.0.0.0:11434" - ``` + ```ini + [Service] + Environment="OLLAMA_HOST=0.0.0.0:11434" + ``` 3. Save and exit. @@ -126,8 +125,10 @@ On Windows, Ollama inherits your user and system environment variables. Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform. -> [!NOTE] -> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server. + + Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only + HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server. + ### How do I use Ollama behind a proxy in Docker? @@ -150,11 +151,9 @@ docker build -t ollama-with-ca . 
docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca ``` -## Does Ollama send my prompts and responses back to ollama.com? +## Does Ollama send my prompts and answers back to ollama.com? -If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored. - -If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine. +No. Ollama runs locally, and conversation data does not leave your machine. ## How can I expose Ollama on my network? @@ -216,7 +215,9 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. -> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama `. + + On Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama `. + Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform. @@ -235,7 +236,7 @@ GPU acceleration is not available for Docker Desktop in macOS due to the lack of This can impact both installing Ollama, as well as downloading models. Open `Control Panel > Networking and Internet > View network status and tasks` and click on `Change adapter settings` on the left panel. Find the `vEthernel (WSL)` adapter, right click and select `Properties`. -Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. *Disable* both of these +Click on `Configure` and open the `Advanced` tab. Search through each of the properties until you find `Large Send Offload Version 2 (IPv4)` and `Large Send Offload Version 2 (IPv6)`. _Disable_ both of these properties. ## How can I preload a model into Ollama to get faster response times? @@ -269,10 +270,11 @@ ollama stop llama3.2 ``` If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to: -* a duration string (such as "10m" or "24h") -* a number in seconds (such as 3600) -* any negative number which will keep the model loaded in memory (e.g. -1 or "-1m") -* '0' which will unload the model immediately after generating a response + +- a duration string (such as "10m" or "24h") +- a number in seconds (such as 3600) +- any negative number which will keep the model loaded in memory (e.g. 
-1 or "-1m") +- '0' which will unload the model immediately after generating a response For example, to preload a model and leave it in memory use: @@ -292,31 +294,31 @@ The `keep_alive` API parameter with the `/api/generate` and `/api/chat` API endp ## How do I manage the maximum number of requests the Ollama server can queue? -If too many requests are sent to the server, it will respond with a 503 error indicating the server is overloaded. You can adjust how many requests may be queue by setting `OLLAMA_MAX_QUEUE`. +If too many requests are sent to the server, it will respond with a 503 error indicating the server is overloaded. You can adjust how many requests may be queue by setting `OLLAMA_MAX_QUEUE`. ## How does Ollama handle concurrent requests? -Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing. +Ollama supports two levels of concurrent processing. If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time. For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing. -If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads. +If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded. As prior models become idle, one or more will be unloaded to make room for the new model. Queued requests will be processed in order. When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads. -Parallel request processing for a given model results in increasing the context size by the number of parallel requests. For example, a 2K context with 4 parallel requests will result in an 8K context and additional memory allocation. +Parallel request processing for a given model results in increasing the context size by the number of parallel requests. For example, a 2K context with 4 parallel requests will result in an 8K context and additional memory allocation. The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms: -- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 * the number of GPUs or 3 for CPU inference. -- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default is 1, and will handle 1 request per model at a time. +- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 \* the number of GPUs or 3 for CPU inference. 
+- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory. - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512 -Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. +Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. ## How does Ollama load models on multiple GPUs? -When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. +When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. ## How can I enable Flash Attention? -Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server. +Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server. ## How can I set the quantization type for the K/V cache? @@ -324,9 +326,12 @@ The K/V context cache can be quantized to significantly reduce memory usage when To use quantized K/V cache with Ollama you can set the following environment variable: -- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`. +- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`. -> Note: Currently this is a global option - meaning all models will run with the specified quantization type. + + Currently this is a global option - meaning all models will run with the + specified quantization type. + The currently available K/V cache quantization types are: @@ -334,19 +339,40 @@ The currently available K/V cache quantization types are: - `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16). 
- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes. -How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count. +How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count. You may need to experiment with different quantization types to find the best balance between memory usage and quality. -## How can I stop Ollama from starting when I login to my computer +## Where can I find my Ollama Public Key? -Ollama for Windows and macOS register as a login item during installation. You can disable this if you prefer not to have Ollama automatically start. Ollama will respect this setting across upgrades, unless you uninstall the application. +Your **Ollama Public Key** is the public part of the key pair that lets your local Ollama instance talk to [ollama.com](https://ollama.com). -**Windows** -- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk` +You'll need it to: +* Push models to Ollama +* Pull private models from Ollama to your machine +* Run models hosted in [Ollama Cloud](https://ollama.com/cloud) -**MacOS Monterey (v12)** -- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove +### How to Add the Key -**MacOS Ventura (v13) and later** -- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable. +* **Sign-in via the Settings page** in the **Mac** and **Windows App** + +* **Sign‑in via CLI** + +```shell +ollama signin +``` + +* **Manually copy & paste** the key on the **Ollama Keys** page: +[https://ollama.com/settings/keys](https://ollama.com/settings/keys) + +### Where the Ollama Public Key lives + +| OS | Path to `id_ed25519.pub` | +| :- | :- | +| macOS | `~/.ollama/id_ed25519.pub` | +| Linux | `/usr/share/ollama/.ollama/id_ed25519.pub` | +| Windows | `C:\Users\\.ollama\id_ed25519.pub` | + + + Replace <username> with your actual Windows user name. + diff --git a/docs/favicon-dark.svg b/docs/favicon-dark.svg new file mode 100644 index 00000000..672ecd01 --- /dev/null +++ b/docs/favicon-dark.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/favicon.svg b/docs/favicon.svg new file mode 100644 index 00000000..99d6b5e0 --- /dev/null +++ b/docs/favicon.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/gpu.md b/docs/gpu.mdx similarity index 67% rename from docs/gpu.md rename to docs/gpu.mdx index 910f82d1..84ef2a49 100644 --- a/docs/gpu.md +++ b/docs/gpu.mdx @@ -1,39 +1,36 @@ -# GPU +--- +title: Hardware support +--- + ## Nvidia -Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer. + +Ollama supports Nvidia GPUs with compute capability 5.0+. 
Check your compute compatibility to see if your card is supported: [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) -| Compute Capability | Family | Cards | -| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- | -| 12.0 | GeForce RTX 50xx | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090` | -| | NVIDIA Professioal | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell` | -| 11.0 | Jetson | `T4000` `T5000` (Requires driver 580 or newer) | -| 10.3 | NVIDIA Professioal | `B300` `GB300` (Requires driver 580 or newer) | -| 10.0 | NVIDIA Professioal | `B200` `GB200` (Requires driver 580 or newer) | -| 9.0 | NVIDIA | `H200` `H100` `GH200` | -| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` | -| | NVIDIA Professional | `L4` `L40` `RTX 6000` | -| 8.7 | Jetson | `Orin Nano` `Orin NX` `AGX Orin` | -| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050` | -| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` | -| 8.0 | NVIDIA | `A100` `A30` | -| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` | -| | NVIDIA Professional | `T4` `RTX 5000` `RTX 4000` `RTX 3000` `T2000` `T1200` `T1000` `T600` `T500` | -| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` | -| 7.2 | Jetson | `Xavier NX` `AGX Xavier` (Jetpack 5) | -| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` | -| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` | -| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050` | -| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` | -| | Tesla | `P40` `P4` | -| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` | -| 5.2 | GeForce GTX | `GTX TITAN X` `GTX 980 Ti` `GTX 980` `GTX 970` `GTX 960` `GTX 950` | -| | Quadro | `M6000 24GB` `M6000` `M5000` `M5500M` `M4000` `M2200` `M2000` `M620` | -| | Tesla | `M60` `M40` | -| 5.0 | GeForce GTX | `GTX 750 Ti` `GTX 750` `NVS 810` | -| | Quadro | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M` | +| Compute Capability | Family | Cards | +| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| 9.0 | NVIDIA | `H200` `H100` | +| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` | +| | NVIDIA Professional | `L4` `L40` `RTX 6000` | +| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050` | +| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` | +| 8.0 | NVIDIA | `A100` `A30` | +| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` | +| | NVIDIA Professional | `T4` `RTX 5000` `RTX 4000` `RTX 3000` `T2000` `T1200` `T1000` `T600` `T500` | +| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` | +| 7.0 | 
NVIDIA | `TITAN V` `V100` `Quadro GV100` | +| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` | +| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050 Ti` `GTX 1050` | +| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` | +| | Tesla | `P40` `P4` | +| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` | +| 5.2 | GeForce GTX | `GTX TITAN X` `GTX 980 Ti` `GTX 980` `GTX 970` `GTX 960` `GTX 950` | +| | Quadro | `M6000 24GB` `M6000` `M5000` `M5500M` `M4000` `M2200` `M2000` `M620` | +| | Tesla | `M60` `M40` | +| 5.0 | GeForce GTX | `GTX 750 Ti` `GTX 750` `NVS 810` | +| | Quadro | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M` | For building locally to support older GPUs, see [developer.md](./development.md#linux-cuda-nvidia) @@ -48,51 +45,53 @@ ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1") ### Linux Suspend Resume On linux, after a suspend/resume cycle, sometimes Ollama will fail to discover -your NVIDIA GPU, and fallback to running on the CPU. You can workaround this +your NVIDIA GPU, and fallback to running on the CPU. You can workaround this driver bug by reloading the NVIDIA UVM driver with `sudo rmmod nvidia_uvm && sudo modprobe nvidia_uvm` ## AMD Radeon + Ollama supports the following AMD GPUs: ### Linux Support -| Family | Cards and accelerators | -| -------------- | -------------------------------------------------------------------------------------------------------------------- | -| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` | -| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` | -| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` | + +| Family | Cards and accelerators | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` | +| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` | +| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` | ### Windows Support -With ROCm v6.2, the following GPUs are supported on Windows. -| Family | Cards and accelerators | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` | -| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` | +With ROCm v6.1, the following GPUs are supported on Windows. 
-### Known Workarounds - -- The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA +| Family | Cards and accelerators | +| -------------- | ------------------------------------------------------------------------------------------------------------------- | +| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` | +| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` | ### Overrides on Linux + Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In some cases you can force the system to try to use a similar LLVM target that is -close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4) +close. For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4) however, ROCm does not currently support this target. The closest support is -`gfx1030`. You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with -`x.y.z` syntax. So for example, to force the system to run on the RX 5400, you +`gfx1030`. You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with +`x.y.z` syntax. So for example, to force the system to run on the RX 5400, you would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the -server. If you have an unsupported AMD GPU you can experiment using the list of +server. If you have an unsupported AMD GPU you can experiment using the list of supported types below. If you have multiple GPUs with different GFX versions, append the numeric device -number to the environment variable to set them individually. For example, -`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0` +number to the environment variable to set them individually. For example, +`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0` At this time, the known supported GPU types on linux are the following LLVM Targets. This table shows some example GPUs that map to these LLVM targets: | **LLVM Target** | **An Example GPU** | |-----------------|---------------------| +| gfx900 | Radeon RX Vega 56 | +| gfx906 | Radeon Instinct MI50 | | gfx908 | Radeon Instinct MI100 | | gfx90a | Radeon Instinct MI210 | | gfx940 | Radeon Instinct MI300 | @@ -113,15 +112,16 @@ Reach out on [Discord](https://discord.gg/ollama) or file an If you have multiple AMD GPUs in your system and want to limit Ollama to use a subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs. -You can see the list of devices with `rocminfo`. If you want to ignore the GPUs -and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the +You can see the list of devices with `rocminfo`. If you want to ignore the GPUs +and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the `Uuid` to uniquely identify the device instead of numeric value. ### Container Permission In some Linux distributions, SELinux can prevent containers from -accessing the AMD GPU devices. On the host system you can run +accessing the AMD GPU devices. On the host system you can run `sudo setsebool container_use_devices=1` to allow containers to use devices. ### Metal (Apple GPUs) + Ollama supports GPU acceleration on Apple devices via the Metal API. 
diff --git a/docs/images/cline-mcp.png b/docs/images/cline-mcp.png new file mode 100644 index 00000000..9d2c746c Binary files /dev/null and b/docs/images/cline-mcp.png differ diff --git a/docs/images/cline-settings.png b/docs/images/cline-settings.png new file mode 100644 index 00000000..4c5c6158 Binary files /dev/null and b/docs/images/cline-settings.png differ diff --git a/docs/images/codex-mcp.png b/docs/images/codex-mcp.png new file mode 100644 index 00000000..f37c9a15 Binary files /dev/null and b/docs/images/codex-mcp.png differ diff --git a/docs/images/favicon.png b/docs/images/favicon.png new file mode 100644 index 00000000..e1130b23 Binary files /dev/null and b/docs/images/favicon.png differ diff --git a/docs/images/goose-cli.png b/docs/images/goose-cli.png new file mode 100644 index 00000000..89ac37ac Binary files /dev/null and b/docs/images/goose-cli.png differ diff --git a/docs/images/goose-mcp-1.png b/docs/images/goose-mcp-1.png new file mode 100644 index 00000000..6bee203d Binary files /dev/null and b/docs/images/goose-mcp-1.png differ diff --git a/docs/images/goose-mcp-2.png b/docs/images/goose-mcp-2.png new file mode 100644 index 00000000..bfe6d0d2 Binary files /dev/null and b/docs/images/goose-mcp-2.png differ diff --git a/docs/images/goose-settings.png b/docs/images/goose-settings.png new file mode 100644 index 00000000..edac2684 Binary files /dev/null and b/docs/images/goose-settings.png differ diff --git a/docs/images/intellij-chat-sidebar.png b/docs/images/intellij-chat-sidebar.png new file mode 100644 index 00000000..2c24e562 Binary files /dev/null and b/docs/images/intellij-chat-sidebar.png differ diff --git a/docs/images/intellij-current-model.png b/docs/images/intellij-current-model.png new file mode 100644 index 00000000..96c5f2ed Binary files /dev/null and b/docs/images/intellij-current-model.png differ diff --git a/docs/images/intellij-local-models.png b/docs/images/intellij-local-models.png new file mode 100644 index 00000000..846a3786 Binary files /dev/null and b/docs/images/intellij-local-models.png differ diff --git a/docs/images/logo-dark.png b/docs/images/logo-dark.png new file mode 100644 index 00000000..e50ee0dc Binary files /dev/null and b/docs/images/logo-dark.png differ diff --git a/docs/images/logo.png b/docs/images/logo.png new file mode 100644 index 00000000..827de1b9 Binary files /dev/null and b/docs/images/logo.png differ diff --git a/docs/images/n8n-chat-model.png b/docs/images/n8n-chat-model.png new file mode 100644 index 00000000..cafbc7a8 Binary files /dev/null and b/docs/images/n8n-chat-model.png differ diff --git a/docs/images/n8n-chat-node.png b/docs/images/n8n-chat-node.png new file mode 100644 index 00000000..89768e20 Binary files /dev/null and b/docs/images/n8n-chat-node.png differ diff --git a/docs/images/n8n-credential-creation.png b/docs/images/n8n-credential-creation.png new file mode 100644 index 00000000..1eeb5010 Binary files /dev/null and b/docs/images/n8n-credential-creation.png differ diff --git a/docs/images/n8n-models.png b/docs/images/n8n-models.png new file mode 100644 index 00000000..c1c70aca Binary files /dev/null and b/docs/images/n8n-models.png differ diff --git a/docs/images/n8n-ollama-form.png b/docs/images/n8n-ollama-form.png new file mode 100644 index 00000000..2f9174de Binary files /dev/null and b/docs/images/n8n-ollama-form.png differ diff --git a/docs/images/ollama-settings.png b/docs/images/ollama-settings.png new file mode 100644 index 00000000..a3470f7a Binary files /dev/null and 
b/docs/images/ollama-settings.png differ diff --git a/docs/images/vscode-model-options.png b/docs/images/vscode-model-options.png new file mode 100644 index 00000000..b1cca5d0 Binary files /dev/null and b/docs/images/vscode-model-options.png differ diff --git a/docs/images/vscode-models.png b/docs/images/vscode-models.png new file mode 100644 index 00000000..af250eac Binary files /dev/null and b/docs/images/vscode-models.png differ diff --git a/docs/images/vscode-sidebar.png b/docs/images/vscode-sidebar.png new file mode 100644 index 00000000..aa4a0735 Binary files /dev/null and b/docs/images/vscode-sidebar.png differ diff --git a/docs/images/welcome.png b/docs/images/welcome.png new file mode 100644 index 00000000..88ce37b2 Binary files /dev/null and b/docs/images/welcome.png differ diff --git a/docs/images/xcode-chat-icon.png b/docs/images/xcode-chat-icon.png new file mode 100644 index 00000000..3396a8a0 Binary files /dev/null and b/docs/images/xcode-chat-icon.png differ diff --git a/docs/images/xcode-intelligence-window.png b/docs/images/xcode-intelligence-window.png new file mode 100644 index 00000000..599d2f8b Binary files /dev/null and b/docs/images/xcode-intelligence-window.png differ diff --git a/docs/images/xcode-locally-hosted.png b/docs/images/xcode-locally-hosted.png new file mode 100644 index 00000000..e8efd7db Binary files /dev/null and b/docs/images/xcode-locally-hosted.png differ diff --git a/docs/images/zed-ollama-dropdown.png b/docs/images/zed-ollama-dropdown.png new file mode 100644 index 00000000..7cacd158 Binary files /dev/null and b/docs/images/zed-ollama-dropdown.png differ diff --git a/docs/images/zed-settings.png b/docs/images/zed-settings.png new file mode 100644 index 00000000..913882b2 Binary files /dev/null and b/docs/images/zed-settings.png differ diff --git a/docs/import.md b/docs/import.mdx similarity index 77% rename from docs/import.md rename to docs/import.mdx index 104b4162..b1959689 100644 --- a/docs/import.md +++ b/docs/import.mdx @@ -1,11 +1,13 @@ -# Importing a model +--- +title: Importing a Model +--- ## Table of Contents - * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights) - * [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights) - * [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter) - * [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom) +- [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights) +- [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights) +- [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter) +- [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom) ## Importing a fine tuned adapter from Safetensors weights @@ -32,16 +34,15 @@ ollama run my-model Ollama supports importing adapters based on several different model architectures including: - * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2); - * Mistral (including Mistral 1, Mistral 2, and Mixtral); and - * Gemma (including Gemma 1 and Gemma 2) +- Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2); +- Mistral (including Mistral 1, Mistral 2, and Mixtral); and +- Gemma (including Gemma 1 and Gemma 2) You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as: - * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training) - * [Unsloth](https://github.com/unslothai/unsloth) - * 
[MLX](https://github.com/ml-explore/mlx) - +- Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training) +- [Unsloth](https://github.com/unslothai/unsloth) +- [MLX](https://github.com/ml-explore/mlx) ## Importing a model from Safetensors weights @@ -53,8 +54,6 @@ FROM /path/to/safetensors/directory If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`. -If you do not create the Modelfile, ollama will act as if there was a Modelfile with the command `FROM .`. - Now run the `ollama create` command from the directory where you created the `Modelfile`: ```shell @@ -69,19 +68,20 @@ ollama run my-model Ollama supports importing models for several different architectures including: - * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2); - * Mistral (including Mistral 1, Mistral 2, and Mixtral); - * Gemma (including Gemma 1 and Gemma 2); and - * Phi3 +- Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2); +- Mistral (including Mistral 1, Mistral 2, and Mixtral); +- Gemma (including Gemma 1 and Gemma 2); and +- Phi3 This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model. + ## Importing a GGUF based model or adapter If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by: - * converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp; - * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or - * downloading a model or adapter from a place such as HuggingFace +- converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp; +- converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or +- downloading a model or adapter from a place such as HuggingFace To import a GGUF model, create a `Modelfile` containing: @@ -98,9 +98,9 @@ ADAPTER /path/to/file.gguf When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use: - * a model from Ollama - * a GGUF file - * a Safetensors based model +- a model from Ollama +- a GGUF file +- a Safetensors based model Once you have created your `Modelfile`, use the `ollama create` command to build the model. @@ -134,13 +134,22 @@ success ### Supported Quantizations +- `q4_0` +- `q4_1` +- `q5_0` +- `q5_1` - `q8_0` #### K-means Quantizations +- `q3_K_S` +- `q3_K_M` +- `q3_K_L` - `q4_K_S` - `q4_K_M` - +- `q5_K_S` +- `q5_K_M` +- `q6_K` ## Sharing your model on ollama.com @@ -148,7 +157,7 @@ You can share any model you have created by pushing it to [ollama.com](https://o First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step. -Sign-Up +Sign-Up The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected. @@ -156,7 +165,7 @@ Now that you have created an account and are signed-in, go to the [Ollama Keys S Follow the directions on the page to determine where your Ollama Public Key is located. -Ollama Keys +Ollama Keys Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field. 
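For the copy-and-paste step above, you can print the key from the path listed in the FAQ's public-key table; a minimal sketch for macOS (the Linux and Windows paths differ, see that table):

```shell
# Print the Ollama public key so it can be pasted into ollama.com/settings/keys
cat ~/.ollama/id_ed25519.pub
```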
@@ -173,4 +182,3 @@ Once your model has been pushed, other users can pull and run it by using the co ```shell ollama run myuser/mymodel ``` - diff --git a/docs/index.mdx b/docs/index.mdx new file mode 100644 index 00000000..669d30cf --- /dev/null +++ b/docs/index.mdx @@ -0,0 +1,58 @@ +--- +title: Ollama's documentation +sidebarTitle: Welcome +--- + + + +[Ollama](https://ollama.com) is the easiest way to get up and running with large language models such as gpt-oss, Gemma 3, DeepSeek-R1, Qwen3 and more. + + + + Get up and running with your first model + + + Download Ollama on macOS, Windows or Linux + + + Ollama's cloud models offer larger models with better performance. + + + View Ollama's API reference + + + +## Libraries + + + + The official library for using Ollama with Python + + + + The official library for using Ollama with JavaScript or TypeScript. + + + View a list of 20+ community-supported libraries for Ollama + + + +## Community + + + + Join our Discord community + + + + Join our Reddit community + + diff --git a/docs/integrations/cline.mdx b/docs/integrations/cline.mdx new file mode 100644 index 00000000..371fc628 --- /dev/null +++ b/docs/integrations/cline.mdx @@ -0,0 +1,38 @@ +--- +title: Cline +--- + +## Install + +Install [Cline](https://docs.cline.bot/getting-started/installing-cline) in your IDE. + + +## Usage with Ollama + +1. Open Cline settings > `API Configuration` and set `API Provider` to `Ollama` +2. Select a model under `Model` or type one (e.g. `qwen3`) +3. Update the context window to at least 32K tokens under `Context Window` + +Coding tools require a larger context window. It is recommended to use a context window of at least 32K tokens. See [Context length](/context-length) for more information. + +
+ Cline settings configuration showing API Provider set to Ollama +
+ + + +## Connecting to ollama.com +1. Create an [API key](https://ollama.com/settings/keys) from ollama.com +2. Click on `Use custom base URL` and set it to `https://ollama.com` +3. Enter your **Ollama API Key** +4. Select a model from the list + + +### Recommended Models + +- `qwen3-coder:480b` +- `deepseek-v3.1:671b` diff --git a/docs/integrations/codex.mdx b/docs/integrations/codex.mdx new file mode 100644 index 00000000..f9df1b85 --- /dev/null +++ b/docs/integrations/codex.mdx @@ -0,0 +1,56 @@ +--- +title: Codex +--- + + +## Install + +Install the [Codex CLI](https://developers.openai.com/codex/cli/): + +``` +npm install -g @openai/codex +``` + +## Usage with Ollama + +Codex requires a larger context window. It is recommended to use a context window of at least 32K tokens. + +To use `codex` with Ollama, use the `--oss` flag: + +``` +codex --oss +``` + +### Changing Models + +By default, codex will use the local `gpt-oss:20b` model. However, you can specify a different model with the `-m` flag: + +``` +codex --oss -m gpt-oss:120b +``` + +### Cloud Models + +``` +codex --oss -m gpt-oss:120b-cloud +``` + + +## Connecting to ollama.com + + +Create an [API key](https://ollama.com/settings/keys) from ollama.com and export it as `OLLAMA_API_KEY`. + +To use ollama.com directly, edit your `~/.codex/config.toml` file to point to ollama.com. + +```toml +model = "gpt-oss:120b" +model_provider = "ollama" + +[model_providers.ollama] +name = "Ollama" +base_url = "https://ollama.com/v1" +env_key = "OLLAMA_API_KEY" +``` + +Run `codex` in a new terminal to load the new settings. diff --git a/docs/integrations/droid.mdx b/docs/integrations/droid.mdx new file mode 100644 index 00000000..b1ba3771 --- /dev/null +++ b/docs/integrations/droid.mdx @@ -0,0 +1,76 @@ +--- +title: Droid +--- + + +## Install + +Install the [Droid CLI](https://factory.ai/): + +```bash +curl -fsSL https://app.factory.ai/cli | sh +``` + +Droid requires a larger context window. It is recommended to use a context window of at least 32K tokens. See [Context length](/context-length) for more information. + +## Usage with Ollama + +Add a local configuration block to `~/.factory/config.json`: + +```json +{ + "custom_models": [ + { + "model_display_name": "qwen3-coder [Ollama]", + "model": "qwen3-coder", + "base_url": "http://localhost:11434/v1/", + "api_key": "not-needed", + "provider": "generic-chat-completion-api", + "max_tokens": 32000 + } + ] +} +``` + + +## Cloud Models +`qwen3-coder:480b-cloud` is the recommended model for use with Droid. + +Add the cloud configuration block to `~/.factory/config.json`: + +```json +{ + "custom_models": [ + { + "model_display_name": "qwen3-coder [Ollama Cloud]", + "model": "qwen3-coder:480b-cloud", + "base_url": "http://localhost:11434/v1/", + "api_key": "not-needed", + "provider": "generic-chat-completion-api", + "max_tokens": 128000 + } + ] +} +``` + +## Connecting to ollama.com + +1. Create an [API key](https://ollama.com/settings/keys) from ollama.com and export it as `OLLAMA_API_KEY`. +2. Add the cloud configuration block to `~/.factory/config.json`: + + ```json + { + "custom_models": [ + { + "model_display_name": "qwen3-coder [Ollama Cloud]", + "model": "qwen3-coder:480b", + "base_url": "https://ollama.com/v1/", + "api_key": "OLLAMA_API_KEY", + "provider": "generic-chat-completion-api", + "max_tokens": 128000 + } + ] + } + ``` + +Run `droid` in a new terminal to load the new settings. 
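Before launching `droid` against the cloud configuration above, it can help to confirm the key and endpoint work on their own. This is only a sanity check, not part of the Droid setup itself; it assumes `OLLAMA_API_KEY` is exported and uses the same OpenAI-compatible `/v1` path the config points at:

```shell
# Send a minimal chat request to ollama.com's OpenAI-compatible endpoint
curl https://ollama.com/v1/chat/completions \
  -H "Authorization: Bearer $OLLAMA_API_KEY" \
  -d '{
    "model": "qwen3-coder:480b",
    "messages": [{"role": "user", "content": "Say hello"}]
  }'
```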
\ No newline at end of file diff --git a/docs/integrations/goose.mdx b/docs/integrations/goose.mdx new file mode 100644 index 00000000..35099a3b --- /dev/null +++ b/docs/integrations/goose.mdx @@ -0,0 +1,49 @@ +--- +title: Goose +--- + +## Goose Desktop + +Install [Goose](https://block.github.io/goose/docs/getting-started/installation/) Desktop. + +### Usage with Ollama +1. In Goose, open **Settings** → **Configure Provider**. +
+ Goose settings Panel +
+2. Find **Ollama**, click **Configure** +3. Confirm **API Host** is `http://localhost:11434` and click Submit + + +### Connecting to ollama.com + +1. Create an [API key](https://ollama.com/settings/keys) on ollama.com and save it in your `.env` +2. In Goose, set **API Host** to `https://ollama.com` + + +## Goose CLI + +Install [Goose](https://block.github.io/goose/docs/getting-started/installation/) CLI + +### Usage with Ollama +1. Run `goose configure` +2. Select **Configure Providers** and select **Ollama** +
+ Goose CLI +
+3. Enter model name (e.g `qwen3`) + +### Connecting to ollama.com + +1. Create an [API key](https://ollama.com/settings/keys) on ollama.com and save it in your `.env` +2. Run `goose configure` +3. Select **Configure Providers** and select **Ollama** +4. Update **OLLAMA_HOST** to `https://ollama.com` diff --git a/docs/integrations/jetbrains.mdx b/docs/integrations/jetbrains.mdx new file mode 100644 index 00000000..29fbd95b --- /dev/null +++ b/docs/integrations/jetbrains.mdx @@ -0,0 +1,47 @@ +--- +title: JetBrains +--- + +This example uses **IntelliJ**; same steps apply to other JetBrains IDEs (e.g., PyCharm). + +## Install + +Install [IntelliJ](https://www.jetbrains.com/idea/). + +## Usage with Ollama + + + To use **Ollama**, you will need a [JetBrains AI Subscription](https://www.jetbrains.com/ai-ides/buy/?section=personal&billing=yearly). + + +1. In Intellij, click the **chat icon** located in the right sidebar + +
+ Intellij Sidebar Chat +
+ +2. Select the **current model** in the sidebar, then click **Set up Local Models** + +
+ 
  IntelliJ current model selector in the chat sidebar
+ +3. Under **Third Party AI Providers**, choose **Ollama** +4. Confirm the **Host URL** is `http://localhost:11434`, then click **Ok** +5. Once connected, select a model under **Local models by Ollama** + +
+ 
  IntelliJ local models listed under Local models by Ollama
diff --git a/docs/integrations/n8n.mdx b/docs/integrations/n8n.mdx new file mode 100644 index 00000000..c58967fa --- /dev/null +++ b/docs/integrations/n8n.mdx @@ -0,0 +1,53 @@ +--- +title: n8n +--- + +## Install + +Install [n8n](https://docs.n8n.io/choose-n8n/). + +## Using Ollama Locally + +1. In the top right corner, click the dropdown and select **Create Credential** +
+ Create a n8n Credential +
+ +2. Under **Add new credential** select **Ollama** +
+ Select Ollama under Credential +
+3. Confirm Base URL is set to `http://localhost:11434` and click **Save** + If connecting to `http://localhost:11434` fails, use `http://127.0.0.1:11434` +4. When creating a new workflow, select **Add a first step** and select an **Ollama node** +
+ Add a first step with Ollama node +
+5. Select your model of choice (e.g. `qwen3-coder`) +
+ Set up Ollama credentials +
+ +## Connecting to ollama.com +1. Create an [API key](https://ollama.com/settings/keys) on **ollama.com**. +2. In n8n, click **Create Credential** and select **Ollama** +4. Set the **API URL** to `https://ollama.com` +5. Enter your **API Key** and click **Save** + + diff --git a/docs/integrations/roo-code.mdx b/docs/integrations/roo-code.mdx new file mode 100644 index 00000000..61c91a71 --- /dev/null +++ b/docs/integrations/roo-code.mdx @@ -0,0 +1,30 @@ +--- +title: Roo Code +--- + + +## Install + +Install [Roo Code](https://marketplace.visualstudio.com/items?itemName=RooVeterinaryInc.roo-cline) from the VS Code Marketplace. + +## Usage with Ollama + +1. Open Roo Code in VS Code and click the **gear icon** on the top right corner of the Roo Code window to open **Provider Settings** +2. Set `API Provider` to `Ollama` +3. (Optional) Update `Base URL` if your Ollama instance is running remotely. The default is `http://localhost:11434` +4. Enter a valid `Model ID` (for example `qwen3` or `qwen3-coder:480b-cloud`) +5. Adjust the `Context Window` to at least 32K tokens for coding tasks + +Coding tools require a larger context window. It is recommended to use a context window of at least 32K tokens. See [Context length](/context-length) for more information. + +## Connecting to ollama.com + +1. Create an [API key](https://ollama.com/settings/keys) from ollama.com +2. Enable `Use custom base URL` and set it to `https://ollama.com` +3. Enter your **Ollama API Key** +4. Select a model from the list + +### Recommended Models + +- `qwen3-coder:480b` +- `deepseek-v3.1:671b` diff --git a/docs/integrations/vscode.mdx b/docs/integrations/vscode.mdx new file mode 100644 index 00000000..c68f9199 --- /dev/null +++ b/docs/integrations/vscode.mdx @@ -0,0 +1,34 @@ +--- +title: VS Code +--- + +## Install + +Install [VSCode](https://code.visualstudio.com/download). + +## Usage with Ollama + +1. Open Copilot side bar found in top right window +
+ VSCode chat Sidebar +
+2. Select the model dropdown > **Manage models** +
+ VSCode model picker +
+3. Choose **Ollama** in the provider dropdown and select the desired models (e.g. `qwen3`, `qwen3-coder:480b-cloud`) +
+ VSCode model options dropdown +
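The model IDs selected in step 3 have to be available to the local server: regular models must be pulled, and (assuming a recent Ollama build with `ollama signin`) the `-cloud` variants require an ollama.com account. A rough sketch:

```shell
# Local model
ollama pull qwen3

# Cloud-backed model (assumption: requires signing in to ollama.com first)
ollama signin
ollama run qwen3-coder:480b-cloud "hello"
```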
diff --git a/docs/integrations/xcode.mdx b/docs/integrations/xcode.mdx new file mode 100644 index 00000000..7d10317a --- /dev/null +++ b/docs/integrations/xcode.mdx @@ -0,0 +1,45 @@ +--- +title: Xcode +--- + +## Install + +Install [XCode](https://developer.apple.com/xcode/) + + +## Usage with Ollama + Ensure Apple Intelligence is setup and the latest XCode version is v26.0 + +1. Click **XCode** in top left corner > **Settings** +
+ Xcode Intelligence window +
+ +2. Select **Locally Hosted**, enter port **11434** and click **Add** +
+ Xcode settings +
+ +3. Select the **star icon** in the top-left corner and click the **dropdown** +
+ Xcode settings +
+4. Click **My Account** and select your desired model + + +## Connecting to ollama.com directly +1. Create an [API key](https://ollama.com/settings/keys) from ollama.com +2. Select **Internet Hosted** and enter URL as `https://ollama.com` +3. Enter your **Ollama API Key** and click **Add** \ No newline at end of file diff --git a/docs/integrations/zed.mdx b/docs/integrations/zed.mdx new file mode 100644 index 00000000..478d3bc8 --- /dev/null +++ b/docs/integrations/zed.mdx @@ -0,0 +1,38 @@ +--- +title: Zed +--- + +## Install + +Install [Zed](https://zed.dev/download). + +## Usage with Ollama + +1. In Zed, click the **star icon** in the bottom-right corner, then select **Configure**. + +
+ Zed star icon in bottom right corner +
+ +2. Under **LLM Providers**, choose **Ollama** +3. Confirm the **Host URL** is `http://localhost:11434`, then click **Connect** +4. Once connected, select a model under **Ollama** + +
+ Zed model selection under Ollama +
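Zed connects to whatever address the Ollama server is listening on, so the Host URL from step 3 only needs to change when the server is not on the default port. A sketch, assuming a server deliberately bound elsewhere via `OLLAMA_HOST`:

```shell
# Assumption: serving on a non-default address
OLLAMA_HOST=127.0.0.1:11500 ollama serve &

# Zed's Host URL would then be http://127.0.0.1:11500
curl http://127.0.0.1:11500/api/version
```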
+ +## Connecting to ollama.com +1. Create an [API key](https://ollama.com/settings/keys) on **ollama.com** +2. In Zed, open the **star icon** → **Configure** +3. Under **LLM Providers**, select **Ollama** +4. Set the **API URL** to `https://ollama.com` + diff --git a/docs/linux.md b/docs/linux.mdx similarity index 70% rename from docs/linux.md rename to docs/linux.mdx index ce5ed860..c40ab054 100644 --- a/docs/linux.md +++ b/docs/linux.mdx @@ -1,4 +1,6 @@ -# Linux +--- +title: Linux +--- ## Install @@ -10,15 +12,16 @@ curl -fsSL https://ollama.com/install.sh | sh ## Manual install -> [!NOTE] -> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first. + + If you are upgrading from a prior version, you should remove the old libraries + with `sudo rm -rf /usr/lib/ollama` first. + Download and extract the package: ```shell -curl -LO https://ollama.com/download/ollama-linux-amd64.tgz -sudo rm -rf /usr/lib/ollama -sudo tar -C /usr -xzf ollama-linux-amd64.tgz +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \ + | sudo tar zx -C /usr ``` Start Ollama: @@ -35,15 +38,11 @@ ollama -v ### AMD GPU install -If you have an AMD GPU, **also** download and extract the additional ROCm package: - -> [!IMPORTANT] -> The ROCm tgz contains only AMD dependent libraries. You must extract **both** `ollama-linux-amd64.tgz` and `ollama-linux-amd64-rocm.tgz` into the same location. - +If you have an AMD GPU, also download and extract the additional ROCm package: ```shell -curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz -sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz +curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \ + | sudo tar zx -C /usr ``` ### ARM64 install @@ -51,8 +50,8 @@ sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz Download and extract the ARM64-specific package: ```shell -curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz -sudo tar -C /usr -xzf ollama-linux-arm64.tgz +curl -fsSL https://ollama.com/download/ollama-linux-arm64.tgz \ + | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -113,12 +112,13 @@ sudo systemctl start ollama sudo systemctl status ollama ``` -> [!NOTE] -> While AMD has contributed the `amdgpu` driver upstream to the official linux -> kernel source, the version is older and may not support all ROCm features. We -> recommend you install the latest driver from -> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support -> of your Radeon GPU. + + While AMD has contributed the `amdgpu` driver upstream to the official linux + kernel source, the version is older and may not support all ROCm features. We + recommend you install the latest driver from + https://www.amd.com/en/support/linux-drivers for best support of your Radeon + GPU. 
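For a systemd-managed install, one way to confirm that the ROCm libraries and driver are actually being used is to load a model and then look for GPU-related lines in the service log. A hedged example (log wording varies between releases):

```shell
# Load a small model, then scan the service log for GPU/ROCm messages
ollama run llama3.2 "hi" > /dev/null
journalctl -u ollama --no-pager | grep -iE "rocm|gpu|vram" | tail -n 20
```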
+ ## Customizing @@ -146,8 +146,8 @@ curl -fsSL https://ollama.com/install.sh | sh Or by re-downloading Ollama: ```shell -curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz -sudo tar -C /usr -xzf ollama-linux-amd64.tgz +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \ + | sudo tar zx -C /usr ``` ## Installing specific versions @@ -178,6 +178,12 @@ sudo systemctl disable ollama sudo rm /etc/systemd/system/ollama.service ``` +Remove ollama libraries from your lib directory (either `/usr/local/lib`, `/usr/lib`, or `/lib`): + +```shell +sudo rm -r $(which ollama | tr 'bin' 'lib') +``` + Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`): ```shell @@ -187,13 +193,7 @@ sudo rm $(which ollama) Remove the downloaded models and Ollama service user and group: ```shell -sudo rm -r /usr/share/ollama sudo userdel ollama sudo groupdel ollama -``` - -Remove installed libraries: - -```shell -sudo rm -rf /usr/local/lib/ollama +sudo rm -r /usr/share/ollama ``` diff --git a/docs/logo.svg b/docs/logo.svg new file mode 100644 index 00000000..2b410d09 --- /dev/null +++ b/docs/logo.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/macos.md b/docs/macos.mdx similarity index 98% rename from docs/macos.md rename to docs/macos.mdx index 26fb23c7..bb92234c 100644 --- a/docs/macos.md +++ b/docs/macos.mdx @@ -1,4 +1,6 @@ -# Ollama for macOS +--- +title: macOS +--- ## System Requirements diff --git a/docs/modelfile.md b/docs/modelfile.mdx similarity index 64% rename from docs/modelfile.md rename to docs/modelfile.mdx index 53a21714..c91d7310 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.mdx @@ -1,9 +1,8 @@ -# Ollama Model File +--- +title: Modelfile Reference +--- -> [!NOTE] -> `Modelfile` syntax is in development - -A model file is the blueprint to create and share models with Ollama. +A Modelfile is the blueprint to create and share customized models using Ollama. 
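For orientation, a small end-to-end sketch: write a Modelfile, build a model from it, and run the result. The base model and parameter values here are only examples:

```shell
cat > Modelfile <<'EOF'
FROM llama3.2
PARAMETER temperature 0.7
SYSTEM You are a concise assistant that answers in one short paragraph.
EOF

ollama create my-assistant -f Modelfile
ollama run my-assistant "Summarize what a Modelfile is."
```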
## Table of Contents @@ -73,26 +72,23 @@ To view the Modelfile of a given model, use the `ollama show --modelfile` comman ollama show --modelfile llama3.2 ``` -> **Output**: -> -> ``` -> # Modelfile generated by "ollama show" -> # To build a new Modelfile based on this one, replace the FROM line with: -> # FROM llama3.2:latest -> FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29 -> TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> -> -> {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> -> -> {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> -> -> {{ .Response }}<|eot_id|>""" -> PARAMETER stop "<|start_header_id|>" -> PARAMETER stop "<|end_header_id|>" -> PARAMETER stop "<|eot_id|>" -> PARAMETER stop "<|reserved_special_token" -> ``` +``` +# Modelfile generated by "ollama show" +# To build a new Modelfile based on this one, replace the FROM line with: +# FROM llama3.2:latest +FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29 +TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> +{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> + +{{ .Response }}<|eot_id|>""" +PARAMETER stop "<|start_header_id|>" +PARAMETER stop "<|end_header_id|>" +PARAMETER stop "<|eot_id|>" +PARAMETER stop "<|reserved_special_token" +``` ## Instructions @@ -110,10 +106,13 @@ FROM : FROM llama3.2 ``` -A list of available base models: - -Additional models can be found at: - + + A list of available base models + + + + Additional models can be found at + #### Build from a Safetensors model @@ -124,10 +123,11 @@ FROM The model directory should contain the Safetensors weights for a supported architecture. Currently supported model architectures: - * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2) - * Mistral (including Mistral 1, Mistral 2, and Mixtral) - * Gemma (including Gemma 1 and Gemma 2) - * Phi3 + +- Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2) +- Mistral (including Mistral 1, Mistral 2, and Mixtral) +- Gemma (including Gemma 1 and Gemma 2) +- Phi3 #### Build from a GGUF file @@ -137,7 +137,6 @@ FROM ./ollama-model.gguf The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location. - ### PARAMETER The `PARAMETER` instruction defines a parameter that can be set when the model is run. @@ -148,18 +147,21 @@ PARAMETER #### Valid Parameters and Values -| Parameter | Description | Value Type | Example Usage | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- | -| num_ctx | Sets the size of the context window used to generate the next token. (Default: 4096) | int | num_ctx 4096 | -| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 | -| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. 
(Default: 1.1) | float | repeat_penalty 1.1 | -| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 | -| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 | -| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" | -| num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 | -| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 | -| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 | -| min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 | +| Parameter | Description | Value Type | Example Usage | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- | +| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 | +| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 | +| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 | +| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 | +| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 | +| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 | +| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 | +| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. 
(Default: 0) | int | seed 42 | +| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" | +| num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 | +| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 | +| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 | +| min_p | Alternative to the top*p, and aims to ensure a balance of quality and variety. The parameter \_p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with _p_=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 | ### TEMPLATE @@ -201,9 +203,10 @@ ADAPTER ``` Currently supported Safetensor adapters: - * Llama (including Llama 2, Llama 3, and Llama 3.1) - * Mistral (including Mistral 1, Mistral 2, and Mixtral) - * Gemma (including Gemma 1 and Gemma 2) + +- Llama (including Llama 2, Llama 3, and Llama 3.1) +- Mistral (including Mistral 1, Mistral 2, and Mixtral) +- Gemma (including Gemma 1 and Gemma 2) #### GGUF adapter @@ -237,7 +240,6 @@ MESSAGE | user | An example message of what the user could have asked. | | assistant | An example message of how the model should respond. | - #### Example conversation ``` @@ -249,7 +251,6 @@ MESSAGE user Is Ontario in Canada? MESSAGE assistant yes ``` - ## Notes - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments. 
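Values set with `PARAMETER` act as the model's defaults; the same keys can still be overridden per request through the API's `options` field. A small illustration, using the hypothetical `my-assistant` model from the sketch above:

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "my-assistant",
  "prompt": "Write a haiku about the sea.",
  "options": { "temperature": 0.2, "num_ctx": 4096, "stop": ["\n\n"] }
}'
```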
diff --git a/docs/ollama-logo.svg b/docs/ollama-logo.svg new file mode 100644 index 00000000..b215c89b --- /dev/null +++ b/docs/ollama-logo.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/ollama.png b/docs/ollama.png new file mode 100644 index 00000000..8cd2cf1e Binary files /dev/null and b/docs/ollama.png differ diff --git a/docs/openapi.yaml b/docs/openapi.yaml new file mode 100644 index 00000000..28e4eed6 --- /dev/null +++ b/docs/openapi.yaml @@ -0,0 +1,1413 @@ +openapi: 3.1.0 +info: + title: Ollama API + version: 0.1.0 + description: | + OpenAPI specification for the Ollama HTTP API + +servers: + - url: http://localhost:11434 + description: Local Ollama instance +components: + securitySchemes: + bearerAuth: + type: http + scheme: bearer + bearerFormat: API Key + parameters: + DigestParam: + name: digest + in: path + required: true + description: SHA256 digest identifier, prefixed with `sha256:` + schema: + type: string + schemas: + ModelOptions: + type: object + description: Runtime options that control text generation + properties: + # Sampling Options + seed: + type: integer + description: Random seed used for reproducible outputs + temperature: + type: number + format: float + description: Controls randomness in generation (higher = more random) + top_k: + type: integer + description: Limits next token selection to the K most likely + top_p: + type: number + format: float + description: Cumulative probability threshold for nucleus sampling + min_p: + type: number + format: float + description: Minimum probability threshold for token selection + stop: + oneOf: + - type: string + - type: array + items: + type: string + description: Stop sequences that will halt generation + + # Runtime Options + num_ctx: + type: integer + description: Context length size (number of tokens) + num_predict: + type: integer + description: Maximum number of tokens to generate + additionalProperties: true + GenerateRequest: + type: object + required: [model] + properties: + model: + type: string + description: Model name + prompt: + type: string + description: Text for the model to generate a response from + suffix: + type: string + description: Used for fill-in-the-middle models, text that appears after the user prompt and before the model response + images: + type: array + items: + type: string + description: Base64-encoded images for models that support image input + format: + description: Structured output format for the model to generate a response from. Supports either the string `"json"` or a JSON schema object. 
+ oneOf: + - type: string + - type: object + system: + description: System prompt for the model to generate a response from + type: string + stream: + description: When true, returns a stream of partial responses + type: boolean + default: true + think: + type: boolean + description: When true, returns separate thinking output in addition to content + raw: + type: boolean + description: When true, returns the raw response from the model without any prompt templating + keep_alive: + oneOf: + - type: string + - type: number + description: Model keep-alive duration (for example `5m` or `0` to unload immediately) + options: + $ref: "#/components/schemas/ModelOptions" + GenerateResponse: + type: object + properties: + model: + type: string + description: Model name + created_at: + type: string + description: ISO 8601 timestamp of response creation + response: + type: string + description: The model's generated text response + thinking: + type: string + description: The model's generated thinking output + done: + type: boolean + description: Indicates whether generation has finished + done_reason: + type: string + description: Reason the generation stopped + total_duration: + type: integer + description: Time spent generating the response in nanoseconds + load_duration: + type: integer + description: Time spent loading the model in nanoseconds + prompt_eval_count: + type: integer + description: Number of input tokens in the prompt + prompt_eval_duration: + type: integer + description: Time spent evaluating the prompt in nanoseconds + eval_count: + type: integer + description: Number of output tokens generated in the response + eval_duration: + type: integer + description: Time spent generating tokens in nanoseconds + GenerateStreamEvent: + type: object + properties: + model: + type: string + description: Model name + created_at: + type: string + description: ISO 8601 timestamp of response creation + response: + type: string + description: The model's generated text response for this chunk + thinking: + type: string + description: The model's generated thinking output for this chunk + done: + type: boolean + description: Indicates whether the stream has finished + done_reason: + type: string + description: Reason streaming finished + total_duration: + type: integer + description: Time spent generating the response in nanoseconds + load_duration: + type: integer + description: Time spent loading the model in nanoseconds + prompt_eval_count: + type: integer + description: Number of input tokens in the prompt + prompt_eval_duration: + type: integer + description: Time spent evaluating the prompt in nanoseconds + eval_count: + type: integer + description: Number of output tokens generated in the response + eval_duration: + type: integer + description: Time spent generating tokens in nanoseconds + ChatMessage: + type: object + required: [role, content] + properties: + role: + type: string + enum: [system, user, assistant, tool] + description: Author of the message. 
+ content: + type: string + description: Message text content + images: + type: array + items: + type: string + description: Base64-encoded image content + description: Optional list of inline images for multimodal models + tool_calls: + type: array + items: + $ref: "#/components/schemas/ToolCall" + description: Tool call requests produced by the model + ToolCall: + type: object + properties: + function: + type: object + required: [name] + properties: + name: + type: string + description: Name of the function to call + description: + type: string + description: What the function does + arguments: + type: object + description: JSON object of arguments to pass to the function + ToolDefinition: + type: object + required: [type, function] + properties: + type: + type: string + enum: [function] + description: Type of tool (always `function`) + function: + type: object + required: [name, parameters] + properties: + name: + type: string + description: Function name exposed to the model + description: + type: string + description: Human-readable description of the function + parameters: + type: object + description: JSON Schema for the function parameters + ChatRequest: + type: object + required: [model, messages] + properties: + model: + type: string + description: Model name + messages: + type: array + description: Chat history as an array of message objects (each with a role and content) + items: + $ref: "#/components/schemas/ChatMessage" + tools: + type: array + description: Optional list of function tools the model may call during the chat + items: + $ref: "#/components/schemas/ToolDefinition" + format: + oneOf: + - type: string + enum: [json] + - type: object + description: Format to return a response in. Can be `json` or a JSON schema + options: + $ref: "#/components/schemas/ModelOptions" + stream: + type: boolean + default: true + think: + type: boolean + description: When true, returns separate thinking output in addition to content + keep_alive: + oneOf: + - type: string + - type: number + description: Model keep-alive duration (for example `5m` or `0` to unload immediately) + ChatResponse: + type: object + properties: + model: + type: string + description: Model name used to generate this message + created_at: + type: string + format: date-time + description: Timestamp of response creation (ISO 8601) + message: + type: object + properties: + role: + type: string + enum: [assistant] + description: Always `assistant` for model responses + content: + type: string + description: Assistant message text + thinking: + type: string + description: Optional deliberate thinking trace when `think` is enabled + tool_calls: + type: array + items: + $ref: "#/components/schemas/ToolCall" + description: Tool calls requested by the assistant + images: + type: array + items: + type: string + nullable: true + description: Optional base64-encoded images in the response + done: + type: boolean + description: Indicates whether the chat response has finished + done_reason: + type: string + description: Reason the response finished + total_duration: + type: integer + description: Total time spent generating in nanoseconds + load_duration: + type: integer + description: Time spent loading the model in nanoseconds + prompt_eval_count: + type: integer + description: Number of tokens in the prompt + prompt_eval_duration: + type: integer + description: Time spent evaluating the prompt in nanoseconds + eval_count: + type: integer + description: Number of tokens generated in the response + eval_duration: + type: 
integer + description: Time spent generating tokens in nanoseconds + ChatStreamEvent: + type: object + properties: + model: + type: string + description: Model name used for this stream event + created_at: + type: string + format: date-time + description: When this chunk was created (ISO 8601) + message: + type: object + properties: + role: + type: string + description: Role of the message for this chunk + content: + type: string + description: Partial assistant message text + thinking: + type: string + description: Partial thinking text when `think` is enabled + tool_calls: + type: array + items: + $ref: "#/components/schemas/ToolCall" + description: Partial tool calls, if any + images: + type: array + items: + type: string + nullable: true + description: Partial base64-encoded images, when present + done: + type: boolean + description: True for the final event in the stream + StatusEvent: + type: object + properties: + status: + type: string + description: Human-readable status message + digest: + type: string + description: Content digest associated with the status, if applicable + total: + type: integer + description: Total number of bytes expected for the operation + completed: + type: integer + description: Number of bytes transferred so far + StatusResponse: + type: object + properties: + status: + type: string + description: Current status message + EmbedRequest: + type: object + required: [model, input] + properties: + model: + type: string + description: Model name + input: + oneOf: + - type: string + - type: array + items: + type: string + description: Text or array of texts to generate embeddings for + truncate: + type: boolean + default: true + description: If true, truncate inputs that exceed the context window. If false, returns an error. + dimensions: + type: integer + description: Number of dimensions to generate embeddings for + keep_alive: + type: string + description: Model keep-alive duration + options: + $ref: "#/components/schemas/ModelOptions" + EmbedResponse: + type: object + properties: + model: + type: string + description: Model that produced the embeddings + embeddings: + type: array + items: + type: array + items: + type: number + description: Array of vector embeddings + total_duration: + type: integer + description: Total time spent generating in nanoseconds + load_duration: + type: integer + description: Load time in nanoseconds + prompt_eval_count: + type: integer + description: Number of input tokens processed to generate embeddings + CreateRequest: + type: object + required: [model] + properties: + model: + type: string + description: Name for the model to create + from: + type: string + description: Existing model to create from + template: + type: string + description: Prompt template to use for the model + license: + oneOf: + - type: string + - type: array + items: + type: string + description: License string or list of licenses for the model + system: + type: string + description: System prompt to embed in the model + parameters: + type: object + description: Key-value parameters for the model + messages: + description: Message history to use for the model + type: array + items: + $ref: "#/components/schemas/ChatMessage" + quantize: + type: string + description: Quantization level to apply (e.g. 
`q4_K_M`, `q8_0`) + stream: + type: boolean + default: true + description: Stream status updates + CopyRequest: + type: object + required: [source, destination] + properties: + source: + type: string + description: Existing model name to copy from + destination: + type: string + description: New model name to create + DeleteRequest: + type: object + required: [model] + properties: + model: + type: string + description: Model name to delete + PullRequest: + type: object + required: [model] + properties: + model: + type: string + description: Name of the model to download + insecure: + type: boolean + description: Allow downloading over insecure connections + stream: + type: boolean + default: true + description: Stream progress updates + PushRequest: + type: object + required: [model] + properties: + model: + type: string + description: Name of the model to publish + insecure: + type: boolean + description: Allow publishing over insecure connections + stream: + type: boolean + default: true + description: Stream progress updates + ShowRequest: + type: object + required: [model] + properties: + model: + type: string + description: Model name to show + verbose: + type: boolean + description: If true, includes large verbose fields in the response. + ShowResponse: + type: object + properties: + parameters: + type: string + description: Model parameter settings serialized as text + license: + type: string + description: The license of the model + details: + type: object + description: High-level model details + template: + type: string + description: The template used by the model to render prompts + capabilities: + type: array + items: + type: string + description: List of supported features + model_info: + type: object + description: Additional model metadata + ModelSummary: + type: object + description: Summary information for a locally available model + properties: + name: + type: string + description: Model name + modified_at: + type: string + description: Last modified timestamp in ISO 8601 format + size: + type: integer + description: Total size of the model on disk in bytes + digest: + type: string + description: SHA256 digest identifier of the model contents + details: + type: object + description: Additional information about the model's format and family + properties: + format: + type: string + description: Model file format (for example `gguf`) + family: + type: string + description: Primary model family (for example `llama`) + families: + type: array + items: + type: string + description: All families the model belongs to, when applicable + parameter_size: + type: string + description: Approximate parameter count label (for example `7B`, `13B`) + quantization_level: + type: string + description: Quantization level used (for example `Q4_0`) + ListResponse: + type: object + properties: + models: + type: array + items: + $ref: "#/components/schemas/ModelSummary" + Ps: + type: object + properties: + model: + type: string + description: Name of the running model + size: + type: integer + description: Size of the model in bytes + digest: + type: string + description: SHA256 digest of the model + details: + type: object + description: Model details such as format and family + expires_at: + type: string + description: Time when the model will be unloaded + size_vram: + type: integer + description: VRAM usage in bytes + PsResponse: + type: object + properties: + models: + type: array + items: + $ref: "#/components/schemas/Ps" + description: Currently running models + WebSearchRequest: + 
type: object + required: [query] + properties: + query: + type: string + description: Search query string + max_results: + type: integer + minimum: 1 + maximum: 10 + default: 5 + description: Maximum number of results to return + WebSearchResult: + type: object + properties: + title: + type: string + description: Page title of the result + url: + type: string + format: uri + description: Resolved URL for the result + content: + type: string + description: Extracted text content snippet + WebSearchResponse: + type: object + properties: + results: + type: array + items: + $ref: "#/components/schemas/WebSearchResult" + description: Array of matching search results + WebFetchRequest: + type: object + required: [url] + properties: + url: + type: string + format: uri + description: The URL to fetch + WebFetchResponse: + type: object + properties: + title: + type: string + description: Title of the fetched page + content: + type: string + description: Extracted page content + links: + type: array + items: + type: string + format: uri + description: Links found on the page + VersionResponse: + type: object + properties: + version: + type: string + description: Version of Ollama + ErrorResponse: + type: object + properties: + error: + type: string + description: Error message describing what went wrong +paths: + /api/generate: + post: + summary: Generate a response + description: Generates a response for the provided prompt + operationId: generate + x-mint: + href: /api/generate + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "prompt": "Why is the sky blue?" + }' + - lang: bash + label: Non-streaming + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "prompt": "Why is the sky blue?", + "stream": false + }' + - lang: bash + label: With options + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "prompt": "Why is the sky blue?", + "options": { + "temperature": 0.8, + "top_p": 0.9, + "seed": 42 + } + }' + - lang: bash + label: Structured outputs + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "prompt": "What are the populations of the United States and Canada?", + "stream": false, + "format": { + "type": "object", + "properties": { + "countries": { + "type": "array", + "items": { + "type": "object", + "properties": { + "country": {"type": "string"}, + "population": {"type": "integer"} + }, + "required": ["country", "population"] + } + } + }, + "required": ["countries"] + } + }' + - lang: bash + label: With images + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "prompt": "What is in this picture?", + "images": 
["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPd
urUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"] + }' + - lang: bash + label: Load model + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3" + }' + - lang: bash + label: Unload model + source: | + curl http://localhost:11434/api/generate -d '{ + "model": "gemma3", + "keep_alive": 0 + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/GenerateRequest" + example: + model: gemma3 + prompt: Why is the sky blue? + responses: + "200": + description: Generation responses + content: + application/json: + schema: + $ref: "#/components/schemas/GenerateResponse" + example: + model: "gemma3" + created_at: "2025-10-17T23:14:07.414671Z" + response: "Hello! How can I help you today?" + done: true + done_reason: "stop" + total_duration: 174560334 + load_duration: 101397084 + prompt_eval_count: 11 + prompt_eval_duration: 13074791 + eval_count: 18 + eval_duration: 52479709 + application/x-ndjson: + schema: + $ref: "#/components/schemas/GenerateStreamEvent" + /api/chat: + post: + summary: Generate a chat message + description: Generate the next chat message in a conversation between a user and an assistant. + operationId: chat + x-mint: + href: /api/chat + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/chat -d '{ + "model": "gemma3", + "messages": [ + { + "role": "user", + "content": "why is the sky blue?" + } + ] + }' + - lang: bash + label: Non-streaming + source: | + curl http://localhost:11434/api/chat -d '{ + "model": "gemma3", + "messages": [ + { + "role": "user", + "content": "why is the sky blue?" + } + ], + "stream": false + }' + - lang: bash + label: Structured outputs + source: | + curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "gemma3", + "messages": [ + { + "role": "user", + "content": "What are the populations of the United States and Canada?" 
+ } + ], + "stream": false, + "format": { + "type": "object", + "properties": { + "countries": { + "type": "array", + "items": { + "type": "object", + "properties": { + "country": {"type": "string"}, + "population": {"type": "integer"} + }, + "required": ["country", "population"] + } + } + }, + "required": ["countries"] + } + }' + - lang: bash + label: Tool calling + source: | + curl http://localhost:11434/api/chat -d '{ + "model": "qwen3", + "messages": [ + { + "role": "user", + "content": "What is the weather today in Paris?" + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location", "format"] + } + } + } + ] + }' + - lang: bash + label: Thinking + source: | + curl http://localhost:11434/api/chat -d '{ + "model": "gpt-oss", + "messages": [ + { + "role": "user", + "content": "What is 1+1?" + } + ], + "think": "low" + }' + - lang: bash + label: Images + source: | + curl http://localhost:11434/api/chat -d '{ + "model": "gemma3", + "messages": [ + { + "role": "user", + "content": "What is in this image?", + "images": [ + "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/
vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC" + ] + } + ] + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ChatRequest" + responses: + "200": + description: Chat response + content: + application/json: + schema: + $ref: "#/components/schemas/ChatResponse" + example: + model: "gemma3" + created_at: "2025-10-17T23:14:07.414671Z" + message: + role: "assistant" + content: "Hello! How can I help you today?" 
+ done: true + done_reason: "stop" + total_duration: 174560334 + load_duration: 101397084 + prompt_eval_count: 11 + prompt_eval_duration: 13074791 + eval_count: 18 + eval_duration: 52479709 + application/x-ndjson: + schema: + $ref: "#/components/schemas/ChatStreamEvent" + /api/embed: + post: + summary: Generate embeddings + description: Creates vector embeddings representing the input text + operationId: embed + x-mint: + href: /api/embed + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/embed -d '{ + "model": "embeddinggemma", + "input": "Why is the sky blue?" + }' + - lang: bash + label: Multiple inputs + source: | + curl http://localhost:11434/api/embed -d '{ + "model": "embeddinggemma", + "input": [ + "Why is the sky blue?", + "Why is the grass green?" + ] + }' + - lang: bash + label: Truncation + source: | + curl http://localhost:11434/api/embed -d '{ + "model": "embeddinggemma", + "input": "Generate embeddings for this text", + "truncate": true + }' + - lang: bash + label: Dimensions + source: | + curl http://localhost:11434/api/embed -d '{ + "model": "embeddinggemma", + "input": "Generate embeddings for this text", + "dimensions": 128 + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/EmbedRequest" + example: + model: embeddinggemma + input: "Generate embeddings for this text" + responses: + "200": + description: Vector embeddings for the input text + content: + application/json: + schema: + $ref: "#/components/schemas/EmbedResponse" + example: + model: "embeddinggemma" + embeddings: + - [ + 0.010071029, + -0.0017594862, + 0.05007221, + 0.04692972, + 0.054916814, + 0.008599704, + 0.105441414, + -0.025878139, + 0.12958129, + 0.031952348, + ] + total_duration: 14143917 + load_duration: 1019500 + prompt_eval_count: 8 + /api/tags: + get: + summary: List models + description: Fetch a list of models and their details + operationId: list + x-mint: + href: /api/tags + x-codeSamples: + - lang: bash + label: List models + source: | + curl http://localhost:11434/api/tags + responses: + "200": + description: List available models + content: + application/json: + schema: + $ref: "#/components/schemas/ListResponse" + example: + models: + - name: "gemma3" + modified_at: "2025-10-03T23:34:03.409490317-07:00" + size: 3338801804 + digest: "a2af6cc3eb7fa8be8504abaf9b04e88f17a119ec3f04a3addf55f92841195f5a" + details: + format: "gguf" + family: "gemma" + families: + - "gemma" + parameter_size: "4.3B" + quantization_level: "Q4_K_M" + /api/ps: + get: + summary: List running models + description: Retrieve a list of models that are currently running + operationId: ps + x-mint: + href: /api/ps + x-codeSamples: + - lang: bash + label: List running models + source: | + curl http://localhost:11434/api/ps + responses: + "200": + description: Models currently loaded into memory + content: + application/json: + schema: + $ref: "#/components/schemas/PsResponse" + example: + models: + - model: "gemma3" + size: 6591830464 + digest: "a2af6cc3eb7fa8be8504abaf9b04e88f17a119ec3f04a3addf55f92841195f5a" + details: + parent_model: "" + format: "gguf" + family: "gemma3" + families: + - "gemma3" + parameter_size: "4.3B" + quantization_level: "Q4_K_M" + expires_at: "2025-10-17T16:47:07.93355-07:00" + size_vram: 5333539264 + context_length: 4096 + /api/show: + post: + summary: Show model details + operationId: show + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/show -d '{ + 
"model": "gemma3" + }' + - lang: bash + label: Verbose + source: | + curl http://localhost:11434/api/show -d '{ + "model": "gemma3", + "verbose": true + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ShowRequest" + example: + model: gemma3 + responses: + "200": + description: Model information + content: + application/json: + schema: + $ref: "#/components/schemas/ShowResponse" + example: + parameters: "temperature 0.7\nnum_ctx 2048" + license: "Gemma Terms of Use \n\nLast modified: February 21, 2024..." + capabilities: + - "completion" + - "vision" + modified_at: "2025-08-14T15:49:43.634137516-07:00" + details: + parent_model: "" + format: "gguf" + family: "gemma3" + families: + - "gemma3" + parameter_size: "4.3B" + quantization_level: "Q4_K_M" + model_info: + gemma3.attention.head_count: 8 + gemma3.attention.head_count_kv: 4 + gemma3.attention.key_length: 256 + gemma3.attention.sliding_window: 1024 + gemma3.attention.value_length: 256 + gemma3.block_count: 34 + gemma3.context_length: 131072 + gemma3.embedding_length: 2560 + gemma3.feed_forward_length: 10240 + gemma3.mm.tokens_per_image: 256 + gemma3.vision.attention.head_count: 16 + gemma3.vision.attention.layer_norm_epsilon: 0.000001 + gemma3.vision.block_count: 27 + gemma3.vision.embedding_length: 1152 + gemma3.vision.feed_forward_length: 4304 + gemma3.vision.image_size: 896 + gemma3.vision.num_channels: 3 + gemma3.vision.patch_size: 14 + general.architecture: "gemma3" + general.file_type: 15 + general.parameter_count: 4299915632 + general.quantization_version: 2 + tokenizer.ggml.add_bos_token: true + tokenizer.ggml.add_eos_token: false + tokenizer.ggml.add_padding_token: false + tokenizer.ggml.add_unknown_token: false + tokenizer.ggml.bos_token_id: 2 + tokenizer.ggml.eos_token_id: 1 + tokenizer.ggml.merges: null + tokenizer.ggml.model: "llama" + tokenizer.ggml.padding_token_id: 0 + tokenizer.ggml.pre: "default" + tokenizer.ggml.scores: null + tokenizer.ggml.token_type: null + tokenizer.ggml.tokens: null + tokenizer.ggml.unknown_token_id: 3 + /api/create: + post: + summary: Create a model + operationId: create + x-mint: + href: /api/create + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/create -d '{ + "from": "gemma3", + "model": "alpaca", + "system": "You are Alpaca, a helpful AI assistant. You only answer with Emojis." + }' + - lang: bash + label: Create from existing + source: | + curl http://localhost:11434/api/create -d '{ + "model": "ollama", + "from": "gemma3", + "system": "You are Ollama the llama." + }' + - lang: bash + label: Quantize + source: | + curl http://localhost:11434/api/create -d '{ + "model": "llama3.1:8b-instruct-Q4_K_M", + "from": "llama3.1:8b-instruct-fp16", + "quantize": "q4_K_M" + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/CreateRequest" + example: + model: mario + from: gemma3 + system: "You are Mario from Super Mario Bros." 
+ responses: + "200": + description: Stream of create status updates + content: + application/json: + schema: + $ref: "#/components/schemas/StatusResponse" + example: + status: "success" + application/x-ndjson: + schema: + $ref: "#/components/schemas/StatusEvent" + example: + status: "success" + /api/copy: + post: + summary: Copy a model + operationId: copy + x-mint: + href: /api/copy + x-codeSamples: + - lang: bash + label: Copy a model to a new name + source: | + curl http://localhost:11434/api/copy -d '{ + "source": "gemma3", + "destination": "gemma3-backup" + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/CopyRequest" + example: + source: gemma3 + destination: gemma3-backup + /api/pull: + post: + summary: Pull a model + operationId: pull + x-mint: + href: /api/pull + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/pull -d '{ + "model": "gemma3" + }' + - lang: bash + label: Non-streaming + source: | + curl http://localhost:11434/api/pull -d '{ + "model": "gemma3", + "stream": false + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/PullRequest" + example: + model: gemma3 + responses: + "200": + description: Pull status updates. + content: + application/json: + schema: + $ref: "#/components/schemas/StatusResponse" + example: + status: "success" + application/x-ndjson: + schema: + $ref: "#/components/schemas/StatusEvent" + example: + status: "success" + /api/push: + post: + summary: Push a model + operationId: push + x-mint: + href: /api/push + x-codeSamples: + - lang: bash + label: Push model + source: | + curl http://localhost:11434/api/push -d '{ + "model": "my-username/my-model" + }' + - lang: bash + label: Non-streaming + source: | + curl http://localhost:11434/api/push -d '{ + "model": "my-username/my-model", + "stream": false + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/PushRequest" + example: + model: my-username/my-model + responses: + "200": + description: Push status updates. + content: + application/json: + schema: + $ref: "#/components/schemas/StatusResponse" + example: + status: "success" + application/x-ndjson: + schema: + $ref: "#/components/schemas/StatusEvent" + example: + status: "success" + /api/delete: + delete: + summary: Delete a model + operationId: delete + x-mint: + href: /api/delete + x-codeSamples: + - lang: bash + label: Delete model + source: | + curl -X DELETE http://localhost:11434/api/delete -d '{ + "model": "gemma3" + }' + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/DeleteRequest" + example: + model: gemma3 + responses: + "200": + description: Deletion status updates. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/StatusResponse" + example: + status: "success" + application/x-ndjson: + schema: + $ref: "#/components/schemas/StatusEvent" + /api/version: + get: + summary: Get version + description: Retrieve the version of the Ollama + operationId: version + x-codeSamples: + - lang: bash + label: Default + source: | + curl http://localhost:11434/api/version + responses: + "200": + description: Version information + content: + application/json: + schema: + $ref: "#/components/schemas/VersionResponse" + example: + version: "0.12.6" diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx new file mode 100644 index 00000000..5ef9fa82 --- /dev/null +++ b/docs/quickstart.mdx @@ -0,0 +1,103 @@ +--- +title: Quickstart +--- + +This quickstart will walk your through running your first model with Ollama. To get started, download Ollama on macOS, Windows or Linux. + + + Download Ollama + + +## Run a model + + + + Open a terminal and run the command: + + ``` + ollama run gemma3 + ``` + + + + ``` + ollama pull gemma3 + ``` + + Lastly, chat with the model: + + ```shell + curl http://localhost:11434/api/chat -d '{ + "model": "gemma3", + "messages": [{ + "role": "user", + "content": "Hello there!" + }], + "stream": false + }' + ``` + + + + Start by downloading a model: + + ``` + ollama pull gemma3 + ``` + + Then install Ollama's Python library: + + ``` + pip install ollama + ``` + + Lastly, chat with the model: + + ```python + from ollama import chat + from ollama import ChatResponse + + response: ChatResponse = chat(model='gemma3', messages=[ + { + 'role': 'user', + 'content': 'Why is the sky blue?', + }, + ]) + print(response['message']['content']) + # or access fields directly from the response object + print(response.message.content) + ``` + + + + Start by downloading a model: + + ``` + ollama pull gemma3 + ``` + + Then install the Ollama JavaScript library: + ``` + npm i ollama + ``` + + Lastly, chat with the model: + + ```shell + import ollama from 'ollama' + + const response = await ollama.chat({ + model: 'gemma3', + messages: [{ role: 'user', content: 'Why is the sky blue?' }], + }) + console.log(response.message.content) + ``` + + + + +See a full list of available models [here](https://ollama.com/models). diff --git a/docs/styling.css b/docs/styling.css new file mode 100644 index 00000000..e63b6be8 --- /dev/null +++ b/docs/styling.css @@ -0,0 +1,16 @@ +body { + font-family: ui-sans-serif, system-ui, sans-serif, Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji; +} + +pre, code, .font-mono { + font-family: ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,monospace; +} + +.nav-logo { + height: 44px; +} + +.eyebrow { + color: #666; + font-weight: 400; +} diff --git a/docs/template.md b/docs/template.mdx similarity index 87% rename from docs/template.md rename to docs/template.mdx index 636d72f0..9ebac8c0 100644 --- a/docs/template.md +++ b/docs/template.mdx @@ -1,4 +1,6 @@ -# Template +--- +title: Template +--- Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models. @@ -6,13 +8,13 @@ Ollama provides a powerful templating engine backed by Go's built-in templating A basic Go template consists of three main parts: -* **Layout**: The overall structure of the template. 
-* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered. -* **Functions**: Custom functions or logic that can be used to manipulate the template's content. +- **Layout**: The overall structure of the template. +- **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered. +- **Functions**: Custom functions or logic that can be used to manipulate the template's content. Here's an example of a simple chat template: -```go +```gotmpl {{- range .Messages }} {{ .Role }}: {{ .Content }} {{- end }} @@ -20,9 +22,9 @@ Here's an example of a simple chat template: In this example, we have: -* A basic messages structure (layout) -* Three variables: `Messages`, `Role`, and `Content` (variables) -* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item +- A basic messages structure (layout) +- Three variables: `Messages`, `Role`, and `Content` (variables) +- A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item ## Adding templates to your model @@ -61,7 +63,7 @@ TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> `Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool` -`Messages[].Content` (string): message content +`Messages[].Content` (string): message content `Messages[].ToolCalls` (list): list of tools the model wants to call @@ -99,9 +101,9 @@ TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> Keep the following tips and best practices in mind when working with Go templates: -* **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.` -* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root -* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace +- **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.` +- **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root +- **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace ## Examples @@ -155,13 +157,14 @@ CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://o
 {{ .Prompt }} {{ .Suffix }} 
 ```
 
-> [!NOTE]
-> CodeLlama 34B and 70B code completion and all instruct and Python fine-tuned models do not support fill-in-middle.
+<Note>
+  CodeLlama 34B and 70B code completion and all instruct and Python fine-tuned models do not support fill-in-middle.
+</Note>
 
 #### Codestral
 
 Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
 
-```go
+```gotmpl
 [SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
 ```
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 18c014d1..c141bf43 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -1,106 +1,3 @@
-# How to troubleshoot issues
+# Troubleshooting
 
-Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on **Mac** by running the command:
-
-```shell
-cat ~/.ollama/logs/server.log
-```
-
-On **Linux** systems with systemd, the logs can be found with this command:
-
-```shell
-journalctl -u ollama --no-pager --follow --pager-end
-```
-
-When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
-
-```shell
-docker logs 
-```
-
-(Use `docker ps` to find the container name)
-
-If manually running `ollama serve` in a terminal, the logs will be on that terminal.
-
-When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
-- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
-- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
-
-To enable additional debug logging to help troubleshoot problems, first **Quit the running app from the tray menu** then in a powershell terminal
-
-```powershell
-$env:OLLAMA_DEBUG="1"
-& "ollama app.exe"
-```
-
-Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
-
-## LLM libraries
-
-Ollama includes multiple LLM libraries compiled for different GPU libraries and versions. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library.
-
-**Experimental LLM Library Override**
-
-You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to limit autodetection, so for example, if you have both CUDA and AMD GPUs, but want to force the CUDA v13 only, use:
-
-```shell
-OLLAMA_LLM_LIBRARY="cuda_v13" ollama serve
-```
-
-## Installing older or pre-release versions on Linux
-
-If you run into problems on Linux and want to install an older version, or you'd like to try out a pre-release before it's officially released, you can tell the install script which version to install.
-
-```shell
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.5.7 sh
-```
-
-## Linux docker
-
-If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
-
-## NVIDIA GPU Discovery
-
-When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available.  Sometimes this discovery can fail to find your GPUs.  In general, running the latest driver will yield the best results.
-
-### Linux NVIDIA Troubleshooting
-
-If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
-
-Sometimes the Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem
-
-- If you are using a container, is the container runtime working?  Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
-- Is the uvm driver loaded? `sudo nvidia-modprobe -u`
-- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
-- Try rebooting
-- Make sure you're running the latest nvidia drivers
-
-If none of those resolve the problem, gather additional information and file an issue:
-- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
-- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
-
-You may get more details for initialization failures by enabling debug prints in the uvm driver.  You should only use this temporarily while troubleshooting
-- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1`
-
-
-## AMD GPU Discovery
-
-On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
-
-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
-
-If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
-- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
-- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
-- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
-
-## Multiple AMD GPUs
-
-If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
-
-- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
-
-## Windows Terminal Errors
-
-Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly.  This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`  To resolve this problem, please update to Win 10 22H1 or newer.
+For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)
diff --git a/docs/troubleshooting.mdx b/docs/troubleshooting.mdx
new file mode 100644
index 00000000..ec662572
--- /dev/null
+++ b/docs/troubleshooting.mdx
@@ -0,0 +1,125 @@
+---
+title: Troubleshooting
+description: How to troubleshoot issues encountered with Ollama
+---
+
+Sometimes Ollama may not perform as expected. One of the best ways to figure out what happened is to take a look at the logs. Find the logs on **Mac** by running the command:
+
+```shell
+cat ~/.ollama/logs/server.log
+```
+
+On **Linux** systems with systemd, the logs can be found with this command:
+
+```shell
+journalctl -u ollama --no-pager --follow --pager-end
+```
+
+When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
+
+```shell
+docker logs <container-name>
+```
+
+(Use `docker ps` to find the container name)
+
+If manually running `ollama serve` in a terminal, the logs will be on that terminal.
+
+When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `Win+R` and typing in:
+
+- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
+- `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
+- `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
+- `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
+
+To enable additional debug logging to help troubleshoot problems, first **Quit the running app from the tray menu**, then run the following in a PowerShell terminal:
+
+```powershell
+$env:OLLAMA_DEBUG="1"
+& "ollama app.exe"
+```
+
+Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
+
+## LLM libraries
+
+Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU), you can work around this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx`; the slowest but most compatible is `cpu`. Rosetta emulation under macOS will work with the `cpu` library.
+
+In the server log, you will see a message that looks something like this (varies from release to release):
+
+```
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+```
+
+**Experimental LLM Library Override**
+
+You can set `OLLAMA_LLM_LIBRARY` to any of the available LLM libraries to bypass autodetection. For example, if you have a CUDA card but want to force the CPU LLM library with AVX2 vector support, use:
+
+```shell
+OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
+```
+
+You can see what features your CPU has with the following command:
+
+```shell
+cat /proc/cpuinfo | grep flags | head -1
+```
+
+## Installing older or pre-release versions on Linux
+
+If you run into problems on Linux and want to install an older version, or you'd like to try out a pre-release before it's officially released, you can tell the install script which version to install.
+
+```shell
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.5.7 sh
+```
+
+## Linux tmp noexec
+
+If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting `OLLAMA_TMPDIR` to a location writable by the user Ollama runs as, for example `OLLAMA_TMPDIR=/usr/share/ollama/`.
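+
+A minimal sketch of applying the override when launching the server manually (assuming `/usr/share/ollama/` exists and is writable by the user running Ollama):
+
+```shell
+OLLAMA_TMPDIR=/usr/share/ollama/ ollama serve
+```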
+
+## Linux docker
+
+If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
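+
+A minimal `/etc/docker/daemon.json` sketch (assuming the file has no other options; if it does, merge this key into your existing configuration):
+
+```json
+{
+  "exec-opts": ["native.cgroupdriver=cgroupfs"]
+}
+```
+
+Restart Docker afterwards (for example with `sudo systemctl restart docker`) so the change takes effect.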
+
+## NVIDIA GPU Discovery
+
+When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results.
+
+### Linux NVIDIA Troubleshooting
+
+If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
+
+Sometimes Ollama can have difficulty initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem:
+
+- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
+- Is the uvm driver loaded? `sudo nvidia-modprobe -u`
+- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
+- Try rebooting
+- Make sure you're running the latest nvidia drivers
+
+If none of those resolve the problem, gather additional information and file an issue:
+
+- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
+- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
+
+## AMD GPU Discovery
+
+On Linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
+
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
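+
+For example, a sketch of passing the group IDs to the container (`44` matches the example output above; the second ID is only illustrative, so replace both with the values reported by `ls -lnd` on your own host):
+
+```shell
+docker run -d --device /dev/kfd --device /dev/dri \
+  --group-add 44 --group-add 993 \
+  -v ollama:/root/.ollama -p 11434:11434 \
+  --name ollama ollama/ollama:rocm
+```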
+
+If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
+
+- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can show more detailed error codes that help troubleshoot problems
+- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
+- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
+
+## Multiple AMD GPUs
+
+If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
+
+- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
+
+## Windows Terminal Errors
+
+Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in long strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`. To resolve this problem, please update to Windows 10 22H2 or newer.
diff --git a/docs/windows.md b/docs/windows.md
deleted file mode 100644
index eb067ed0..00000000
--- a/docs/windows.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# Ollama Windows
-
-Welcome to Ollama for Windows.
-
-No more WSL required!
-
-Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama for Windows, Ollama will run in the background and
-the `ollama` command line is available in `cmd`, `powershell` or your favorite
-terminal application. As usual the Ollama [api](./api.md) will be served on
-`http://localhost:11434`.
-
-## System Requirements
-
-* Windows 10 22H2 or newer, Home or Pro
-* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
-* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
-
-Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
-
-## Filesystem Requirements
-
-The Ollama install does not require Administrator, and installs in your home directory by default.  You'll need at least 4GB of space for the binary install.  Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
-
-### Changing Install Location
-
-To install the Ollama application in a location different than your home directory, start the installer with the following flag
-
-```powershell
-OllamaSetup.exe /DIR="d:\some\location"
-```
-
-## API Access
-
-Here's a quick example showing API access from `powershell`
-
-```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
-```
-
-## Troubleshooting
-
-Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
-    - *app.log* contains most resent logs from the GUI application
-    - *server.log* contains the most recent server logs
-    - *upgrade.log* contains log output for upgrades
-- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
-- `explorer %HOMEPATH%\.ollama` contains models and configuration
-
-## Uninstall
-
-The Ollama Windows installer registers an Uninstaller application.  Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.
-
-> [!NOTE]
-> If you have [changed the OLLAMA_MODELS location](#changing-model-location), the installer will not remove your downloaded models
-
-
-## Standalone CLI
-
-The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
-installer. It installs in your account without requiring Administrator rights.
-We update Ollama regularly to support the latest models, and this installer will
-help you keep up to date.
-
-If you'd like to install or integrate Ollama as a service, a standalone
-`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
-and GPU library dependencies for Nvidia.  If you have an AMD GPU, also download
-and extract the additional ROCm package `ollama-windows-amd64-rocm.zip` into the
-same directory.  Both zip files are necessary for a complete AMD installation.
-This allows for embedding Ollama in existing applications, or running it as a
-system service via `ollama serve` with tools such as [NSSM](https://nssm.cc/). 
-
-> [!NOTE]  
-> If you are upgrading from a prior version, you should remove the old directories first.
diff --git a/docs/windows.mdx b/docs/windows.mdx
new file mode 100644
index 00000000..37fd1973
--- /dev/null
+++ b/docs/windows.mdx
@@ -0,0 +1,91 @@
+---
+title: Windows
+---
+
+Ollama runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
+After installing Ollama for Windows, Ollama will run in the background and
+the `ollama` command line is available in `cmd`, `powershell` or your favorite
+terminal application. As usual the Ollama [API](/api) will be served on
+`http://localhost:11434`.
+
+## System Requirements
+
+- Windows 10 22H2 or newer, Home or Pro
+- NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
+- AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
+
+Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
+
+## Filesystem Requirements
+
+The Ollama install does not require Administrator, and installs in your home directory by default. You'll need at least 4GB of space for the binary install. Once you've installed Ollama, you'll need additional space for storing the large language models, which can be tens to hundreds of GB in size. If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+
+### Changing Install Location
+
+To install the Ollama application in a location different from your home directory, start the installer with the following flag:
+
+```powershell
+OllamaSetup.exe /DIR="d:\some\location"
+```
+
+### Changing Model Location
+
+To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
+
+1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
+
+2. Click on _Edit environment variables for your account_.
+
+3. Edit or create a new user variable named `OLLAMA_MODELS` pointing to the directory where you want the models stored.
+
+4. Click OK/Apply to save.
+
+If Ollama is already running, quit the tray application and relaunch it from the Start menu, or from a new terminal started after you saved the environment variable.
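+
+As an alternative to the Settings UI, a sketch of setting the variable from PowerShell (the path below is only an example):
+
+```powershell
+setx OLLAMA_MODELS "D:\ollama\models"
+```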
+
+## API Access
+
+Here's a quick example showing API access from `powershell`:
+
+```powershell
+(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+```
+
+## Troubleshooting
+
+Ollama on Windows stores files in a few different locations. You can view them in
+the explorer window by hitting `Win+R` and typing in:
+
+- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
+  - _app.log_ contains the most recent logs from the GUI application
+  - _server.log_ contains the most recent server logs
+  - _upgrade.log_ contains log output for upgrades
+- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
+- `explorer %HOMEPATH%\.ollama` contains models and configuration
+- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+
+## Uninstall
+
+The Ollama Windows installer registers an Uninstaller application. Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.
+
+<Note>
+  If you have [changed the OLLAMA_MODELS location](#changing-model-location), the installer will not remove your downloaded models
+</Note>
+
+## Standalone CLI
+
+The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
+installer. It installs in your account without requiring Administrator rights.
+We update Ollama regularly to support the latest models, and this installer will
+help you keep up to date.
+
+If you'd like to install or integrate Ollama as a service, a standalone
+`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
+and GPU library dependencies for Nvidia. If you have an AMD GPU, also download
+and extract the additional ROCm package `ollama-windows-amd64-rocm.zip` into the
+same directory. This allows for embedding Ollama in existing applications, or
+running it as a system service via `ollama serve` with tools such as
+[NSSM](https://nssm.cc/).
+
+<Note>
+  If you are upgrading from a prior version, you should remove the old directories first.
+</Note>
diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index fcb3d9fd..c0ca068a 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -242,13 +242,13 @@ func (kv KV) OllamaEngineRequired() bool {
 	return slices.Contains([]string{
 		"gemma3",
 		"gemma3n",
-		"mistral3",
-		"qwen3",
-		"qwen3moe",
+		"gptoss", "gpt-oss",
 		"llama4",
+		"mistral3",
 		"mllama",
 		"qwen25vl",
-		"gptoss", "gpt-oss",
+		"qwen3", "qwen3moe",
+		"qwen3vl", "qwen3vlmoe",
 	}, kv.Architecture())
 }
 
diff --git a/integration/README.md b/integration/README.md
index 1dfd0e35..5d2acc45 100644
--- a/integration/README.md
+++ b/integration/README.md
@@ -7,7 +7,7 @@ By default, these tests are disabled so `go test ./...` will exercise only unit
 
 The integration tests have 2 modes of operating.
 
-1. By default, they will start the server on a random port, run the tests, and then shutdown the server.
+1. By default, on Unix systems, they will start the server on a random port, run the tests, and then shut down the server. On Windows, you must ALWAYS run the server on OLLAMA_HOST for the tests to work.
 2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests will run against an existing running server, which can be remote based on your `OLLAMA_HOST` environment variable
 
 > [!IMPORTANT]
diff --git a/integration/embed_test.go b/integration/embed_test.go
index 3a8bcd24..e155498d 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -14,6 +14,10 @@ import (
 
 func dotProduct[V float32 | float64](v1, v2 []V) V {
 	var result V = 0
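+	// Return 0 for mismatched vector lengths rather than panicking on an out-of-range index.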
+	if len(v1) != len(v2) {
+		return result
+	}
+
 	for i := 0; i < len(v1); i++ {
 		result += v1[i] * v2[i]
 	}
@@ -29,9 +33,115 @@ func magnitude[V float32 | float64](v []V) V {
 }
 
 func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
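+	// Guard against zero-magnitude vectors to avoid dividing by zero below.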
+	mag1 := magnitude(v1)
+	mag2 := magnitude(v2)
+
+	if mag1 == 0 || mag2 == 0 {
+		return 0
+	}
+
 	return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
 }
 
+func euclideanDistance[V float32 | float64](v1, v2 []V) V {
+	if len(v1) != len(v2) {
+		return V(math.Inf(1))
+	}
+
+	var sum V = 0
+	for i := 0; i < len(v1); i++ {
+		diff := v1[i] - v2[i]
+		sum += diff * diff
+	}
+
+	return V(math.Sqrt(float64(sum)))
+}
+
+func manhattanDistance[V float32 | float64](v1, v2 []V) V {
+	if len(v1) != len(v2) {
+		return V(math.Inf(1))
+	}
+
+	var sum V = 0
+	for i := 0; i < len(v1); i++ {
+		sum += V(math.Abs(float64(v1[i] - v2[i])))
+	}
+
+	return sum
+}
+
+func TestEmbedCosineDistanceCorrelation(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for _, model := range libraryEmbedModels {
+		t.Run(model, func(t *testing.T) {
+			testCases := []struct {
+				a string
+				b string
+				c string
+			}{
+				{"cat", "kitten", "dog"},
+				{"king", "queen", "baron"},
+				{"paris", "london", "vancouver"},
+				{"The cat is sleeping on the sofa", "A feline is sleeping on the couch", "Quantum physics is complex"},
+				{"I love programming in python", "Coding in python brings me joy", "Pizza is delicious"},
+				{"Machine learning is fascinating", "Artificial intelligence is amazing", "I need to buy groceries"},
+				{"The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a sleepy dog", "The weather is warm and sunny today"},
+			}
+
+			for _, tc := range testCases {
+				testEmbed := make(map[string][]float32)
+				strs := []string{tc.a, tc.b, tc.c}
+
+				req := api.EmbedRequest{
+					Model:     model,
+					Input:     strs,
+					KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				}
+
+				resp, err := embedTestHelper(ctx, client, t, req)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				for cnt, v := range resp.Embeddings {
+					testEmbed[strs[cnt]] = v
+				}
+
+				// Calculate cosine similarities
+				cosAB := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.b])
+				cosAC := cosineSimilarity(testEmbed[tc.a], testEmbed[tc.c])
+
+				// Calculate distances
+				distAB := euclideanDistance(testEmbed[tc.a], testEmbed[tc.b])
+				distAC := euclideanDistance(testEmbed[tc.a], testEmbed[tc.c])
+
+				manhattanAB := manhattanDistance(testEmbed[tc.a], testEmbed[tc.b])
+				manhattanAC := manhattanDistance(testEmbed[tc.a], testEmbed[tc.c])
+
+				// Consistency check: if cosAB > cosAC, then distances should be smaller
+				if cosAB > cosAC {
+					if distAB >= distAC {
+						t.Errorf("Euclidean distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but distAB=%f >= distAC=%f",
+							model, tc.a, tc.b, tc.c, cosAB, cosAC, distAB, distAC)
+					}
+
+					if manhattanAB >= manhattanAC {
+						t.Errorf("Manhattan distance inconsistency (%s) for %s-%s-%s: cosAB=%f > cosAC=%f but manhattanAB=%f >= manhattanAC=%f",
+							model, tc.a, tc.b, tc.c, cosAB, cosAC, manhattanAB, manhattanAC)
+					}
+				} else {
+					t.Errorf("Cosine similarity inconsistency (%s): cosineSim(%s, %s) <= cosineSim(%s, %s)",
+						model, tc.a, tc.b, tc.a, tc.c)
+				}
+			}
+		})
+	}
+}
+
 func TestAllMiniLMEmbeddings(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
diff --git a/integration/llm_image_test.go b/integration/llm_image_test.go
index e3591565..e1c16baf 100644
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -26,6 +26,13 @@ func TestVisionModels(t *testing.T) {
 		{
 			model: "gemma3",
 		},
+		{
+			model: "qwen3-vl:8b",
+		},
+		{
+			// Qwen 3 VL mixture of experts
+			model: "qwen3-vl:30b",
+		},
 	}
 
 	for _, v := range testCases {
diff --git a/integration/testdata/embed.json b/integration/testdata/embed.json
index c80fd1b0..4850d676 100644
--- a/integration/testdata/embed.json
+++ b/integration/testdata/embed.json
@@ -8,6 +8,6 @@
 	"paraphrase-multilingual": [-0.019807, -0.124781, -0.010519, 0.035812, -0.103448, 0.051982, 0.035322, 0.030018, -0.179976, 0.194586, 0.129194, 0.157071, 0.083678, 0.074628, 0.093773, -0.367580, 0.002608, 0.086277, 0.050985, -0.005689, -0.038710, 0.071398, 0.010391, -0.059942, 0.007196, -0.066065, -0.010554, -0.011521, 0.145288, 0.120511, -0.139100, -0.096199, -0.045498, -0.109749, 0.046571, 0.023483, -0.086807, 0.150124, -0.067052, -0.100689, -0.004482, -0.014063, -0.062190, 0.071008, -0.107359, 0.012106, 0.026683, 0.107762, -0.002190, -0.121664, 0.057639, 0.175526, -0.129658, 0.061670, 0.274528, 0.052475, -0.124988, 0.189575, 0.027682, 0.105478, -0.010325, -0.008585, 0.156806, 0.021770, -0.119687, -0.030621, 0.061486, 0.089130, 0.080578, 0.004526, -0.163631, -0.035526, -0.044562, 0.036523, -0.202825, 0.050263, 0.022896, 0.042070, 0.126741, 0.073518, 0.199230, -0.121035, -0.013655, -0.071069, -0.065983, 0.313145, -0.021707, 0.124713, -0.039624, 0.225527, -0.015417, -0.164423, -0.142655, -0.059337, 0.030137, 0.127238, 0.127086, -0.082194, -0.081504, 0.325473, 0.274064, 0.185700, -0.021754, 0.175575, 0.002501, -0.045027, 0.057571, -0.260881, -0.035121, -0.142682, 0.209513, -0.166192, 0.007538, -0.121503, -0.079821, -0.121559, 0.157354, -0.130091, -0.088810, -0.004192, 0.023477, 0.050395, 0.015282, 0.022486, 0.027325, 0.041678, -0.146638, 0.171089, 0.150886, -0.087244, -0.011451, -0.035348, -0.045925, 0.063444, -0.065683, -0.126295, -0.046725, -0.017725, -0.119099, -0.096294, 0.124213, -0.001037, -0.077951, 0.116946, -0.128626, 0.076870, 0.015107, -0.013591, 0.030020, 0.049803, 0.057727, 0.192952, -0.265347, -0.031025, -0.077450, 0.015170, -0.168407, -0.094748, 0.057666, -0.069248, 0.034561, -0.111670, 0.047948, -0.082442, -0.038034, 0.005981, -0.336813, 0.151752, -0.080341, -0.163140, 0.234783, -0.070792, 0.098568, -0.062491, -0.038122, -0.056743, -0.216298, 0.015405, 0.036285, -0.018388, -0.129567, 0.114494, 0.100684, 0.136078, -0.278469, -0.029172, -0.025171, -0.035048, -0.017327, -0.020234, 0.006405, 0.059504, -0.055152, 0.047702, -0.109771, -0.095923, 0.154146, -0.082645, 0.002055, 0.063278, 0.045186, -0.016451, 0.120333, -0.030705, -0.125732, 0.082911, 0.183584, 0.005612, 0.086614, -0.122572, 0.187004, 0.008749, 0.122742, -0.099332, -0.099544, -0.030457, -0.014596, 0.159668, -0.182861, -0.038095, -0.018787, -0.129022, -0.070407, 0.040420, -0.078966, 0.110361, -0.051468, 0.023479, -0.055557, -0.074713, -0.025666, 0.041186, -0.000058, 0.008151, -0.078964, 0.127330, -0.045430, -0.043395, -0.025994, -0.305759, -0.000632, 0.091581, -0.041979, -0.096488, 0.007829, -0.035366, -0.129597, 0.031931, 0.011414, 0.026075, 0.070006, 0.143212, -0.131706, -0.065480, -0.091587, -0.089944, 0.304327, 0.096218, -0.155311, 0.154486, 0.056186, -0.002324, 0.134550, -0.185795, -0.054339, 0.010738, 0.268656, 0.230560, 0.050754, -0.097614, 0.096583, 0.082153, -0.127167, -0.107377, -0.047550, 0.109379, -0.032336, 0.005514, -0.189381, 0.015142, -0.220278, -0.155431, -0.080936, -0.017348, 0.057081, 0.040142, 0.024299, 0.038554, -0.014053, 0.088013, 0.058415, 0.047141, -0.052754, 0.062682, 0.094209, -0.061054, -0.029627, 0.057371, 0.032965, -0.137422, -0.197806, -0.105999, -0.003994, -0.005150, 0.015822, 0.145214, 0.171718, -0.092218, 0.165397, 0.172935, -0.016241, -0.069164, -0.034006, 0.263521, -0.112738, 0.144954, -0.008142, 0.109327, -0.000139, 0.203327, -0.000758, -0.102171, -0.004223, -0.122857, -0.078052, -0.005030, 0.179426, -0.008189, 0.172658, -0.182432, -0.028655, 0.246079, 0.040135, -0.001440, 
-0.101024, -0.116102, 0.035103, -0.111655, -0.171831, 0.053297, -0.021837, 0.020048, 0.071553, 0.017092, -0.495468, 0.006690, -0.174933, -0.039871, 0.017558, 0.093333, -0.067826, -0.026449, -0.034882, -0.078675, -0.026006, -0.127709, 0.073291, -0.096413, 0.173521, 0.141467, 0.049000, -0.128893, -0.095217, 0.197807, 0.064243, 0.147542, 0.107418, 0.088213, -0.047051, -0.014437, 0.377273, -0.041961, 0.123879, -0.009810, 0.105710, 0.168773, -0.020232, -0.108163, -0.050267, -0.069577, -0.031271, 0.047579, -0.278478, -0.072615, -0.059372, 0.114844, 0.055385, -0.052592, 0.140747, -0.053970, -0.049484, -0.056079, -0.052369, -0.061402, -0.010092, 0.040888, -0.010542, -0.008642, 0.127806, 0.142922, 0.061796, 0.215661, -0.121110, 0.177801, 0.082593, -0.098139, 0.160477, -0.112506, -0.128137, 0.010061, -0.246614, -0.134404, 0.134328, 0.037165, -0.056656, 0.085682, -0.002025, -0.048427, 0.047335, -0.152925, 0.076913, 0.144639, 0.002542, -0.008786, -0.207630, -0.092424, -0.056038, 0.039837, 0.130480, -0.019214, 0.085709, -0.068168, -0.057661, 0.256396, 0.000436, 0.002165, 0.008250, 0.435296, -0.023791, 0.112853, 0.118685, 0.015178, 0.142689, -0.139655, 0.084141, 0.053003, -0.127661, 0.121614, 0.090306, -0.053635, 0.143329, -0.020410, -0.130167, -0.062897, -0.043274, -0.012359, 0.014011, -0.309357, 0.110538, -0.099683, 0.018306, 0.439442, 0.034141, 0.002030, 0.026504, -0.224360, -0.192707, 0.154315, 0.020682, -0.212653, -0.198598, 0.103733, -0.084605, 0.123315, -0.190156, 0.051589, -0.114352, -0.215452, 0.227831, 0.089644, -0.156986, -0.110336, 0.023221, 0.186123, -0.009580, -0.108279, -0.008263, -0.079465, -0.019248, 0.037930, -0.005270, 0.017321, -0.003298, 0.294424, -0.011487, 0.139208, -0.054023, -0.135061, 0.010541, -0.181049, -0.041205, -0.110344, 0.128945, -0.090110, -0.092730, -0.029277, 0.101132, 0.017030, 0.041486, -0.143502, 0.224712, -0.052848, -0.128890, -0.150927, 0.027277, 0.097778, 0.225844, 0.132758, 0.049771, -0.195139, -0.030116, 0.007751, -0.079459, 0.195759, 0.028297, 0.147042, -0.010751, -0.044499, 0.024308, -0.101806, 0.131116, -0.123838, -0.073508, 0.129509, -0.011302, 0.326354, -0.237273, 0.024596, 0.004420, -0.039178, 0.025751, 0.013973, 0.154100, 0.041046, 0.024320, -0.092331, 0.075485, 0.194852, 0.043371, -0.251192, 0.134674, 0.052031, -0.132075, 0.094175, -0.014784, -0.095276, -0.167319, 0.093634, -0.053208, -0.299019, -0.019493, 0.110037, -0.111475, -0.098528, -0.045980, 0.011906, -0.084867, 0.071568, -0.053325, 0.037509, -0.058839, 0.001778, 0.058313, 0.127749, 0.036488, -0.065275, -0.057004, 0.002167, -0.194989, 0.068705, -0.069410, 0.112359, -0.152019, -0.107722, 0.070784, -0.017405, -0.203961, -0.063757, -0.000544, 0.104791, -0.084216, 0.204668, 0.103679, -0.267183, -0.073881, -0.051626, -0.263557, 0.077896, -0.046059, 0.181407, 0.004982, -0.028577, -0.070820, 0.120156, 0.068127, -0.016167, 0.168783, -0.009547, 0.057545, -0.206602, -0.138948, -0.287059, -0.089665, 0.193052, 0.181721, 0.076652, 0.230598, 0.038210, -0.065900, 0.351109, 0.163837, -0.106730, 0.004680, 0.054401, -0.162431, 0.109289, -0.027845, -0.077752, 0.074426, -0.206153, -0.205087, -0.047387, -0.115959, -0.012581, 0.006516, 0.137222, 0.024973, 0.067576, 0.079758, 0.005901, -0.085006, -0.211992, 0.079703, 0.164714, 0.012983, -0.047775, 0.009934, 0.166054, -0.117008, 0.112174, -0.081620, 0.252085, -0.095814, -0.160737, 0.098616, 0.049302, -0.169005, 0.056813, -0.110345, -0.072744, 0.016748, 0.018266, 0.276841, -0.109161, -0.030222, -0.091865, -0.098636, -0.029673, -0.037370, -0.277655, 0.068380, 0.040822, 
-0.014380, 0.363860, -0.091828, -0.034534, 0.108802, -0.056442, -0.141440, 0.096531, -0.126003, -0.072285, -0.014293, -0.315917, 0.013416, -0.057672, -0.064211, 0.077573, -0.015361, 0.105270, 0.046737, 0.073715, 0.133964, -0.039862, 0.192067, -0.038854, -0.035655, 0.101362, 0.148665, -0.078182, 0.041527, -0.077087, 0.026681, 0.089204, 0.506013, 0.121540, -0.163288, -0.046427, 0.129322, 0.186661, 0.032343, 0.020226, 0.031071, -0.050872, 0.091166, -0.050102, -0.042110, 0.055500, -0.027633, -0.272802, 0.198007, -0.049932, 0.015780, 0.053894, 0.063445, 0.013361, -0.017767, 0.103368, -0.049283, -0.161567, -0.018339, 0.159721, 0.019753, 0.256000, 0.122950, -0.067329, 0.049447, -0.039212, -0.101245, -0.019110, 0.068606, -0.009369, -0.081864, -0.116030, -0.107591, -0.032567, -0.213658, 0.024803, 0.012063, 0.073045, 0.151132, 0.040293, 0.111463, -0.057375, 0.336502, -0.153928, 0.049947, -0.022919, 0.136091, -0.179530, -0.101300, 0.034927, 0.026369, -0.290807, -0.027303, 0.077214, 0.085054, -0.088758],
 	"snowflake-arctic-embed": [0.164476, -0.981777, -0.405218, 0.399810, 0.901198, 0.409591, -0.077627, -0.677190, 0.222725, -1.757181, 1.154365, 0.970361, 0.139148, 0.673119, -0.024305, 0.273795, 0.692573, -0.239678, -0.362082, -0.275700, -0.206364, -0.501303, 0.699528, 0.320007, -0.261514, -0.199023, 0.255197, 0.461451, 0.586028, 0.502643, 0.727292, -0.206270, 0.097371, -0.161835, 0.680590, 0.230389, 0.173242, -0.845818, -0.187537, -0.595398, 0.080072, -0.614428, 0.249609, 0.753781, -0.356874, -0.436827, 0.524961, -0.157355, 0.518234, -1.566906, 0.572488, -0.467955, 0.191558, 0.039816, -0.793020, -0.215021, 1.121415, 1.650410, 0.526585, -1.186473, -0.232328, -0.854596, -0.380662, 0.417444, -0.008091, 0.964398, -0.264849, -0.478139, 0.551200, 0.654829, -0.477421, -0.520961, -0.090849, -0.448812, 0.104905, -0.738188, 0.303336, 0.398035, 1.183559, 0.649098, 0.404940, -0.358590, -0.979204, -1.484936, 0.228276, 0.803336, 2.641596, -0.125927, 0.113146, -0.385871, 0.499152, 0.051917, 0.334905, 1.279890, -0.545813, 0.604924, -0.420765, 0.912452, 0.772270, -0.737417, 0.391128, -1.199134, 0.121847, 1.555495, 0.648331, 0.196339, -0.591679, 0.363930, -0.068456, -0.155599, 0.527852, -0.488703, -0.712850, -0.531144, 0.479999, -0.559684, 1.147967, -0.265582, 0.119726, 1.675230, 0.942336, 0.065473, 0.428287, -0.342958, -0.162591, 1.297977, -0.338609, -0.096736, -0.088885, 0.330610, -1.069823, -0.485881, 0.355422, 0.058099, -0.582748, -0.080651, -2.783385, -1.813708, 0.929544, 0.427284, 0.167461, -1.018789, -0.186063, 0.125848, 1.110493, -0.323993, 0.468688, -0.310807, 0.267761, -0.193082, -0.649354, 0.090465, 0.213910, -0.901647, 0.184187, -2.126019, 0.618628, -0.386999, 0.338013, -0.291322, 0.601014, -0.248482, -0.011206, -0.109841, -0.738318, -0.745902, 1.245500, 1.346687, -0.500503, 0.614734, -0.478978, 1.417879, 0.647242, 0.600458, 0.093502, -0.399006, 0.019264, -0.670275, 0.760402, -0.139396, -0.833422, 0.600008, 0.150120, 0.215607, -0.787541, 1.722837, 0.167324, -0.535421, 0.388938, -1.382614, -0.172650, -0.562894, 0.249094, 0.258224, -0.438660, -0.845207, 0.875777, 0.783044, -0.391563, 0.029483, -0.291418, 0.204866, 0.673864, 0.580254, 0.495731, 1.010963, -0.346271, -0.046538, 0.067540, -0.395137, -0.387492, 0.393826, 0.172326, 0.251920, -0.889290, 0.045292, -0.161041, 0.922710, -0.320204, 0.351821, -0.392186, 0.629528, -0.211839, 0.032394, 0.861603, -0.016760, -0.558076, 0.262017, -0.085449, -0.318123, -0.498436, -0.133505, 0.664525, -0.666853, 0.140894, 0.074495, -0.730992, 0.992944, 0.263796, -0.169161, 0.421966, -0.251835, 0.246833, 0.467468, 0.229798, 0.471774, 0.010803, 0.537420, 1.005422, -0.544047, -1.095315, 0.525546, -0.378814, 0.772719, -0.635745, -0.187179, 0.751029, -1.497753, 0.605500, 0.040281, -0.410345, 0.186229, -0.747669, 0.437304, 0.144941, -0.459204, -0.198767, 0.449451, 0.858884, -0.359434, 0.437780, -0.007321, 0.043643, -0.462933, 0.042202, 0.678946, -0.236253, 0.311505, 0.022989, -0.236843, -0.317470, -0.867559, -0.468267, -0.032692, -0.619554, 0.740736, -0.394311, 0.164591, 0.771053, 0.628858, -0.159988, -0.132335, -0.270476, -0.244661, -0.045490, -1.068461, 0.361834, 0.681828, -0.072550, 0.995301, -0.476299, -0.130403, -0.443094, -1.400598, -1.715192, -0.609242, 0.392083, -1.302736, -1.254964, 0.315025, 1.056481, -0.284517, 0.145024, -0.197186, -1.191084, -0.475434, -0.337662, 0.478131, -0.134051, -0.541338, 0.065506, 0.982383, -0.017134, 0.082724, 0.754355, -0.607289, -0.561618, -0.672752, 0.071788, 0.801023, 0.425337, 0.566067, 0.911838, -0.390513, -0.408622, -0.555813, 
-0.248295, -0.697827, 0.293418, 0.759617, 0.671161, -0.225396, 0.400199, -0.615734, -0.089381, 0.535295, 0.435778, 1.210370, -0.322341, 1.353689, -0.435054, 1.075088, 0.098468, 0.002876, 0.152754, -0.287845, 1.134301, -0.570370, -0.841934, 0.699961, 0.661102, -1.207625, -0.006113, -0.292638, -1.588339, 0.058787, -0.000471, -0.147217, 1.015599, 1.263788, 0.532431, 0.427873, -0.974371, 0.469206, -2.081517, -0.388685, -0.406079, 0.724670, -0.209924, 1.067930, -0.204612, 0.721137, 0.562673, -0.707059, -1.473862, 0.808875, 0.731063, -0.227392, -0.899608, -0.376536, 0.129298, -0.727531, -0.080100, -0.603700, -0.313701, -0.383994, -0.551388, -0.945553, -0.780997, -1.005637, 1.145955, 1.714179, -0.614432, -0.243966, -0.018248, -0.135699, -0.490564, -0.513082, -0.206577, -0.301805, 0.211078, 0.084133, 0.834956, 0.017944, -0.600084, -0.209110, -0.153675, -0.477361, 0.291817, 0.594291, -0.062999, 0.252667, 0.253603, -0.102821, -0.086670, -0.257069, 0.350693, 0.477220, -0.893321, 0.729113, 0.591172, -0.885421, -0.189146, -0.826989, 0.353572, -0.433705, 0.448902, 0.380207, 0.027303, -0.379125, 0.636641, 0.343187, -0.146485, -0.199786, 0.017955, -0.172499, -0.219285, -0.017873, -0.242158, 0.135226, 0.705949, 1.523271, -0.805220, -0.542296, -2.357319, -0.279120, -0.574042, 0.293555, -0.602039, -0.058580, -1.546088, -0.100937, 0.281502, -0.406502, -0.351410, -0.459636, 1.166577, -0.482386, 0.655048, -0.003645, -0.069005, 1.035454, 0.287968, -0.083988, -0.167982, 0.045657, 0.909229, 0.163557, 0.200277, 0.661766, -0.859409, -0.271063, 0.376515, -0.768590, -1.227650, 0.251031, -0.152929, -0.052574, 0.698705, -1.078161, -0.407318, 1.014991, 0.492574, -0.408082, -0.391707, -0.306255, -0.208311, -0.109343, 0.148285, 0.494520, 0.340868, -1.124670, -0.289814, -0.446556, 0.735685, 0.302573, -0.316504, 0.691067, -0.925421, -1.132813, 0.672036, -0.337327, -0.476443, 0.034346, -0.878503, -0.416229, 0.471957, 0.982983, -0.279663, 0.109959, 0.352966, -0.322536, -0.531440, -0.073239, -0.904272, -0.925687, 0.689411, -0.287299, 0.071599, 0.040955, 0.363247, -0.329313, -0.734854, 0.052484, 0.374941, -0.262850, 0.674385, -1.883402, -0.469937, -0.481781, -1.376337, 0.911426, -0.421970, 0.758645, 0.866087, 0.054294, -2.286275, -1.039941, 0.392261, 0.600235, 0.566748, -1.022966, -0.349735, -0.143037, -0.283288, -0.109223, 0.977295, 0.156285, -0.241752, -0.327293, 0.096074, 1.355821, 0.886651, 0.834343, 0.149908, -0.404873, -0.431915, 0.959970, 0.061419, -0.285103, 0.111263, 0.202683, 0.376271, -0.116067, 0.138792, 0.445979, 0.290989, -0.242874, -0.346761, 0.078914, 0.034136, -0.870164, -0.230687, -0.679115, -0.453920, -0.498161, -0.709209, -0.146147, -0.017312, -0.411055, -0.364331, 1.790577, -0.176075, -0.723053, 0.661552, 0.051030, 0.182555, 0.162354, -0.562494, 0.585837, 0.219536, 0.723892, 0.267529, -0.493820, -0.000944, 1.073650, -0.610506, 1.969175, -0.321756, 0.165205, 0.560576, -0.445916, 0.147305, 0.081469, 0.733030, -0.639512, -0.128523, 0.821294, -0.156377, 0.661212, -0.447247, 0.747086, 1.197289, 1.120626, 0.644541, -1.037759, -0.927256, 1.108188, 0.443766, -0.037838, 1.038198, -0.777633, -0.206474, -0.710854, -0.355862, -0.180103, -0.558386, -0.997519, -0.045973, 0.604964, 0.637836, 0.303163, -0.184672, -0.063542, -0.034336, 0.758078, -0.627832, -1.903602, -0.933951, 0.025073, -0.847317, 0.818236, -0.455283, -0.528798, -0.199017, 0.047115, 0.782431, -0.855221, 0.439269, -0.285996, 0.625571, 0.096328, 0.089619, -0.090534, 0.529576, 4.501905, 1.003226, -0.528412, -1.492817, 0.117703, 1.167368, 0.004324, 0.388524, 
-0.685737, 0.061340, 0.331465, -0.331502, 1.002149, 0.419753, 0.302038, 0.609501, -0.266256, -1.288667, 0.551195, -0.368805, -0.667456, 0.202925, -1.491046, -0.230838, -0.461002, -0.123896, -0.036718, -1.897493, -0.502035, 0.408753, 0.170793, 0.273150, -0.000815, 0.124118, 0.156100, 0.594571, 0.917966, 0.389394, 0.366612, 0.521227, 0.562131, -0.225260, -0.873686, -0.046855, -0.135722, -0.503301, 0.022896, 0.998964, -0.616207, -0.396044, 0.910686, -0.176650, -0.609326, -0.449190, 0.576747, -0.521846, 0.431436, -1.468931, -1.514708, -0.154545, -0.435662, -0.673081, 0.473723, -0.590864, 0.025936, 0.215279, -0.016715, 1.134850, 0.661093, -0.513720, -0.121693, -0.326415, -0.136230, -0.568424, 1.253833, 0.408191, 0.712330, 0.746802, 0.040251, -0.059956, 0.024462, 0.911898, 0.040565, 0.012627, -0.030156, -0.520014, 0.850057, -0.270045, -0.847478, -0.157882, 0.443254, 0.806046, -0.186409, -0.422478, -1.133786, 1.079063, -0.166138, -0.571667, -0.098509, 0.364370, 0.282150, 1.241138, -0.254863, 0.275489, -1.065522, -0.333116, 0.098885, -0.880379, -0.746474, 0.167898, 0.622477, 0.259032, 1.088299, -0.092418, 0.559675, 0.407819, -0.206228, -1.064644, 1.147439, -0.502136, 0.062495, -1.109216, 0.489472, -0.802628, 0.369118, -0.378995, -0.544698, 0.353553, -0.191862, -1.536034, -0.484893, 0.444807, 0.157272, 0.381181, 0.449889, -0.075108, -0.352240, 0.955138, -0.743568, -0.823916, 0.352233, -0.961116, -1.893218, 0.018346, -0.022620, 0.857100, -0.413084, 0.308824, -0.227558, 0.461872, -0.424433, -0.114400, -0.592457, 0.122129, 0.790703, 0.366691, 0.552071, -0.423679, -0.394332, -0.144287, -0.164119, -0.047966, 0.180729, -0.003952, -0.355909, -0.133109, 0.789147, 0.631095, -0.126298, 0.862269, 0.101255, 1.096273, 0.545555, -0.639577, -2.018642, -0.439312, 0.003968, 0.327283, -0.092810, -1.042530, 1.364452, 1.179238, -0.883948, -0.371361, -0.329252, -0.025818, 0.505112, 0.683491, -1.012862, 0.493483, 0.539515, 0.500419, 0.288749, -0.114866, 0.352669, -0.456255, -0.580752, 0.585715, 0.144576, -0.035104, 1.233467, -0.195198, 1.108539, 1.116311, -0.347662, 0.016133, -0.638461, -0.484053, -0.320613, -0.085975, -0.232120, 0.161251, -0.568741, 0.960369, 0.155364, 0.735644, 0.690310, -0.882789, -0.913967, 0.081527, -0.160942, 0.896419, 0.815935, 0.775975, -0.035229, 0.086067, 0.087852, -0.830845, -0.041790, -1.243464, 1.077813, -1.032597, -0.740396, 0.250708, 0.160607, -0.222441, 0.088197, -0.714924, 0.323647, -0.979571, 0.903878, -0.037072, 0.791384, -0.732045, -0.145367, 0.280290, -0.850576, 1.411916, 0.231850, 0.276533, -1.070890, -0.301702, 0.003253, 0.272342, 0.515268, -0.294163, 0.183392, 0.166822, 0.171130, 0.058901, 0.402616, 1.227121, 0.264214, 0.447733, -1.129231, -0.026984, 1.065866, -0.700682, 0.365534, 0.382542, -0.918139, -0.707215, -0.394204, -0.508742, 0.480149, -0.082983, 0.365794, 0.358934, 0.430533, 0.000919, 0.917651, 0.456906, -1.114516, 1.789799, 0.563742, -0.845717, -0.513698, 0.248768, -0.953782, 1.674861, 0.771962, -0.836244, -0.015618, 0.296709, 0.385047, -1.406090, 0.869633, 1.178896, -0.181703, -0.002030, 1.396937, 0.106670, 1.051165, 0.232139, -0.785353, 0.440807, 0.134374, 0.422115, 0.017052, -0.285855, 0.881638, 0.943586, -0.419304, 0.852863, 0.640232, -0.067155, -0.269846, -0.091488, 0.728749, 0.800561, -0.179447, 0.737550, -0.039372, -0.298867, 2.224916, -0.833340, 0.586230, 0.680057, 0.273743, -0.536826, -0.445305, 0.109167, 0.042689, 0.324641, -0.135530, 0.299774, 0.135228, -0.322364, -0.536501, 0.250821, -0.529266, -0.036560, -0.006006, 0.202638, -0.135642, -0.427857, 0.223352, 
0.747627, 0.093975, 0.408209, -0.207240, -0.228368, 0.782170, -0.550407, -0.078093, 0.006059, 0.011183, -1.023877, -0.775297],
 	"snowflake-arctic-embed2": [-0.337318, 0.485787, -0.037816, -0.943875, -0.819299, -0.257385, -0.115470, 0.246724, 0.048614, 0.159151, -0.467606, -0.364392, 0.089869, -0.209655, 0.342226, -0.527060, 0.520997, 0.927532, -0.102562, 0.333813, -0.854380, -0.701242, -1.463815, 0.799778, 0.750539, 0.757705, -0.125063, 0.527705, -0.437741, 0.078491, 0.460214, 0.255947, -0.031090, -0.345135, 0.058851, -0.327729, -0.372813, 0.352275, -1.168406, 0.354936, 0.625492, 0.045635, -0.242759, 0.650628, 0.195748, -0.495107, -0.539670, -0.986722, -1.069306, -0.014932, -0.385889, 0.215507, 0.333816, -0.158572, 0.246042, -0.687132, 0.207916, -0.342494, -0.347905, -0.563665, 0.336679, -0.059624, -0.155887, -0.246520, 0.296986, 0.569967, 0.131530, -0.355191, -0.582369, 0.490316, -0.415379, -0.019140, -0.214617, 1.085840, 0.019224, -1.180745, -0.544194, -0.182204, -0.471391, 0.877849, 1.787677, 0.196131, 0.338737, 0.554189, 0.723178, -0.052438, -0.270815, 0.443365, 0.101404, -0.692780, 0.004322, -0.050623, -0.693687, -0.116200, 0.434660, 0.065080, -0.055940, 0.122773, -0.999912, -0.499409, -0.359269, 0.027620, -0.399372, -0.299647, -0.744792, 0.102263, 1.084825, 0.028898, 0.323312, 0.014242, 1.325412, 0.983624, -0.325036, 0.526028, -0.157539, -0.063860, 0.436522, 0.116374, 0.118433, 0.614439, -0.139657, 0.522618, 0.017510, -0.188138, -0.677374, -0.840603, 0.192689, -0.135996, -0.894670, -0.158343, -0.792459, -0.136472, -0.355442, -0.123314, -0.910940, 0.186382, 0.334950, 0.204000, 0.222174, 0.263186, 0.094970, 0.061765, 0.345430, -0.235054, 0.172441, 0.881599, 0.841009, -0.169058, -0.269911, -0.217716, 0.359628, 0.208658, 0.652820, 0.442545, -0.161419, 0.418893, 0.292317, 0.231373, -0.805518, -0.000739, 0.297150, 0.121066, -0.408190, 0.273577, 0.463801, 0.064206, 0.312172, 1.092058, -0.371314, -0.277224, -0.683628, -0.435973, 0.045403, -0.140166, 0.184559, 1.853358, 0.416705, -0.374452, 0.760777, 0.248660, -0.569295, -0.954281, -0.347827, 0.531861, -0.570648, 0.556323, 0.206901, 0.252571, -0.043244, -0.062258, 1.049511, -0.402070, -0.068134, -0.149358, -0.012464, 0.620048, 0.654902, -0.538302, -0.245287, -0.066978, -1.405453, -0.445957, 0.331479, -0.495953, 0.923955, -0.328841, -0.644721, -0.372834, 0.357546, 0.478619, -0.081360, 0.340657, -0.122412, -0.597997, 0.235506, 0.016301, -0.058082, 0.446411, -0.802173, 0.115207, -0.464422, -0.257083, -0.133011, -0.359320, 0.389579, 0.485856, -0.053931, 1.149238, -0.967310, 0.020607, -0.235731, -0.358982, -0.698047, 0.653281, 0.734305, -0.836348, 0.074222, -0.177832, -0.486657, -0.344304, -0.443823, -0.255469, -0.606071, 0.069794, 0.069820, 0.494822, -0.536611, -0.175762, -0.448531, -0.522376, -0.108621, -0.271191, -0.141843, 0.071029, 0.171164, -0.195819, -0.059490, 0.026950, -0.433273, 0.016244, 0.146567, -0.032891, -0.039686, 0.323199, -0.057771, -0.176835, 0.470351, 0.048500, 0.327727, -0.158381, 0.162835, -0.407448, -0.555830, -0.465591, 0.264512, 0.354612, 0.218764, 0.031698, -0.265124, 0.312480, 0.181667, -0.338958, 0.186351, 0.053644, 0.812065, -0.862652, -0.026800, 0.572852, 0.005986, 0.828237, 0.090118, 0.063922, 0.076976, 0.096964, 0.180304, 0.781934, -0.003830, -0.027061, 0.221362, 0.449681, 0.125572, -0.095162, 0.018868, -0.360262, -0.373733, -0.392008, -0.125284, -0.212061, 0.159567, -0.233902, -0.235149, -0.190911, -0.028427, 0.344431, -0.155667, 0.722263, -0.144527, -0.199895, 0.188895, 0.894280, 0.140612, 0.698334, -0.078967, -0.845755, 0.500688, -0.028362, -0.309373, -0.050033, 0.393043, -0.684940, -0.012917, 0.442933, -0.152553, -0.068629, -0.237759, 
-0.239215, 0.132807, 0.019395, 0.123185, 0.242981, 0.786300, 0.018274, 0.157075, 0.240215, 0.229825, -0.137675, -0.203565, -0.245311, -0.036812, 0.430710, -0.207664, -0.132277, 0.557027, 0.452612, -0.331802, 0.004795, 0.139062, 0.078491, -0.501776, 0.156317, -0.092398, 0.078616, -0.144665, -0.419595, 0.396099, 0.319320, -0.084284, -0.013825, 0.811091, -0.228181, -0.249798, 0.043037, -0.014254, 0.145196, -0.379182, -0.241216, 0.270687, 0.331287, 0.078576, 0.225569, 0.075139, 0.206449, -0.216213, -0.179613, -0.196487, -0.121997, 0.634396, 0.243545, -0.646855, -0.196892, 0.164843, -0.165656, -0.017864, -0.220435, -0.315971, -0.428766, -0.276434, 0.298087, 0.034363, 0.339730, -0.001861, -0.061919, -0.482472, -0.097411, -0.183378, -0.040443, -0.111079, 0.394592, 0.943151, -0.304478, 0.354390, 0.196057, 0.199277, 0.341486, 0.218786, 0.193412, 0.226260, -0.177706, -0.272467, 0.395993, -0.259079, 0.001724, 0.371750, 0.350838, 0.290101, -0.419872, -0.302239, 0.187943, -0.047100, 0.501532, 0.395721, 0.057455, 0.260134, -0.393160, 0.164219, 0.066535, 0.172231, -0.359559, -0.161729, 0.682735, -0.679863, 0.053116, -0.210306, 0.089449, 0.457067, -0.076446, 0.443101, 0.434519, 0.493740, -0.721550, -0.047476, -0.149920, 0.792890, -0.869984, 0.416676, -0.278901, -0.456933, 0.201800, 0.250265, 0.093752, 0.216085, -0.122870, 0.141153, 0.164069, -0.099821, -0.121633, 0.180234, -0.016088, -0.070337, -0.163921, -0.103767, 0.440052, 0.191798, 0.114916, 0.325931, -0.172159, 0.250953, 0.115396, -0.131392, 0.363941, -0.167835, -0.198244, -0.989684, -0.186654, 0.199121, 0.593739, -0.318832, -0.185066, -0.236732, -0.230723, 0.018697, -0.223611, -0.002621, 0.185624, -0.180204, 0.115503, 0.430932, -0.117918, 0.103355, 0.195856, -0.223646, 0.132063, 0.571766, -0.608208, -0.051812, 0.142387, -0.170185, -0.515449, 0.352781, 0.486267, -0.422757, 0.272677, -0.105689, 0.340707, -0.156664, -0.782644, 0.512138, -0.341311, -0.487717, 0.194345, -0.057030, -0.015855, 0.099853, 0.549729, -0.415887, 0.604569, 0.066785, -0.448733, -0.011270, -0.616035, -0.562425, -0.334210, -0.393114, -0.628784, -0.305269, 0.209872, -0.199347, 0.101649, 0.090523, 0.282902, -0.088015, -0.191279, -0.044561, -0.709134, 0.072914, -0.249584, 0.037448, 0.165476, 0.059152, 0.055725, -0.518436, -0.005831, -0.164648, -0.281878, 0.347298, 0.177980, -0.114527, 0.210128, 0.120374, -0.146421, 0.075994, -0.181335, 0.150211, -0.225272, -0.489089, -0.078891, -0.178676, -0.740558, 0.205851, 0.392087, -0.328261, -0.068016, -0.021789, -0.280372, 0.704844, -0.058202, 0.168101, 0.180238, 0.096060, -0.275457, 0.027325, 0.425901, -0.313618, 0.154550, 0.204825, -0.104279, 0.245843, -0.489933, -0.046835, -0.247613, 0.823351, 0.004220, 0.017303, -0.158378, 0.154119, -0.197591, -0.127734, 0.159808, -0.600171, -0.346363, 0.469721, -0.058461, -0.315804, -0.083556, 0.267933, -0.717538, -0.110205, -0.563653, 0.005439, 0.389236, 0.552098, 0.436608, -0.472080, 0.223911, -0.471215, -0.560872, -0.021037, 0.275148, 0.461694, -0.325049, 0.598732, 0.376293, -0.225930, -0.151626, 0.146455, 0.396804, 0.021290, 0.037224, 0.235271, 0.329889, 0.672245, -0.496795, -0.378117, -0.350688, 0.435732, 0.370599, 0.008810, 0.555823, 0.623420, 0.260685, -0.383603, -0.185294, 0.175743, 0.406610, -0.249284, 0.318281, 0.203903, 0.182324, -0.028281, -0.134342, 0.156111, -0.666054, -0.169002, 0.259389, -0.127781, -0.134607, 0.133519, -0.287695, -0.392834, 0.252281, -0.458701, 0.297617, 0.066121, 0.535986, -1.198022, -0.872793, -0.535140, 0.635081, -0.181788, 0.259800, 0.160934, 0.403854, -0.016975, 
0.122155, 0.106455, 0.017354, 0.064465, -0.004753, 0.183455, 0.125073, 0.000588, -1.079189, -0.091745, 0.131509, -0.038783, 0.086098, -0.011477, 0.033550, -0.027044, -0.398735, -0.133224, -0.045345, -0.183940, 0.100738, 0.766663, 0.008661, -0.061123, 0.052512, 0.097162, 0.122948, -0.363722, -0.118078, -0.802726, -0.130973, -0.369868, 0.688861, 0.363402, -0.023863, 0.067200, -0.240462, 0.499130, -0.021514, -0.149011, -0.011722, -0.237259, 0.152696, 0.124860, 0.081450, 0.090567, 0.048832, 0.615275, 0.147335, -0.101912, -0.132456, 0.131634, -0.168211, 0.355089, 0.199154, -0.000686, -0.334698, 0.464978, 0.060418, 0.398211, 0.122107, 0.336332, -0.415999, 0.140270, 0.113768, -0.197597, -0.220913, -0.169208, 0.155395, 0.350888, -0.163269, -0.365437, 0.111591, 0.043267, 0.600786, -0.172549, -0.028790, 0.133079, 0.111489, -0.018018, 0.260471, -0.890617, 0.236967, 0.416443, 0.903602, -0.082193, -0.280290, 0.138442, 0.411884, -0.454041, 0.491140, -0.444857, -0.186720, -0.382473, -0.126291, 0.495247, -0.631967, -0.266918, -0.220935, 0.367287, 0.502838, 0.155025, -0.429546, -0.408211, 0.234250, -0.462584, -0.046278, -0.231486, 0.209515, 0.246387, -0.061538, 0.270009, -0.012469, -0.420804, 0.087525, -0.513991, 0.020571, 0.507510, -0.444389, -0.022836, -0.590260, 0.167235, -0.201333, 0.189617, 0.279683, -0.402719, 0.145037, 0.929912, 0.430638, -0.179808, 0.080103, 0.600420, -0.489557, 0.381116, -0.722508, -0.164676, -0.037822, -0.305011, -0.376997, 0.013216, -0.315066, 0.022070, 0.528256, 0.300673, 0.108121, 0.488978, -0.100333, -0.130812, 0.217841, -0.220755, -0.671549, -0.076320, 0.525022, 0.184758, -0.214599, 0.194860, 0.236146, -0.240089, -0.474762, -0.037878, 0.149301, -0.063512, 0.294585, 0.747633, -0.437204, 0.083148, 0.410454, 0.142592, -0.260462, 0.127561, -0.031248, 0.321641, 0.304835, -0.315456, 0.321474, -0.200811, -0.007041, -0.019529, 0.332829, 0.095737, 0.888721, -0.068599, 0.112251, 0.200350, 0.349384, 0.130674, -0.199802, 0.104813, -0.402484, 0.338873, 0.018662, -0.304823, 0.138016, 0.002506, -0.095239, -0.271009, -0.849811, -0.423410, -0.232685, -0.589317, 0.450318, -0.305014, 0.563061, -0.142598, 0.286005, 0.081525, 0.097474, 0.012287, 0.317698, -0.170248, -0.958868, 0.213176, 0.301248, 0.396288, -0.022001, 0.404562, -0.049691, -0.227430, -0.230833, 0.232825, 0.310583, 0.357731, 0.113404, 0.015757, 0.094021, 0.318617, 0.595829, -0.039896, 0.615338, -0.176179, -0.043411, 0.534391, -0.335011, 0.427954, -0.310139, -0.024028, -0.739826, -0.112875, -0.258219, 0.677319, -0.274854, -0.202554, -0.027695, 0.908598, -0.016939, 0.387993, 0.037429, -0.101158, 0.166008, 0.416612, 0.189825, -0.642134, -0.106222, 0.141566, -0.026880, 0.021668, 0.221566, 0.267000, 0.196498, -0.181309, -0.062393, 0.203500, 0.037145, -0.128068, -0.645994, 0.417619, 0.601422, 0.012565, 0.457200, -0.532447, 0.277037, -0.485728, -0.274002, 0.261037, -0.255880, -0.009387, 0.491182, 0.383511, 0.125899, -0.204434, 0.205015, 0.109285, -0.415707, 0.095736, 0.147818, 0.122518, 0.038847, 0.232760, 0.166897, 0.331865, -0.357069, 0.314145, -0.216854, -0.337515, 0.259433, 0.320100, -0.172233, -0.315187, 0.197327, 0.046211, -0.521370, 0.391666, 0.248245, -0.153588, -0.275701, -0.000683, -0.205512, 0.000457, -0.134299, 0.452796, -0.099954, 0.194279, -0.210376, -0.530722, -0.265526, -0.408304, 0.263296, 0.311573, 0.364050, 0.212423, 0.355866, -0.102873, -0.300132, -1.024923, 0.019980, 0.381418, 0.513570, -0.051673, 0.091931, 0.043775, 0.022401, 0.230052, 0.140274, -0.147261, 0.173270, 0.150905, -0.167662, 0.099411, -0.022456, 
-0.727629, -0.310803, -0.555541, -0.286311, -0.483686, -0.054392, 0.234199, -0.675458, -0.605178, -0.033194, 0.591152, -0.440875],
-	"embeddinggemma": [-0.180607, -0.005889, 0.056060, 0.003927, -0.000914, 0.039077, -0.014656, 0.043961, 0.019580, -0.035296, -0.007450, -0.005179, 0.019225, -0.019774, 0.091014, 0.019635, 0.013230, -0.058614, -0.087619, -0.020779, 0.013899, 0.020425, 0.020929, -0.013275, 0.006773, 0.035736, -0.000525, 0.041411, -0.040545, -0.020314, 0.050535, 0.011460, 0.006180, 0.004620, 0.009491, 0.063213, 0.026650, -0.078453, 0.014068, -0.007105, -0.067159, 0.070726, -0.005620, -0.001726, 0.039832, -0.008431, -0.042386, -0.074533, 0.017800, 0.028184, -0.036976, 0.010219, -0.076775, -0.004639, -0.032891, -0.027700, -0.001458, 0.025879, -0.049648, 0.054045, -0.085062, -0.058225, -0.033515, 0.008202, 0.031972, -0.016915, -0.008515, 0.009412, 0.030835, 0.290311, -0.003487, -0.034788, -0.038132, -0.043061, 0.256677, -0.011666, 0.014681, -0.028823, -0.018269, 0.039487, 0.031672, 0.019415, 0.005047, -0.028207, 0.068291, -0.014481, -0.038045, -0.002510, 0.018587, -0.037189, 0.005428, -0.023473, -0.014835, -0.017678, 0.018800, -0.063929, 0.020199, -0.006755, -0.017839, 0.010022, 0.013381, -0.012818, 0.049738, 0.103187, 0.042391, -0.026812, -0.025366, -0.020568, -0.027174, 0.016077, -0.039851, 0.020666, -0.000882, -0.005518, -0.019171, 0.019656, -0.034325, -0.011952, -0.007715, 0.016382, 0.023478, 0.008794, 0.008553, -0.029300, -0.008735, 0.039593, -0.000716, 0.020169, -0.052826, -0.029727, -0.012380, 0.010375, 0.018598, 0.022578, -0.026641, 0.029909, -0.029175, 0.023739, 0.107380, 0.020876, -0.023140, -0.044402, -0.022905, 0.007913, 0.022877, 0.023412, -0.039243, -0.043118, 0.028891, -0.026474, 0.058203, -0.009469, 0.043529, -0.001148, -0.039958, 0.014740, -0.004989, 0.036507, -0.051620, -0.031175, -0.044597, -0.012939, 0.042538, 0.040928, -0.017088, 0.067081, 0.034191, 0.039442, -0.032367, -0.025219, 0.016267, -0.075883, 0.012993, -0.009841, -0.031461, -0.003222, 0.025393, -0.014700, 0.137779, 0.059643, -0.022662, 0.071586, -0.021947, -0.016830, -0.051643, 0.079643, 0.031313, 0.028581, -0.046328, -0.030986, 0.032716, -0.026602, 0.006470, -0.039692, 0.014163, 0.026654, 0.093365, 0.032694, 0.015409, 0.056742, 0.010313, -0.042171, -0.028517, -0.032063, -0.032345, -0.007562, -0.016120, 0.027168, -0.013794, 0.007886, 0.012508, 0.041319, -0.045403, -0.017043, 0.020755, 0.023180, -0.032282, -0.001933, 0.004788, -0.008845, -0.008160, 0.060156, -0.013906, -0.023820, -0.079445, -0.008594, -0.053413, -0.022282, 0.047015, -0.032093, -0.024175, 0.047034, 0.008884, 0.004303, -0.028246, -0.051090, -0.052362, 0.023090, 0.039360, 0.022026, -0.030191, -0.013131, 0.070108, 0.026949, -0.021665, 0.004105, 0.020551, -0.040327, 0.019240, -0.016254, 0.044880, 0.002207, 0.014687, -0.025028, 0.019416, 0.024616, 0.041669, 0.006119, 0.037250, -0.028410, 0.009880, -0.040271, 0.002080, -0.025919, -0.004405, -0.000738, 0.056129, -0.012073, -0.076000, -0.029060, 0.023691, -0.032691, -0.007954, -0.007977, 0.004320, 0.034496, -0.004204, 0.010539, -0.016130, -0.017084, 0.008819, -0.045655, 0.039586, 0.072274, -0.001564, -0.018986, 0.008003, 0.014540, -0.020202, -0.047112, 0.031214, 0.016062, 0.018050, 0.010549, -0.011854, -0.018039, -0.069473, -0.009963, -0.051167, 0.009143, -0.153973, -0.004123, -0.005579, -0.040051, 0.035759, 0.006347, 0.101297, 0.023210, -0.027991, 0.018936, -0.015880, -0.005618, -0.015289, 0.004609, 0.021374, 0.039930, 0.006416, -0.023404, 0.011298, 0.049358, 0.030248, -0.003199, 0.037742, -0.005549, 0.002536, -0.009225, 0.075556, 0.025399, -0.105251, 0.051742, 0.024327, -0.009478, 0.085254, 0.022996, -0.011836, 0.017050, 
-0.033193, -0.024902, -0.010120, -0.008034, -0.030815, -0.036057, 0.012252, -0.011298, 0.039328, 0.059369, -0.006778, 0.022502, -0.035587, -0.017115, -0.009679, -0.038681, 0.008259, -0.032500, -0.032367, -0.006804, 0.017457, 0.009156, 0.081487, 0.022148, 0.067775, -0.093894, 0.031846, -0.020408, 0.030209, -0.025761, -0.002701, -0.010321, 0.027960, 0.033251, -0.036872, -0.050187, 0.000647, -0.006226, -0.023869, 0.016206, 0.037978, 0.010212, 0.015309, 0.010771, -0.021051, -0.025120, -0.015512, 0.014893, 0.025609, 0.020798, 0.017523, -0.003228, -0.039813, -0.000640, -0.072478, 0.023971, 0.017670, -0.000263, 0.027826, -0.016781, -0.062787, 0.033756, -0.054924, -0.016018, -0.014411, -0.023932, -0.019506, 0.053342, 0.010944, -0.042293, 0.008613, -0.034974, -0.021446, -0.009096, 0.009447, -0.047535, -0.026310, 0.052943, -0.018407, -0.067084, -0.015153, 0.022066, -0.094386, -0.020756, -0.018389, -0.007097, 0.023673, 0.029475, 0.016351, 0.056441, 0.041489, 0.022455, 0.045231, 0.016921, 0.038108, -0.050633, -0.011182, 0.043934, 0.004127, -0.002120, -0.043600, 0.029996, 0.030896, 0.053286, -0.040573, -0.056230, 0.005976, 0.023566, 0.002953, 0.024094, -0.013409, -0.004520, -0.008556, -0.013104, 0.021295, 0.055402, 0.010722, -0.001335, -0.017037, -0.005384, -0.001479, 0.011284, -0.036778, -0.025387, 0.011728, 0.031009, 0.013086, -0.033330, 0.001464, -0.042345, -0.004958, -0.047134, -0.046586, -0.009092, -0.035167, 0.028894, 0.016499, 0.027641, -0.018313, -0.030339, -0.007525, -0.002521, 0.016437, 0.038167, -0.039028, 0.018501, 0.011082, -0.038511, 0.052748, -0.026239, 0.013372, 0.048941, 0.024461, 0.028460, -0.032540, 0.030801, -0.004988, 0.019265, -0.000821, -0.003360, -0.008584, -0.010812, -0.038912, 0.021251, -0.007917, 0.009466, 0.044254, 0.025877, -0.041038, 0.021859, -0.028726, -0.025102, -0.027201, 0.021041, -0.024082, -0.019939, -0.028786, 0.035638, 0.012499, -0.056768, 0.017606, 0.013645, 0.063322, 0.019431, -0.012916, -0.014921, 0.021280, 0.038604, -0.051948, 0.014327, 0.000644, -0.037636, 0.044974, -0.034804, -0.018713, -0.015628, 0.004696, -0.059636, -0.028115, -0.026099, 0.039006, -0.008930, 0.003926, -0.002180, 0.026208, -0.013091, -0.008003, -0.021853, 0.025176, 0.002682, -0.005717, 0.015906, -0.016278, 0.059330, 0.037651, -0.019816, -0.002063, 0.078280, 0.038225, -0.007886, -0.019419, -0.013467, 0.036248, 0.022667, -0.008686, -0.018650, -0.003174, 0.005356, 0.015273, -0.042080, 0.017633, -0.010409, -0.034426, 0.015203, -0.015169, 0.030749, 0.026981, 0.009063, 0.007627, 0.091058, 0.009252, -0.058266, 0.020832, 0.024774, -0.028801, 0.026656, 0.022873, 0.016810, 0.016199, 0.007166, 0.024154, -0.028103, -0.006009, 0.022183, 0.005554, -0.028878, -0.010401, 0.013940, -0.019788, 0.003170, 0.049577, -0.023240, 0.052571, -0.011319, -0.015615, -0.054576, 0.005388, -0.000214, 0.044267, 0.015146, 0.001074, 0.064579, 0.004720, -0.008599, -0.013695, 0.012638, -0.040416, -0.000573, 0.011788, -0.003367, 0.008859, -0.018324, 0.011476, 0.018124, 0.010761, 0.046478, 0.010131, -0.050664, 0.021277, -0.018322, -0.003070, -0.019420, 0.029148, 0.012572, -0.004788, 0.040219, 0.039951, 0.043636, -0.005553, -0.006092, 0.066745, 0.027182, -0.029501, 0.014834, -0.021343, 0.023051, -0.000411, 0.026976, 0.030971, -0.004881, 0.003360, -0.006648, -0.008272, 0.041518, -0.027338, 0.001205, -0.006581, -0.024365, -0.033114, -0.024966, 0.010534, -0.012564, 0.045804, -0.004190, 0.057720, -0.022263, -0.003263, 0.040221, -0.028405, -0.004599, -0.023340, 0.005303, -0.001754, -0.057940, 0.006630, -0.015906, -0.024751, -0.005112, 
-0.024829, 0.034132, 0.027506, -0.011464, -0.000899, 0.065783, 0.021920, -0.007581, -0.001119, 0.025989, -0.010824, -0.017624, 0.003288, -0.036588, -0.003869, -0.009002, 0.033091, -0.091586, -0.008219, -0.033366, -0.006626, 0.005773, -0.006797, 0.028244, 0.020040, -0.012321, 0.039671, 0.017080, 0.055742, -0.003618, 0.025329, -0.000387, -0.003931, -0.011762, 0.029402, 0.019193, -0.035431, -0.012032, 0.005728, 0.017904, 0.013591, 0.010789, -0.027405, -0.014921, -0.042695, -0.036618, -0.013241, -0.041739, 0.027101, 0.030270, -0.025742, 0.034299, 0.014907, -0.028621, -0.017876, -0.010664, 0.016825, 0.028010, -0.032065, -0.031433, -0.011079, -0.015334, 0.020823, -0.020160, 0.015865, -0.003164, 0.008807, -0.026496, 0.028156, -0.040351, 0.024934, -0.084462, -0.008195, -0.022980, -0.019642, -0.000236, 0.005202, -0.016276, 0.005408, 0.056486, -0.031008, -0.032354, -0.018442, -0.006234, 0.005748, -0.000144, 0.004343, 0.045699, 0.014304, -0.011867, 0.004920, -0.057248, -0.006343, -0.005401],
+	"embeddinggemma": [-0.12450384,-0.008483137,0.07272157,0.03808375,0.0027417676,0.047930364,-0.005505235,0.00081325934,0.0021524606,-0.013728436,-0.010998805,0.0028627072,0.018985428,-0.012904515,0.046575595,-0.026254663,0.02541724,-0.09092873,-0.045962866,-0.063897476,-0.042216204,0.03813572,0.012773487,0.023928061,-0.005071306,0.028074779,-0.031752985,0.04053142,-0.06981053,0.016790077,0.04597915,-0.017675584,0.010519284,-0.014901164,-0.013032433,0.057271868,0.000064583524,-0.06714962,-0.035376225,-0.034597002,-0.029926235,0.06264552,-0.046644744,0.031053713,0.07082022,0.011814381,-0.040872667,-0.07961028,0.033026423,0.041717317,-0.03385331,-0.021152731,-0.08368527,0.043530088,-0.0050367205,-0.049875673,0.0071340417,0.03148767,0.006765968,0.09456071,-0.01868512,-0.044528134,-0.01701375,-0.04374692,0.04971617,0.000075865435,-0.020694783,0.0053787553,0.04033573,0.1439055,0.007873791,-0.0031398397,-0.045466643,-0.0024239568,0.15588446,-0.055446226,0.0043428917,0.00824842,-0.0074107186,0.018635016,0.03404217,0.009752813,-0.018802145,-0.012835157,0.08597584,-0.0049047936,-0.059129775,0.031021371,0.008241527,-0.059602488,0.037400663,-0.012791538,-0.02854689,-0.0477038,0.021985019,-0.012982959,0.032623935,0.016155735,-0.0548312,0.030141141,0.029879617,0.008901506,0.027251069,0.14231315,0.07549342,-0.024485651,0.0017742233,-0.03784917,0.010309572,0.035162512,-0.034220062,-0.0012057552,-0.0076939925,0.058243837,-0.022414884,0.040280543,-0.056814574,-0.036289215,-0.0012519328,0.014788234,-0.016093446,0.017444612,0.017992603,-0.047886696,-0.03070063,0.044969123,-0.020998936,0.032632202,-0.0043057296,-0.019854616,-0.022677094,-0.0030410273,0.028993422,0.02324583,-0.021304283,0.013425734,-0.026279647,0.056474093,0.05018277,0.010035705,-0.04195509,-0.045884486,-0.025506042,0.018347306,0.013686503,-0.023972843,-0.000119777214,-0.013079254,0.011026595,-0.01611124,0.05791198,-0.037585907,0.019498514,-0.005429899,-0.015343584,0.0027703664,-0.0013130747,0.029645067,-0.019444136,-0.034170702,-0.019605882,-0.054102674,0.048030864,0.046180286,-0.021762656,0.032757204,0.03607179,0.017384786,-0.027185217,-0.015453646,0.06411382,-0.04498513,0.02567064,0.023357876,-0.009279791,0.019822424,0.005856121,-0.03165952,0.049315553,0.033121206,-0.024565876,0.03354138,-0.0035943359,-0.06347322,-0.07019716,0.0515494,0.058004808,0.02715917,-0.07037138,-0.030366585,0.011517769,-0.08340346,-0.048579797,-0.0520211,-0.006948013,-0.03626045,0.07794296,0.003851089,0.034298096,0.030997105,0.027654605,-0.02270389,0.0187371,0.008358891,-0.007013505,-0.00031988585,0.033254288,0.02536488,-0.026883578,-0.028695108,0.013178631,0.06327781,-0.019510822,0.011175227,0.011245305,0.0022770206,-0.030883973,-0.017602483,-0.017449653,0.01061622,0.013697404,0.07494269,0.08221704,-0.007523185,-0.0541116,-0.06832936,-0.016762197,0.011599077,-0.00087946595,-0.021330856,-0.02668906,0.018692536,0.034061063,-0.035641596,-0.019047502,-0.0015957385,-0.051490173,0.022566674,0.03559061,0.0083686495,-0.07564111,-0.06432122,0.11565676,-0.0059672887,-0.02640817,0.018183239,0.020090617,0.017564762,0.0027294422,-0.050321367,0.09422991,-0.03446513,-0.015702741,-0.044097994,0.024012651,0.0014804677,0.010055886,0.02501882,0.05177506,-0.013784503,-0.026747849,-0.05052224,0.04112142,0.002696904,-0.0016453417,-0.050891995,0.046554837,-0.014886062,-0.027647587,-0.043601822,-0.02448816,0.012447993,0.0005095703,-0.021845091,-0.0045107277,0.03867541,0.04439301,0.030712662,-0.0075296448,-0.03053027,0.041036762,-0.046724595,0.048076984,0.046488848,-0.04342807,-0.002481413,
-0.017864093,0.01817821,-0.005812813,-0.04505837,0.029866738,0.0066726934,-0.015718492,-0.015967507,0.0072315764,-0.044522196,0.034790166,-0.053740107,-0.024348693,-0.020744357,-0.13902192,0.005872423,-0.018595096,-0.035172023,0.019584833,-0.012963526,0.12255026,0.01783379,-0.076245755,0.00765967,-0.017991453,-0.019969562,-0.011706446,0.01684343,0.048394307,0.02363477,0.026784802,-0.0076567484,-0.010870576,0.06712688,0.0021803034,0.015060966,0.02293363,-0.014937443,0.00048647242,-0.03953886,0.061610103,0.03887944,-0.07725543,0.05459803,0.004954948,0.0064423676,0.048860658,0.01950372,0.033972315,-0.026307566,-0.015287088,0.0007196691,0.005659369,-0.0190877,-0.0025464743,-0.025070421,0.0052730707,-0.05032735,0.07065558,0.02621144,-0.009545725,-0.0026962324,-0.039304726,0.0017871617,0.0011180675,-0.08165134,-0.008923482,-0.042995863,0.053747553,-0.0028345839,0.04023364,-0.0060108756,0.065875456,0.017588433,0.023908684,-0.056050505,-0.013340056,-0.014317183,-0.010871281,-0.0072426754,0.009651993,0.037325352,0.023199487,0.036096834,-0.053489983,-0.05646688,-0.037928447,-0.01543437,-0.026246987,0.030358704,0.03521179,0.0020622846,0.0070019485,-0.0137162255,0.007876341,0.02483198,-0.030286592,-0.008539845,0.008532777,-0.0027765909,0.010427864,-0.014993019,-0.04049472,0.009484545,-0.0027442318,0.024136418,-0.051806044,0.035224486,-0.0058375313,-0.0037312477,-0.07460195,0.03339953,-0.05493711,-0.019096613,-0.051744744,-0.035870362,0.06843904,0.044834454,0.024328979,-0.006188006,0.040921327,-0.007126837,-0.01975571,0.025764871,0.0062305406,-0.04873346,0.0057622297,0.034758713,-0.027414966,-0.06783131,0.033828363,-0.02032774,-0.03303916,-0.015037942,-0.035833962,-0.020477079,0.017828338,0.012760347,0.015180421,0.056738965,0.04823815,0.023448944,0.050740086,0.054457445,0.06578877,-0.089593075,-0.024821788,0.036650658,-0.018632118,0.039117564,-0.047145713,0.02673321,0.09030577,0.041076384,-0.02285268,-0.040775813,-0.010094096,0.05287685,0.007306056,0.024563767,-0.011390829,-0.023497293,-0.013359603,-0.019801034,0.006772922,0.012355153,0.033961628,-0.024891187,-0.025527367,0.01085947,0.032637384,-0.028760977,-0.030053277,0.032991678,0.012635883,0.011710381,0.018566394,-0.02595292,0.006116506,-0.042825878,-0.0023800444,-0.04599584,0.011410713,0.03419128,-0.0017161333,0.03313885,0.03911471,-0.0018313229,-0.0273656,-0.016138256,0.024875658,0.027512563,-0.0039485293,0.016017763,-0.01840992,0.041588783,0.03671507,-0.011474432,0.035999324,-0.016315656,0.036528774,0.05013307,0.04739169,0.033244733,-0.050814494,0.0069757733,-0.020270811,0.027146911,-0.025266316,0.023859242,-0.0033093232,-0.058595367,-0.051715113,0.02741703,-0.030107668,-0.0000055609476,0.051227853,0.0029633176,-0.038901083,0.02169905,0.0152469035,0.0040733963,-0.068115056,0.013191014,-0.020702105,0.0015946822,-0.06885718,-0.015559467,0.0069767195,-0.031457193,0.06360454,0.032533906,0.07155586,0.0197944,-0.03866,0.010868674,0.04026781,-0.03669285,-0.026815189,0.0050425553,0.024366446,-0.042210426,0.016586194,-0.013075844,-0.0124990605,-0.03176111,0.04670986,-0.024721785,-0.038521677,0.001976238,0.0080452245,0.0018042477,0.01074977,0.0023429932,0.03982667,-0.021001004,-0.0037965016,-0.033933796,0.024376022,-0.0023602655,0.0063689444,0.018567858,-0.020147579,0.05128298,0.008444095,-0.047727082,0.029254336,0.0026799766,0.027745195,-0.06559633,-0.007476299,-0.015201028,0.009263398,0.028750904,0.052716527,-0.02467983,0.026396379,0.04058189,0.014792376,-0.049767688,-0.021554474,0.001729357,-0.035790008,-0.0004697369,-0.0277111,0.045796197,0.050971564,
0.012513236,-0.020428395,0.10738898,-0.027244125,-0.008568922,0.06473041,0.0061174273,-0.0031493483,0.026139826,-0.022896044,0.015093715,0.05003743,0.02521619,-0.004138356,-0.021165386,-0.030641362,-0.0068215155,-0.050097536,-0.017210392,-0.015567674,0.040942892,-0.023066176,0.019531872,0.0264072,-0.013721839,-0.012999497,-0.028786486,0.0037803808,-0.04282046,0.008984594,-0.021370431,-0.014187944,0.0028228424,-0.033489857,0.028858682,-0.050260708,-0.048791647,0.056657813,0.026932131,-0.033477765,0.00927866,0.042335033,-0.057607625,0.0007818642,0.009859045,0.001588016,0.019881304,0.017137071,0.014625744,-0.008581171,-0.053978402,0.03914142,0.00044243276,-0.043228973,-0.0059901136,0.03208422,-0.03624427,0.008940004,-0.0019451237,0.006697995,0.0407854,-0.021887638,-0.02917045,0.025299028,0.03870893,-0.01941995,0.02656992,-0.040555358,0.02535364,-0.024873165,-0.012045761,-0.008307672,0.00042132268,0.00072480034,-0.015075901,0.039166786,0.065113276,-0.0046298304,0.004706694,-0.0067066127,0.0071328063,-0.037851557,-0.067036,0.016846696,-0.0628137,0.016423473,0.014328391,0.029858196,-0.015595487,0.012271108,0.104557395,-0.048545606,-0.012200588,0.017067945,-0.004852984,0.041462958,-0.0076732575,0.018091857,0.008693791,-0.023836294,-0.0076264897,0.013579509,0.057706412,0.012311129,0.011789438,0.037512988,0.0908352,0.012187852,0.03811732,-0.026136555,0.021751726,0.0199363,0.013310374,0.0044485684,-0.04435746,0.01144245,0.011252692,-0.014749893,-0.051932275,-0.027752198,-0.061082777,-0.04817749,-0.01734975,0.02614761,0.03502397,-0.00039512312,-0.044737943,0.030718843,0.018191503,0.045701083,-0.009324803,0.034122936,-0.027622027,0.024509057,-0.030870138,0.05555119,0.02358782,0.005562078,-0.02451215,-0.008925075,0.017327314,-0.0035636418,0.024336532,-0.019595658,-0.052014407,-0.0071620513,-0.020169249,-0.026411986,-0.03361832,-0.021364491,0.051738616,-0.07891967,0.025005521,-0.0083286315,-0.05174264,-0.02304524,-0.0123421205,0.0038670362,0.031034727,-0.00086876616,-0.044575743,-0.01584494,-0.010853503,0.055754658,-0.012482316,0.025672225,0.01881917,-0.0005929424,0.0013270101,0.046660583,-0.011521013,0.016179016,-0.058407955,0.0022543853,-0.0385371,-0.021859204,-0.009613782,-0.01426481,-0.048540797,0.0051520066,0.013659245,0.0051918225,0.024734985,0.013069076,-0.007599862,-0.010952326,0.03205845,-0.012642342,0.05577546,-0.004108365,-0.037127584,-0.025502354,-0.03549084,0.0045600906,0.015381297],
 	"qwen3-embedding": [0.031792, 0.004926, -0.018041, -0.021473, 0.011381, -0.021165, -0.029530, -0.013660, 0.001806, 0.011227, -0.030067, 0.022565, 0.017513, -0.002146, 0.007644, 0.010086, -0.001986, -0.008106, 0.028812, 0.051614, 0.000999, 0.035312, -0.004000, 0.011976, 0.031688, 0.074720, -0.001988, -0.008885, -0.022300, 0.007263, -0.006750, -0.011187, 0.008181, 0.020549, -0.008814, 0.027741, -0.035995, -0.010632, 0.025195, 0.003310, -0.013303, 0.023147, -0.006527, 0.017887, 0.007376, -0.015949, -0.025742, 0.000312, -0.002480, 0.024913, 0.003318, -0.016577, -0.009663, -0.012305, -0.007548, -0.020042, -0.018475, 0.012711, 0.013015, 0.019632, -0.044327, 0.009160, 0.023335, -0.044934, 0.004110, -0.002380, 0.007062, -0.024249, -0.014356, -0.022298, 0.022666, -0.009599, 0.008619, 0.021663, 0.011470, 0.004400, -0.026681, -0.027311, 0.013865, 0.029518, 0.002130, -0.003086, -0.016096, -0.029406, -0.011760, -0.003154, -0.022900, -0.002128, -0.006287, -0.004423, -0.013240, 0.008787, 0.015061, 0.008619, -0.012334, -0.023233, -0.024260, -0.000038, -0.021759, -0.017202, 0.036565, 0.007832, 0.020661, -0.000709, -0.010937, -0.006529, -0.021067, -0.003493, -0.019981, -0.007152, -0.010431, 0.016528, 0.009478, -0.000387, 0.011030, -0.015702, 0.004910, 0.019820, -0.009501, 0.032242, 0.009791, -0.012693, -0.012758, 0.005951, 0.002285, -0.021048, -0.003414, 0.008490, -0.009785, 0.010615, 0.007026, -0.024024, 0.009654, -0.000423, -0.012367, -0.000683, -0.023359, 0.022525, -0.038280, 0.005439, -0.008667, 0.000479, 0.016903, 0.003972, -0.017659, 0.009390, 0.001721, 0.014011, -0.008501, -0.005130, 0.005864, -0.015588, -0.023901, -0.018212, -0.017600, -0.000808, 0.003798, -0.000374, 0.002171, -0.010798, -0.002167, -0.002688, 0.011768, 0.008376, 0.008252, -0.014139, 0.011628, -0.017354, 0.020029, -0.003110, -0.006605, 0.000088, -0.008673, 0.031119, 0.012816, -0.001554, -0.028450, -0.031961, 0.000749, 0.008979, -0.011676, 0.021030, 0.011047, 0.031832, 0.011692, 0.027716, -0.013094, -0.004017, 0.009027, -0.003426, 0.021272, -0.011953, -0.002096, 0.014148, 0.004425, 0.019694, -0.002320, 0.009787, -0.016366, -0.006968, 0.044094, -0.021135, 0.014664, 0.012024, -0.001312, -0.010924, -0.018518, 0.004067, 0.005731, -0.022986, 0.018130, 0.005514, -0.000535, 0.014868, 0.006152, -0.004057, 0.021108, 0.017294, 0.010202, -0.006595, 0.001040, -0.014503, 0.010996, -0.001199, -0.008933, -0.011412, -0.001148, -0.021028, -0.012644, -0.027791, 0.011597, -0.006358, -0.002310, -0.008983, -0.016520, -0.016006, 0.005623, -0.011567, -0.006791, 0.018220, 0.010030, -0.016808, 0.017599, -0.038462, 0.001185, 0.001272, 0.029970, -0.011697, 0.004016, -0.022518, -0.005410, -0.011339, 0.006955, -0.019577, -0.009260, -0.025437, -0.009223, 0.046833, -0.012395, 0.026334, -0.003020, -0.041108, -0.012053, 0.013952, 0.008421, 0.000389, -0.008318, -0.026117, -0.013364, 0.003995, 0.001253, -0.008553, -0.014819, 0.042401, 0.008028, 0.015358, -0.006760, -0.003456, -0.010274, -0.005063, 0.019056, -0.019282, 0.028529, -0.023047, 0.001330, -0.014517, 0.001787, 0.035838, -0.002197, 0.017273, 0.013223, 0.008261, 0.011069, 0.016115, 0.009038, 0.003824, 0.007600, -0.028759, 0.007420, -0.019551, 0.000208, -0.027524, 0.001970, 0.015536, -0.015439, -0.025726, -0.006733, 0.015017, -0.004176, -0.030268, -0.016025, -0.001417, 0.012878, -0.006500, -0.003119, -0.003401, -0.007219, -0.007370, 0.000644, 0.021524, 0.008409, 0.005200, 0.003197, 0.014018, 0.013874, 0.007392, 0.008548, -0.001694, -0.023479, 0.026971, -0.019892, -0.011915, -0.009339, 0.010411, 0.000962, 
0.026920, 0.020772, -0.015630, 0.042286, 0.003036, 0.002418, -0.000559, 0.016372, -0.001013, 0.019164, 0.011091, 0.007332, -0.012521, -0.026355, -0.012465, 0.012425, -0.012866, -0.003681, 0.010110, -0.004440, -0.017833, -0.004337, 0.003432, 0.012279, -0.013508, 0.002860, 0.007560, -0.013746, -0.007328, 0.006398, 0.012368, -0.031189, 0.010435, -0.026745, -0.002065, -0.000018, 0.008437, -0.020951, -0.014613, 0.027587, 0.021053, 0.008047, -0.019996, -0.002226, -0.008008, 0.013445, -0.034107, -0.000744, -0.001821, 0.014077, 0.005022, -0.002037, -0.008170, 0.020361, -0.036807, -0.040290, -0.033997, 0.010617, 0.018125, 0.007784, 0.011251, -0.019881, -0.029746, -0.016549, 0.027708, 0.017331, 0.000739, 0.012864, 0.015012, -0.003049, 0.001765, -0.016737, -0.004086, -0.019370, 0.012912, -0.004322, 0.006763, 0.024780, -0.001206, 0.009158, 0.008418, -0.004266, 0.030131, -0.000400, -0.017726, -0.017540, 0.011032, -0.011073, -0.013453, -0.022519, -0.003229, 0.006512, -0.001383, 0.009883, 0.016059, -0.000605, -0.000309, -0.018137, 0.014978, 0.002921, -0.007613, 0.027025, -0.005092, 0.009398, 0.006602, 0.007236, 0.008204, 0.020230, 0.018121, 0.002815, 0.002668, -0.030765, -0.012687, -0.002267, 0.008824, -0.005739, 0.022980, -0.040300, -0.003685, -0.023128, -0.001817, -0.014380, -0.001766, -0.012538, 0.011029, -0.009562, -0.025560, 0.000254, 0.000899, 0.020819, 0.003545, 0.006402, -0.010389, -0.004296, -0.000360, -0.009568, 0.006516, 0.006023, 0.001291, -0.014954, 0.006374, -0.011088, -0.004016, 0.024020, 0.003261, 0.009086, -0.006118, 0.012321, 0.008060, -0.001949, 0.045665, 0.010600, -0.028078, 0.015062, 0.019561, -0.008793, 0.000384, 0.012627, -0.015218, 0.005784, -0.003044, -0.002830, -0.003675, 0.017365, -0.005626, 0.012490, 0.001138, -0.004063, -0.023211, -0.015569, -0.011751, 0.020837, -0.020561, -0.022642, -0.000206, 0.010976, 0.004722, 0.006458, -0.002802, -0.009693, -0.025396, 0.009108, 0.001791, 0.006541, 0.016408, 0.001736, -0.018632, 0.000523, -0.018195, -0.008380, 0.003091, 0.007251, -0.013442, 0.009905, -0.010768, -0.005161, 0.002064, 0.010608, 0.002720, -0.021422, -0.009019, 0.009357, -0.007045, 0.010005, 0.009786, -0.011280, 0.003003, 0.008567, -0.016222, -0.021154, 0.001371, 0.009106, 0.008682, 0.028164, 0.037620, -0.014166, 0.033103, -0.002531, -0.004949, -0.010924, -0.007954, -0.011785, -0.001748, -0.014597, 0.009884, 0.004108, 0.001241, -0.000416, 0.003360, -0.021418, -0.026198, 0.006894, 0.008989, -0.021985, 0.004533, 0.011405, -0.001827, 0.008044, 0.002529, -0.014493, 0.016014, -0.020658, 0.003807, 0.010540, -0.025505, 0.015002, 0.004699, 0.017521, 0.008660, 0.017759, -0.007729, 0.010906, -0.012483, 0.006340, -0.017246, -0.006083, 0.002357, 0.016951, -0.022541, 0.000364, -0.018440, 0.003730, -0.018185, -0.006742, 0.008023, 0.003459, -0.031610, 0.003049, -0.003019, -0.002934, 0.029219, -0.001473, -0.013225, 0.023437, 0.002153, 0.008362, 0.009142, -0.023763, -0.008043, 0.004517, 0.009636, 0.014824, -0.028260, 0.004312, 0.015419, -0.005401, -0.003108, -0.017145, 0.006375, 0.006473, 0.017673, 0.003004, -0.006814, -0.005512, -0.018296, -0.024305, 0.022902, -0.025757, -0.022487, -0.026135, 0.013664, 0.001370, 0.003182, -0.037260, 0.007060, 0.011588, 0.004182, 0.035425, 0.012125, -0.004238, 0.010359, 0.035212, 0.008152, 0.011075, 0.023878, 0.002958, 0.038817, 0.008300, 0.007776, 0.010572, -0.042451, -0.001251, 0.005246, 0.002344, -0.019186, -0.033779, -0.006243, 0.007207, 0.017790, -0.017984, -0.024683, -0.001003, 0.022494, 0.022498, -0.013629, -0.026255, -0.013596, 0.001076, 0.006961, 
0.013133, -0.005664, -0.006499, -0.001609, 0.007189, -0.021156, -0.003479, -0.002400, -0.020974, 0.014524, 0.010587, -0.010552, -0.000728, 0.022545, 0.001695, 0.001498, -0.004404, -0.007288, 0.017903, 0.011703, 0.012844, 0.028733, -0.005856, -0.026446, 0.017745, 0.012850, 0.022067, 0.013617, -0.010212, -0.021234, -0.008570, -0.015652, -0.023508, 0.011418, -0.039396, 0.005391, 0.003879, 0.001210, -0.006911, 0.008865, -0.003326, -0.003076, 0.019264, 0.001549, 0.007484, -0.030370, 0.053156, 0.013863, -0.027415, -0.003470, -0.002664, 0.008749, 0.020691, 0.009630, 0.028416, -0.037981, -0.015957, -0.010788, -0.012660, -0.000779, -0.016764, 0.033517, -0.013758, -0.000528, -0.003093, -0.002753, -0.011892, -0.005444, -0.009057, 0.023202, -0.036589, 0.012229, -0.019088, 0.011596, 0.010203, -0.029219, 0.004284, 0.006076, -0.005539, 0.006054, 0.009512, 0.007094, -0.028645, -0.003598, 0.013799, -0.027507, -0.006348, 0.013886, 0.006111, -0.003856, -0.003430, -0.001100, 0.001812, 0.005712, 0.024730, -0.018796, 0.000108, -0.006207, 0.005937, 0.011734, -0.007228, -0.007973, -0.012129, 0.006572, 0.000141, 0.030832, 0.005892, 0.003501, 0.001516, 0.004694, -0.022240, 0.007386, -0.023270, 0.044361, -0.000140, 0.028047, -0.014853, -0.016221, 0.017074, -0.002851, 0.010071, -0.015005, 0.015156, 0.009846, 0.007697, 0.005352, -0.009038, 0.005556, -0.002746, 0.009233, 0.006823, -0.000160, -0.021344, -0.006151, -0.012515, 0.072906, -0.013540, -0.008361, 0.008153, -0.001799, 0.018483, 0.010785, -0.011283, -0.016609, 0.004088, 0.014252, -0.004421, -0.020900, 0.029211, 0.011621, 0.004254, -0.004932, 0.005741, 0.006653, 0.013325, -0.010694, 0.007876, -0.002466, -0.018666, -0.018410, -0.010627, -0.003349, 0.003484, -0.011489, 0.014391, 0.003229, 0.007021, 0.000133, -0.014888, -0.026584, 0.010275, 0.007855, 0.001890, 0.015709, 0.009294, 0.008799, 0.008655, -0.018378, 0.008336, 0.000331, -0.013533, 0.002439, 0.021340, -0.009806, 0.003492, -0.001372, -0.013885, -0.021650, 0.020662, -0.017006, 0.004307, 0.011045, 0.006932, 0.023574, -0.011301, -0.025923, -0.006913, 0.016671, -0.023506, -0.013017, -0.003302, -0.022934, 0.010941, 0.014406, 0.014412, -0.022246, -0.005683, 0.020179, -0.010220, 0.005060, 0.002080, 0.002767, 0.000137, -0.014047, 0.002494, -0.007142, 0.010849, -0.017285, -0.012656, -0.014468, -0.005269, 0.012875, -0.022823, 0.000353, -0.003313, -0.003364, -0.026604, 0.022048, -0.008279, -0.005318, 0.021979, 0.022471, -0.005116, -0.032529, -0.040768, -0.020011, 0.017497, -0.014278, 0.010649, 0.001603, -0.022499, 0.010507, 0.036945, -0.029510, -0.001214, -0.020313, -0.012374, 0.014613, 0.003072, 0.007042, 0.020545, -0.012042, -0.006691, 0.001797, 0.014907, -0.006485, 0.008387, -0.000148, 0.001464, 0.005346, -0.004612, -0.015713, 0.005616, -0.005177, 0.018050, -0.003931, -0.000001, -0.008391, 0.011832, -0.003223, -0.010326, -0.019975, -0.016370, 0.006447, -0.002780, -0.010225, -0.005227, -0.011900, -0.016903, 0.005408, -0.010780, -0.006199, -0.008412, -0.013894, -0.006245, -0.000687, 0.018553, 0.004978, -0.014254, -0.001509, -0.005372, 0.019807, -0.001753, 0.023208, -0.027532, 0.009226, 0.011976, -0.011693, 0.002449, -0.004840, -0.002368, -0.009941, 0.013470, 0.014675, -0.027788, -0.031636, 0.002538, 0.008076, -0.007696, -0.035465, 0.004288, 0.043763, -0.016293, -0.006316, -0.031554, 0.021771, -0.000773, 0.001582, 0.004359, 0.025712, 0.009557, -0.010360, 0.006726, -0.001341, 0.017298, -0.028810, 0.039897, 0.007975, -0.010776, -0.027927, 0.000299, -0.015461, 0.005385, -0.008718, -0.015047, 0.003341, -0.009360, 0.011141, 
0.004886, 0.010366, -0.029717, -0.009179, -0.011421, 0.009537, -0.014428, -0.008828, -0.027039, -0.022218, 0.001996, -0.014867, 0.015115, 0.011442, 0.007213, -0.000849, -0.015230, 0.003767, 0.012754, 0.048488, 0.019210, 0.009058, -0.017050, 0.004827, 0.008308, 0.002984, 0.004916, 0.009659, -0.032912, -0.014253, -0.012794, -0.013872, 0.002399, 0.002207, -0.001631, -0.020052, -0.006069, -0.013495, 0.003094, 0.006192, -0.005703, 0.025562, -0.013196, -0.001365, 0.025535, -0.007324, -0.008711, -0.005778, 0.006962, -0.005504, -0.007406, -0.009825, -0.008078, 0.004535, -0.001390, -0.075147, -0.021516, 0.001119, -0.001852, 0.016273, 0.000722, -0.012491, -0.005250, 0.004081, 0.005299, 0.007435, -0.001721, -0.009242, -0.013120, -0.008459, 0.004338, -0.019973, -0.004505, 0.005025, -0.000954, -0.001090, 0.015541, -0.002063, -0.003308, 0.000588, 0.022199, -0.003141, -0.010038, -0.010964, 0.002221, 0.009131, 0.010295, 0.005334, -0.008926, 0.013424, 0.006229, -0.038063, -0.002829, 0.011684, 0.038563, 0.004651, -0.024207, -0.009563, -0.007640, -0.008146, 0.025759, 0.003030, 0.026461, 0.003975, 0.018165, 0.021594, 0.008707, 0.005547, -0.000296, 0.012815, -0.003540, -0.011585, 0.005085, -0.022372, 0.033902, -0.003263, -0.024705, -0.042109, 0.009599, 0.038620, -0.003434, 0.036212, 0.036306, -0.009963, 0.028066, 0.002020, 0.005640, -0.001960, -0.004135, -0.010166, -0.018146, 0.028150, -0.058238, 0.020195, -0.017224, -0.009405, 0.008595, -0.004518, 0.005570, 0.002576, -0.001744, -0.004112, -0.003760, 0.022951, 0.011364, -0.004922, 0.024788, 0.020602, -0.025195, 0.000685, 0.060091, 0.014272, 0.029017, 0.000804, -0.016958, 0.002612, 0.008865, -0.016550, -0.004889, -0.003536, -0.005749, -0.008546, -0.008727, -0.018071, -0.006674, 0.001521, -0.007114, 0.007189, 0.012673, 0.036561, -0.016118, 0.012443, -0.008857, -0.013068, 0.006335, -0.005951, 0.013068, -0.013533, -0.003326, 0.003575, 0.006155, -0.000159, -0.008904, 0.022211, 0.013479, 0.004807, -0.013215, 0.010392, 0.016700, 0.008846, 0.003426, -0.013167, 0.021007, -0.007006, -0.019989, -0.018935, -0.009326, -0.020080, -0.012506, -0.022386, 0.006021, -0.018697, -0.038166, -0.015170, 0.006244, -0.021284, -0.012843, 0.025328, -0.021694, -0.004130, -0.021319, 0.008720, 0.011652, -0.005452, -0.003179, 0.060741, -0.001221, 0.013110, 0.015061, 0.009986, 0.017344, 0.016174, 0.008966, 0.018068, 0.011810, 0.001908, 0.007490, -0.019028, -0.006078, -0.005390, 0.017064, -0.001505, -0.021410, 0.002280, -0.005085, -0.014639, 0.003743, -0.024898, -0.014706, 0.006912, -0.000702, 0.013239, -0.024904, 0.005201, -0.013320, 0.015443, 0.001929, -0.011712, -0.006642, -0.003515, 0.002906, 0.015133, 0.005487, -0.008241, -0.001367, 0.010693, 0.004662, -0.003180, -0.009749, 0.011601, -0.004775, 0.016301, 0.004968, 0.008186, -0.002459, -0.011336, -0.013957, 0.009293, 0.013549, -0.013098, 0.008125, 0.011986, -0.007959, 0.004094, 0.019528, -0.004365, 0.003876, -0.016059, -0.003758, 0.001789, -0.014303, -0.005796, 0.040344, -0.003670, -0.002143, 0.034586, 0.026734, -0.000698, -0.001604, 0.025056, -0.017334, 0.015160, -0.009957, 0.019010, 0.020996, 0.003243, 0.018213, 0.001208, -0.020305, 0.029187, -0.011641, -0.011657, -0.013246, -0.022376, 0.009607, 0.016580, 0.017885, 0.017881, 0.003957, -0.003618, 0.009981, 0.011575, -0.017757, 0.021916, 0.017002, 0.007537, 0.026295, 0.008167, 0.016813, -0.018921, -0.019569, 0.013323, -0.043220, -0.017883, 0.001302, -0.014914, 0.013275, 0.018250, 0.001672, 0.019226, 0.000573, 0.025074, -0.030046, 0.021336, -0.000482, 0.005240, 0.003336, -0.012777, 
-0.049509, 0.006522, -0.012608, -0.032835, 0.009286, 0.039234, 0.020901, -0.005669, -0.033840, -0.029497, 0.015623, 0.007640, 0.016760, 0.023920, 0.009403, 0.006438, 0.017191, 0.010603, 0.002468, 0.009694, 0.011633, 0.013090, -0.007263, -0.019116, -0.004664, 0.017793, -0.000072, 0.027779, -0.022312, 0.000518, 0.034992, -0.003028, 0.010675, 0.010700, 0.017552, 0.007521, 0.009735, -0.007287, -0.000703, 0.000518, -0.003049, -0.007723, 0.001756, 0.007471, -0.002170, 0.004427, 0.002457, -0.032252, 0.011692, -0.010095, 0.007934, -0.044980, 0.004782, 0.006804, -0.017519, -0.010744, 0.019077, 0.006702, -0.004973, -0.011182, 0.022724, -0.011923, -0.019493, -0.003578, 0.027181, -0.002335, -0.015360, 0.003101, 0.025972, 0.016819, -0.000467, -0.006585, 0.046542, 0.007835, 0.007061, -0.012299, -0.013939, -0.014781, 0.017238, 0.015341, 0.016411, -0.002476, -0.000096, 0.005822, -0.004916, 0.021976, 0.002665, -0.017337, -0.005119, -0.014974, -0.013682, -0.016552, 0.015179, 0.039968, -0.004849, -0.025470, -0.032710, -0.000492, 0.002529, -0.014715, -0.001878, 0.000461, -0.001201, -0.024754, -0.024242, 0.011683, 0.000981, -0.003004, 0.020895, -0.007614, -0.036281, -0.015148, -0.016843, -0.015740, 0.008739, 0.008076, -0.001046, 0.000070, -0.015041, -0.016683, 0.029050, -0.011142, 0.019542, 0.006395, 0.012477, 0.013558, 0.019579, 0.002287, 0.000364, -0.010508, 0.004982, 0.021573, 0.012588, 0.013008, -0.017638, -0.009140, 0.014363, -0.005592, 0.008980, -0.003731, 0.006250, -0.003962, 0.022276, 0.014456, 0.008127, -0.016733, -0.006529, 0.031901, -0.011931, 0.001025, 0.015881, -0.006654, -0.003508, 0.014181, 0.013726, 0.000726, -0.025121, -0.004030, 0.011454, 0.009010, 0.007005, -0.003615, 0.016721, -0.006014, -0.017870, 0.004426, 0.019319, 0.017792, 0.018068, 0.003201, -0.017899, -0.008119, -0.029222, -0.010760, -0.007811, 0.027540, 0.010730, 0.013286, -0.004819, -0.001144, -0.031621, 0.016147, 0.003982, -0.015991, -0.002804, 0.002829, 0.020540, -0.026591, 0.010923, 0.010854, -0.004079, 0.000688, -0.026974, -0.022422, 0.004963, 0.011102, 0.002007, 0.014523, -0.009108, 0.009736, 0.026097, -0.008907, -0.001358, -0.017569, 0.002859, 0.032701, -0.003909, -0.024595, -0.030636, 0.013448, 0.018823, 0.016989, 0.017027, -0.020685, 0.001997, 0.004204, 0.012288, 0.024839, -0.000365, 0.008064, -0.023655, -0.000214, -0.011491, -0.010244, 0.016034, 0.021091, 0.007094, -0.003997, -0.002698, 0.017125, -0.001379, 0.003748, -0.017694, 0.005844, -0.015642, 0.016894, -0.025795, -0.021323, -0.009326, 0.023170, -0.005597, -0.022418, 0.004685, -0.018928, -0.011539, -0.017003, -0.009692, 0.013745, -0.008849, -0.006986, 0.020762, 0.001902, -0.001151, 0.005636, 0.016969, -0.017418, -0.013400, -0.003218, 0.017248, -0.024777, 0.045254, -0.010008, -0.018173, 0.022667, 0.002803, 0.023494, 0.032903, -0.014638, 0.001442, 0.030762, 0.002861, 0.008452, -0.004196, -0.018943, 0.010758, 0.019940, -0.005218, -0.004911, 0.002166, 0.000062, 0.018939, -0.003565, -0.040918, -0.001216, 0.024237, -0.002687, -0.001567, -0.016337, 0.015927, 0.039783, 0.007293, -0.010945, -0.024960, -0.005193, 0.010885, 0.000692, 0.000611, -0.004277, 0.016390, 0.025758, -0.004503, 0.027956, -0.020453, -0.022293, 0.009417, 0.012242, -0.009043, 0.001688, -0.008467, 0.001545, 0.016667, 0.015859, 0.015847, -0.029128, -0.016145, -0.016548, -0.000915, -0.005255, 0.001502, 0.006229, -0.000733, -0.016100, -0.019398, 0.022031, 0.004469, 0.008908, -0.016122, 0.000040, -0.008888, 0.008074, -0.040070, -0.001359, 0.006614, 0.008660, 0.011839, -0.030364, 0.008786, -0.004480, -0.005094, 
0.020516, -0.012271, 0.017133, -0.001555, 0.013039, -0.005642, 0.015864, -0.008735, 0.018597, -0.018773, 0.026437, -0.017914, 0.010521, -0.031799, 0.026542, -0.002553, -0.011440, 0.022807, -0.001484, -0.013086, -0.005393, -0.041449, 0.023232, -0.024994, -0.011003, 0.014226, -0.014660, -0.012297, 0.010081, 0.016016, 0.023430, 0.003944, -0.021434, 0.001499, 0.015885, -0.015178, 0.052111, 0.013777, 0.003943, -0.004159, -0.018207, 0.019766, -0.024061, -0.030762, -0.018855, 0.000095, -0.014928, -0.015209, 0.017462, 0.002385, 0.000187, -0.014586, -0.017039, 0.000806, 0.006072, 0.035368, -0.000529, 0.017466, 0.003279, 0.022002, 0.015390, -0.017172, -0.004862, -0.033992, -0.007625, -0.005381, 0.014700, 0.004541, 0.004763, -0.005538, -0.011130, -0.005827, 0.015927, -0.027840, 0.017271, 0.007639, 0.024618, 0.015270, 0.019440, 0.017037, 0.018614, 0.006260, 0.002318, 0.012834, -0.007415, -0.022029, -0.010439, 0.010957, 0.003316, 0.013317, -0.007644, -0.029696, -0.007906, 0.013180, -0.004121, -0.004793, -0.003164, 0.002117, 0.016471, -0.041044, 0.018250, -0.013665, -0.013043, -0.001555, -0.002679, -0.026402, -0.012257, 0.027655, -0.013110, -0.004093, -0.008459, -0.017506, 0.013686, -0.006936, 0.014952, -0.009130, -0.021595, 0.009934, -0.014897, 0.002735, 0.012240, 0.000107, 0.004421, -0.005080, -0.007473, -0.013056, 0.005994, 0.023927, 0.014086, -0.010669, 0.002883, 0.004913, 0.011917, 0.006067, 0.006099, 0.028509, 0.016327, 0.019547, 0.000761, -0.008872, -0.013328, 0.007887, 0.000593, 0.010895, -0.011474, -0.007090, 0.011083, -0.004068, -0.013910, 0.000002, -0.013572, 0.005778, -0.003331, -0.000280, -0.005848, -0.018626, -0.010224, -0.001178, 0.003822, 0.005855, 0.001250, 0.005114, -0.020260, 0.034792, 0.018608, -0.003275, 0.000991, 0.005417, -0.007322, -0.012350, -0.021752, 0.009537, 0.008009, -0.009680, -0.000582, -0.016834, -0.007484, 0.001159, -0.022297, 0.003660, -0.010565, -0.019750, -0.005773, 0.015054, 0.016563, 0.014081, 0.009023, 0.036565, -0.030304, 0.027378, -0.016617, 0.000151, -0.010887, -0.016542, 0.004438, 0.002592, 0.000920, 0.000102, -0.010652, -0.023235, 0.101960, -0.017048, 0.001956, 0.010342, -0.008566, -0.005541, -0.017047, -0.012462, 0.010076, 0.001959, 0.006444, -0.001760, 0.001517, -0.008421, -0.014456, -0.021142, -0.005687, 0.007755, -0.016494, 0.003861, -0.002703, -0.003307, -0.009360, 0.002867, 0.000226, -0.020640, 0.004909, -0.018447, 0.017833, 0.022051, 0.014006, -0.017507, -0.005500, -0.006043, -0.007814, 0.018392, -0.006371, 0.018850, 0.029652, -0.003573, -0.008146, 0.018313, -0.015838, -0.032720, -0.042324, -0.000093, -0.005138, 0.005588, 0.016665, -0.009604, -0.001978, -0.029234, -0.025235, 0.010030, -0.001410, 0.019863, -0.004580, -0.009004, -0.016924, 0.003690, -0.010201, 0.016367, 0.008306, -0.001806, 0.038056, 0.017252, 0.009558, -0.013220, 0.003652, 0.016436, 0.006446, -0.004599, 0.008749, -0.020319, 0.000831, -0.005372, 0.016846, -0.009377, -0.009748, -0.026560, 0.011980, 0.014937, 0.006341, 0.000422, 0.002159, -0.021079, 0.001828, 0.002897, 0.015790, 0.007269, -0.002133, 0.020799, 0.004535, -0.009252, 0.014515, -0.018034, 0.005088, 0.014639, -0.000818, -0.005400, -0.012085, 0.018262, 0.004450, 0.015766, 0.005318, 0.025644, -0.049883, 0.004744, 0.005378, 0.009072, 0.014824, 0.023132, 0.002685, -0.001183, -0.002213, 0.015892, 0.005347, -0.022873, 0.034731, -0.006599, -0.016648, 0.028667, 0.004957, -0.010771, 0.004812, -0.003598, -0.015015, -0.010878, 0.011263, -0.024440, -0.003584, 0.001943, -0.013649, -0.005871, -0.004335, -0.024247, 0.018355, 0.009756, 
0.022101, 0.012232, 0.000029, 0.009751, -0.009421, -0.010585, 0.018912, 0.003387, 0.011882, -0.008308, -0.016522, -0.009758, -0.001156, 0.015289, 0.019122, 0.000015, 0.004118, 0.039255, -0.003367, -0.002975, -0.006581, -0.003712, 0.034320, -0.022950, -0.021703, 0.021714, 0.003876, -0.001524, 0.006148, 0.015376, -0.003583, 0.013684, 0.008504, 0.002071, -0.006866, -0.016622, 0.028972, -0.002585, -0.012830, 0.007892, 0.000639, -0.018131, -0.018077, -0.003100, -0.005005, -0.013567, 0.003568, 0.002382, -0.019491, 0.021040, 0.014864, 0.032373, -0.002519, -0.007588, 0.005639, 0.016072, -0.001837, 0.005916, 0.021606, -0.004785, 0.016915, -0.008056, -0.014667, 0.007789, -0.005898, -0.003012, -0.000263, -0.011757, 0.004057, -0.013413, -0.011619, 0.016374, -0.014115, -0.001854, 0.014490, 0.005928, 0.005582, 0.005524, 0.019696, 0.007976, -0.002337, 0.017389, 0.027090, -0.001294, -0.026454, -0.012785, -0.000151, 0.005695, 0.018820, -0.005554, -0.010554, -0.037088, -0.015285, 0.013529, -0.002270, 0.002447, -0.013967, -0.002778, 0.022457, 0.006619, -0.010586, -0.014883, -0.017480, -0.000678, 0.010898, -0.018060, 0.005616, 0.000099, 0.012023, -0.003565, -0.002615, -0.012217, -0.030788, 0.008546, 0.007993, 0.003866, -0.012082, -0.016117, 0.015401, -0.008101, 0.003709, 0.000091, 0.010832, 0.018474, 0.017259, -0.005851, -0.031973, 0.015791, 0.012643, 0.003244, 0.014998, 0.019063, -0.001472, -0.025990, 0.015169, -0.009884, 0.002544, 0.020427, -0.000470, 0.005721, 0.008325, 0.007087, -0.006803, 0.020466, -0.017335, 0.003829, -0.003448, 0.007477, -0.021420, -0.014349, 0.018811, -0.019868, -0.032201, -0.000424, 0.002255, 0.016701, -0.019483, 0.000868, -0.000312, -0.011390, 0.014417, -0.005372, -0.018477, -0.013866, -0.001135, -0.013151, -0.016340, 0.022038, 0.028332, 0.018423, -0.000885, -0.016016, -0.000506, 0.007382, 0.002883, -0.060843, -0.005289, -0.008497, 0.013998, 0.028891, 0.003624, 0.000382, 0.005699, -0.017407, 0.011960, -0.007124, -0.022642, 0.009878, 0.010962, -0.000292, -0.018771, 0.005196, 0.003887, 0.007990, 0.003359, -0.004517, 0.015622, -0.001508, -0.017210, -0.013518, -0.018791, 0.000493, 0.012015, -0.001230, -0.005306, -0.006177, -0.006319, 0.012276, 0.002216, -0.010670, -0.010702, -0.024221, -0.013020, -0.010832, -0.004789, -0.020057, 0.009258, -0.000225, -0.031841, 0.000593, -0.015819, -0.016449, -0.010948, -0.008769, -0.011786, 0.001202, 0.007093, 0.016759, -0.034051, -0.001936, -0.005886, 0.006068, 0.029942, -0.008858, -0.005155, 0.030633, 0.012225, 0.018284, 0.009850, -0.013926, 0.010475, -0.009784, 0.024091, -0.009334, 0.006966, -0.009209, -0.020398, -0.009779, -0.042508, -0.000022, -0.006571, 0.022690, -0.014969, -0.016340, -0.012293, -0.013041, 0.023558, -0.093774, -0.012834, 0.001748, -0.015414, -0.003389, 0.013077, -0.008845, 0.000491, 0.004804, 0.008608, -0.017550, 0.005078, -0.009128, 0.019334, 0.021869, -0.002114, 0.007375, -0.004183, -0.021008, -0.008093, -0.031298, 0.007937, -0.006032, -0.004382, -0.024452, 0.006999, -0.023552, 0.018541, -0.007993, 0.012050, -0.010784, -0.006336, 0.014794, -0.009498, -0.003932, -0.009129, -0.015602, -0.003046, 0.016863, -0.000131, -0.016725, -0.001143, 0.000428, 0.005294, -0.013933, -0.006820, -0.010661, 0.000711, -0.008837, -0.007209, -0.017701, 0.004061, 0.014703, 0.001840, 0.009885, -0.021338, 0.015789, -0.017123, -0.003605, 0.004870, -0.006961, -0.014616, 0.001046, -0.018480, -0.021327, 0.003602, -0.012612, -0.013908, -0.004274, -0.010954, 0.008409, -0.026547, 0.009493, 0.025824, -0.017831, -0.014135, -0.021283, 0.006764, 0.004242, 
-0.010836, -0.003256, -0.014130, -0.004335, -0.012133, 0.001019, 0.010675, 0.004686, 0.012308, -0.013302, -0.022552, -0.004927, -0.025277, -0.005335, 0.011381, 0.025372, 0.005224, 0.035451, 0.001242, -0.007717, -0.022296, -0.001692, 0.008255, 0.037308, 0.033247, 0.004620, -0.009462, 0.009099, -0.001126, 0.007144, 0.016147, -0.002579, -0.009192, 0.021881, -0.018137, -0.016120, 0.023954, 0.026693, 0.008363, -0.002968, -0.007836, -0.002710, 0.017337, -0.005227, 0.000648, -0.001666, 0.003518, -0.003102, 0.012371, -0.003384, 0.021508, 0.006837, 0.006620, -0.020704, -0.015379, 0.006124, -0.000815, -0.002005, 0.020740, -0.000341, 0.003189, -0.005670, 0.001827, -0.006109, 0.015929, -0.003354, 0.002359, -0.001033, 0.027072, -0.005968, 0.032495, 0.010132, -0.004043, 0.021537, 0.001836, 0.004311, 0.027786, -0.008344, -0.013441, -0.001841, 0.000048, 0.010439, 0.004804, -0.008400, -0.004262, -0.010300, -0.003906, -0.003394, -0.003375, -0.000944, -0.019556, 0.004310, 0.039962, 0.027445, -0.016895, -0.021711, -0.010279, 0.005174, -0.011083, -0.010566, 0.011690, -0.001378, 0.004322, -0.006402, 0.009215, 0.002703, -0.002887, 0.009664, 0.008476, 0.024891, -0.004103, 0.007138, -0.026648, 0.001226, 0.003641, -0.014135, -0.017579, 0.006830, -0.005211, -0.010327, 0.011596, -0.004247, 0.004704, 0.026018, -0.013983, -0.026755, 0.000465, -0.009860, 0.021312, 0.029785, -0.004714, 0.010407, -0.005684, 0.011187, 0.012351, 0.000256, 0.016947, 0.005220, -0.015514, -0.006897, 0.004795, 0.017589, -0.020394, 0.005371, -0.001247, -0.003036, -0.015652, -0.004967, -0.017463, 0.009540, -0.013814, -0.023903, -0.024999, 0.022597, 0.017431, 0.011907, 0.000741, 0.000395, 0.014146, -0.025263, 0.004919, -0.013391, -0.038441, -0.020687, 0.013218, -0.014276, 0.016215, -0.003140, 0.017917, -0.015071, 0.006642, -0.020299, 0.018196, 0.000165, -0.017600, -0.022149, -0.007831, -0.016498, -0.005095, -0.011056, 0.002061, 0.007088, 0.001588, 0.010239, -0.005201, -0.008742, -0.009925, 0.001828, -0.022987, 0.011902, 0.004328, 0.009978, -0.002052, 0.021294, 0.020016, 0.024059, -0.006338, 0.005658, -0.003601, 0.005725, -0.012885, -0.003567, -0.008227, 0.021375, 0.001915, 0.001100, 0.009236, 0.014495, 0.009794, 0.032031, -0.022061, 0.002740, -0.035699, 0.017734, -0.007188, -0.007257, 0.009609, -0.002559, 0.008155, -0.006893, -0.001755, 0.004039, -0.007587, 0.001309, -0.014647, 0.012599, -0.023093, 0.009309, -0.002869, -0.014727, -0.012037, -0.003156, -0.004096, 0.016965, 0.019384, 0.000002, 0.004630, 0.013427, -0.025145, -0.009610, -0.010228, -0.010881, -0.018150, 0.001892, 0.020061, -0.003203, -0.011193, 0.008656, -0.021301, 0.027418, -0.044232, 0.015832, 0.004668, 0.012090, 0.027355, 0.001649, -0.019923, 0.022830, -0.010257, 0.006284, 0.016087, -0.012424, -0.005967, 0.017051, -0.007909, 0.036533, 0.004785, 0.013536, 0.005001, -0.020018, -0.006978, 0.025895, 0.009551, 0.013015, 0.043835, -0.000942, 0.021205, -0.010978, -0.002646, 0.013192, -0.012961, -0.004818, -0.003524, -0.002655, -0.011494, -0.021520, 0.005068, -0.005938, -0.005708, -0.044458, 0.012405, 0.013281, 0.035163, -0.008479, 0.005200, 0.006298, -0.013604, 0.001231, 0.035308, -0.010445, -0.021337, -0.002967, -0.015408, -0.002867, -0.020344, 0.009895, 0.009303, -0.001393, -0.011840, 0.008937, -0.003710, 0.017104, -0.022924, 0.004528, -0.009281, 0.013223, 0.044217, -0.005570, -0.003403, -0.004183, -0.013926, -0.013133, 0.000399, -0.019108, 0.002664, -0.000750, -0.016516, -0.028828, 0.040569, 0.028743, -0.002339, -0.001530, 0.005358, 0.000831, 0.009249, -0.002734, 0.005316, -0.011899, 
0.002943, -0.030287, 0.013423, 0.011075, -0.007595, 0.019828, 0.002101, -0.008739, -0.023729, -0.015603, 0.013206, 0.006018, 0.018934, 0.010430, 0.003697, 0.000756, -0.005511, -0.003178, -0.013694, 0.012796, 0.012422, -0.008806, -0.002501, 0.012712, 0.010711, -0.029260, 0.003970, -0.015845, 0.007692, 0.002803, -0.019664, -0.036282, -0.009719, -0.008491, 0.015094, -0.011584, 0.030892, 0.013343, -0.004266, 0.018714, -0.013045, 0.016724, -0.017056, -0.020940, 0.013137, -0.006146, -0.003954, -0.005386, -0.005451, -0.024084, -0.020846, -0.023871, 0.022407, 0.016726, 0.010994, -0.019705, -0.012944, -0.011736, -0.016800, 0.016154, 0.037629, -0.003424, -0.014073, 0.012585, -0.009890, -0.012940, -0.011151, -0.003971, -0.057809, -0.003960, 0.005835, 0.019932, -0.002561, -0.014869, -0.005486, -0.022046, 0.023892, -0.004431, 0.020073, 0.014859, 0.031115, -0.005770, 0.025947, -0.015123, 0.004376, 0.000216, 0.014397, -0.010126, 0.019828, -0.023535, -0.006688, -0.004238, 0.003923, -0.001913, 0.013896, -0.020261, -0.000211, -0.017331, -0.004555, 0.017812, 0.019288, -0.000217, 0.029406, 0.004155, 0.021335, -0.004290, -0.000817, -0.009499, -0.004489, 0.016444, -0.003451, 0.017868, 0.010835, -0.003867, 0.009663, -0.006793, -0.008544, 0.004528, -0.004728, -0.013121, 0.001463, -0.005069, -0.009838, -0.009709, -0.011030, 0.020544, 0.016891, -0.008413, 0.009241, -0.020013, 0.004533, -0.000644, -0.003671, 0.002443, 0.005531, 0.012014, -0.015547, 0.032867, -0.010345, -0.002676, 0.015909, -0.014906, -0.000076, 0.001226, 0.004505, -0.001700, 0.015689, 0.008147, -0.008414, 0.018174, -0.023157, -0.002639, 0.001052, -0.011672, 0.034863, 0.007373, 0.003746, 0.020635, -0.001293, -0.007285, -0.005697, 0.011585, 0.011641, 0.005903, -0.015321, -0.019069, 0.001474, -0.002128, -0.001428, -0.013760, 0.006829, 0.008988, -0.000711, 0.011416, 0.013476, 0.005795, 0.010170, 0.019851, -0.006154, -0.025921, 0.028450, 0.015028, -0.007077, -0.005037, 0.014004, 0.001087, 0.002007, 0.034379, 0.022603, 0.011928, 0.010591, 0.013465, -0.007109, 0.008027, 0.009830, 0.015632, -0.008051, 0.000997, 0.005566, -0.012794, -0.004673, 0.020111, 0.010783, 0.014006, -0.014628, -0.005732, 0.002201, -0.014574, 0.002813, -0.008896, 0.005612, -0.010093, 0.009163, 0.032559, -0.008701, 0.020292, -0.003993, -0.005449, 0.047966, -0.018202, -0.009047, 0.002837, 0.000065, 0.012356, -0.001373, -0.011784, 0.003937, 0.019357, 0.017805, 0.017216, 0.013104, 0.006715, -0.001720, 0.000405, 0.000307, 0.000554, 0.023089, -0.008537, 0.003480, 0.011162, 0.028809, -0.028040, 0.015474, -0.020833, -0.027994, -0.006712, 0.013394, 0.030260, 0.019467, -0.004163, 0.022857, 0.005267, -0.002363, 0.003092, 0.002819, 0.002405, 0.015851, 0.002322, -0.015501, 0.002296, -0.001550, -0.010819, -0.003535, -0.031813, -0.009316, 0.039037, 0.013429, -0.004979, 0.007779, -0.008625, 0.022815, -0.003904, -0.045269, -0.002160, 0.024076, 0.025454, 0.021919, 0.001096, 0.009633, 0.003316, -0.002369, 0.010195, -0.008795, -0.012336, -0.002107, -0.002632, 0.024025, -0.012306, -0.002433, -0.011226, 0.013108, 0.013018, -0.012350, -0.008192, 0.008925, -0.012370, -0.006352, 0.008407, 0.000015, -0.018896, -0.002139, 0.022095, -0.021460, -0.028576, 0.000539, -0.014611, 0.008801, 0.005960, -0.003063, -0.014593, -0.002300, 0.014949, 0.008666, 0.001127, 0.001032, 0.008120, 0.008157, 0.012913, -0.013515, -0.016257, -0.007906, -0.001303, 0.000938, -0.002996, 0.018525, 0.001288, 0.002526, -0.009045, -0.000344, 0.043224, 0.008138, -0.026972, -0.036458, -0.005729, -0.014122, -0.022735, -0.026821, 0.032595, 
0.033623, -0.018001, 0.025555, 0.032314, 0.026786, 0.007475, -0.029497, -0.009322, -0.007920, 0.000674, -0.025910, -0.006128, 0.005208, 0.004453, -0.001005, 0.002987, 0.018911, 0.013004, 0.003351, -0.031948, -0.011428, -0.005920, 0.011624, -0.007659, -0.012838, 0.017949, 0.019967, 0.007770, 0.028680, 0.026473, 0.003621, 0.004719, -0.001039, -0.008586, -0.007310, 0.016093, 0.000288, -0.009725, 0.003192, -0.001937, 0.004935, 0.016688, 0.012862, 0.006541, 0.010781, 0.004346, -0.006409, -0.017303, -0.027135, -0.014983, -0.020721, -0.001405, 0.007737, 0.017811, 0.025829, -0.018499, 0.033446, -0.008173, 0.025652, -0.002779, -0.036197, 0.035352, -0.023758, -0.006748, -0.018221, 0.025066, 0.007794, 0.019503, 0.000028, -0.006134, -0.015939, 0.001823, -0.003823, 0.015975, 0.008037, -0.001529, 0.013992, 0.006494, 0.006369, -0.025116, -0.011833, -0.022861, 0.027657, -0.011983, 0.043682, 0.015773, 0.004072, 0.015403, -0.000675, -0.016528, 0.000549, -0.005763, -0.014040, 0.009783, 0.017142, -0.005261, 0.011948, -0.015775, 0.016749, 0.003973, -0.011806, -0.008391, 0.002960, 0.024964, -0.013062, -0.002914, 0.010767, -0.004168, -0.016841, -0.017812, -0.003733, -0.014694, -0.009252, 0.001167, -0.008773, -0.002801, -0.012193, -0.020946, -0.007018, 0.005875, 0.007153, 0.019214, -0.024508, 0.011197, -0.022353, -0.015757, -0.001978, -0.011591, 0.004546, 0.025104, -0.065326, -0.000773, -0.001352, 0.007841, -0.024962, 0.000702, -0.026619, -0.004280, -0.001047, 0.008003, -0.010382, -0.002991, 0.028899, 0.019583, 0.018180, 0.002436, 0.024709, -0.008453, 0.000247, 0.010228, -0.029503, 0.025050, 0.000883, -0.003178, -0.029948, -0.000435, -0.000472, 0.018712, -0.022477, 0.026339, -0.014619, 0.034320, -0.015215, -0.020927, -0.007180, -0.019584, -0.009599, 0.014168, -0.021145, -0.009943, 0.003515, 0.004487, 0.024646, 0.015464, 0.012207, 0.004644, -0.001743, 0.017812, -0.016363, 0.015604, 0.009810, 0.013646, -0.017065, -0.022975, 0.005958, 0.022131, 0.005337, 0.027475, -0.001793, -0.007428, 0.025136, 0.009438, -0.013006, 0.027094, 0.009010, -0.015209, -0.005203, -0.016260, 0.015376, -0.005864, -0.009721, -0.007073, -0.010791, -0.009775, -0.004087, 0.005616, 0.026532, -0.004916, -0.006672, -0.003260, -0.029856, 0.005316, 0.002688, 0.001484, -0.008383, -0.000859, 0.013535, -0.010640, 0.019049, -0.018435, -0.009252, -0.040983, 0.017240, -0.002047, -0.000440, -0.013463, 0.002644, -0.011559, -0.010040, -0.002651, 0.019927, -0.013669, 0.022540, -0.012926, 0.001236, 0.014573, 0.002343, -0.014226, 0.027290, -0.002921, 0.017484, -0.001449, 0.011943, 0.024709, 0.009541, -0.019108, 0.014974, 0.003024, 0.013132, -0.025489, 0.000146, -0.019084, -0.021294, 0.001326, 0.002435, -0.011663, -0.017603, 0.003963, -0.036307, 0.002392, 0.002865, -0.005224, 0.008638, 0.017922, 0.022342, 0.008152, 0.009601, -0.006566, 0.019362, 0.016708, 0.009775, 0.010324, -0.015168, -0.013288, 0.005146, 0.003812, 0.018278, 0.003083, 0.005744, 0.023287, 0.031175, -0.010414, 0.015014, -0.006324, -0.002789, 0.014685, -0.017423, 0.007766, -0.001774, -0.003090, 0.017457, -0.005279, -0.017417, 0.002423, 0.005983, -0.000100, -0.003857, -0.011357, 0.006177, -0.013336, 0.015153, -0.005920, 0.008527, 0.012864, -0.016132, 0.030787, 0.000947, 0.005183, -0.010514, -0.002621, 0.000255, -0.003783, -0.006133, 0.013980, -0.001777, 0.056108, 0.003873, 0.015884, 0.018199, -0.007962, -0.008492, 0.001115, 0.027627, -0.024726, -0.001219, 0.007042, -0.014110, 0.008502, 0.003677, -0.003958, -0.019703, 0.006495, -0.028127, 0.022678, 0.012794, -0.023379, -0.007287, 0.048726, 0.003756, 
-0.007684, 0.006204, -0.006883, -0.020040, 0.011968, 0.004371, 0.006712, 0.001578, 0.010776, 0.006446, 0.004054, -0.008785, -0.004622, 0.015532, 0.024050, -0.004696, 0.005004, 0.047457, 0.016234, -0.013847, -0.003231, 0.001491, -0.006422, -0.007664, 0.000159, 0.022566, 0.015440, 0.014559, 0.017279, -0.001534, 0.010613, 0.008598, 0.012202, 0.012841, -0.008078, -0.022893, -0.011717, 0.000308, 0.012272, 0.010973, -0.015097, 0.011852, -0.002605, -0.002470, -0.019322, 0.028891, 0.028117, -0.012933, -0.011691, -0.001118, 0.005864, 0.014090, -0.001960, -0.011202, 0.028238, 0.003765, 0.012505, 0.016722, -0.008177, -0.013111, -0.011508, -0.009641, 0.029831, -0.004879, -0.005077, -0.032950, -0.023307, -0.007336, 0.000371, 0.003904, -0.007409, -0.006993, 0.004508, -0.004245, 0.018904, -0.006684, 0.015313, 0.002921, -0.011040, -0.002182, 0.012927, 0.015248, 0.024934, -0.006216, 0.022404, 0.020427, -0.002124, -0.014849, -0.008371, -0.020697, 0.006873, -0.006633, -0.053067, 0.020209, -0.007383, -0.000910, -0.002166, -0.002127, 0.015922, -0.005251, -0.010564, -0.017466, 0.009684, 0.005759, 0.019960, 0.033270, 0.012287, -0.000100, -0.006858, 0.006734, 0.025588, 0.005950, -0.006609, 0.013516, 0.005205, 0.002285, -0.000606, 0.037977, -0.003159, -0.001248, -0.028815, 0.029236, -0.010141, -0.004224, -0.004079, 0.017874, -0.015790, 0.016889, 0.012398, 0.009094, 0.015814, 0.014839, -0.005277, -0.002516, -0.013254, 0.004981, -0.009693, -0.001848, 0.004601, 0.014224, 0.031232, -0.024458, 0.002672, -0.009351, 0.007968, 0.015294, 0.001275, 0.016084, 0.023957, -0.002103, 0.003466, 0.002882, -0.004706, -0.013566, 0.005811, 0.008504, -0.001594, 0.003923, 0.009754, 0.024187, -0.004900, 0.042902, 0.009232, -0.007876, -0.028139, 0.007731, -0.014280, -0.012051, 0.011560, 0.005460, -0.007022, 0.016041, 0.001226, 0.001036, -0.018123, 0.005008, -0.017481, -0.011131, -0.008724, 0.001174, 0.021142, 0.005457, 0.008019, 0.000416, 0.044647, -0.014745, 0.016356, -0.003489, -0.009914, -0.027851, 0.015946, 0.015048, 0.014239, -0.018476, 0.018326, 0.010209, 0.003926, -0.021824, 0.015518, 0.006414, 0.015442, -0.013744, 0.003908, -0.007205, 0.020117, 0.003079, 0.018629, -0.004714, -0.011078, 0.009998, -0.014231, 0.000272, 0.009207, -0.006673, -0.007812, -0.005450, 0.007986, -0.012454, 0.017887, -0.007598, 0.006841, 0.006608, -0.001630, -0.032937, 0.007242, -0.012077, 0.001551, -0.001544, 0.008404, -0.017311, 0.011169, -0.011868, -0.005969, 0.020850, 0.001749, -0.015342, 0.018045, -0.032569, 0.013476, 0.003299, 0.018967, -0.014875, 0.002329, -0.022181, -0.024022, -0.005146, 0.001831, 0.040513, -0.011674, -0.013500, -0.004823, 0.004701, 0.008984, 0.008307, -0.020678, -0.000369, 0.005629, 0.006012, 0.019657, 0.002078, 0.024057, -0.006275, -0.006897, -0.011263, -0.016342, 0.010246, 0.009262, -0.003426, -0.030797, -0.008025, -0.018327, -0.010448, 0.003397, 0.011466, 0.007359, 0.012101, 0.010924, -0.011682, 0.004035, -0.013045, 0.008647, 0.006065, -0.001575, -0.012142, -0.002077, -0.011405, -0.005507, 0.022302, -0.005649, -0.020798, -0.000002, 0.003886, 0.016590, -0.012743, -0.005405, 0.025257, -0.010380, 0.009266, 0.034503, 0.005400, -0.000897, -0.015368, 0.014782, -0.004936, -0.009334, -0.005339, 0.026014, -0.017054, 0.001600, 0.001218, -0.007068, -0.001957, -0.025544, -0.018504, -0.009509, -0.001614, 0.012602, 0.010126, 0.006851, 0.003219, -0.019446, -0.015196, -0.008934, -0.014628, -0.007057, -0.011605, 0.000150, -0.005216, 0.005098, 0.015251, -0.020344, 0.009604, 0.000319, -0.003701, -0.005448, 0.006905, -0.008316, 0.017366, -0.008151, 
-0.008407, -0.003523, -0.017969, -0.016836, -0.004794, -0.010536, 0.000728, 0.015541, 0.007297, 0.020033, 0.014789, 0.008347, -0.014012, -0.009191, -0.011634, 0.001047, -0.004078, -0.032395, -0.007056, -0.006072, 0.017237, -0.010169, -0.002082, -0.012120, -0.016639, 0.003092, 0.015185, 0.024824, 0.016023, -0.012708, -0.021427, 0.006886, -0.013677, -0.015014, -0.044346, -0.008770, -0.006189, -0.003424, 0.015988, -0.001982, -0.012779, -0.008387, 0.013574, -0.021576, 0.024866, -0.001767, 0.016229, 0.019212, 0.014702, 0.012229, -0.010593, 0.008917, 0.005296, -0.010568, 0.001254, -0.007352, 0.004297, 0.014843, -0.000107, 0.008463, -0.001418, -0.004644, -0.012905, -0.009090, 0.026492, 0.010213, 0.004233, 0.013997, 0.027515, -0.006066, -0.008489, -0.009575, 0.005357, -0.002819, 0.012562, -0.001755, -0.021962, 0.028935, 0.019488, -0.000949, 0.046604, -0.005296, -0.017023, 0.013773, 0.014090, 0.001501, -0.019164, -0.018293, -0.015383, -0.003715, -0.017315, -0.001150, 0.010238, 0.013849, 0.007795, -0.009511, -0.005155, -0.002302, -0.030114, -0.000130, -0.026576, -0.011283, -0.014823, 0.010253, -0.008892, 0.003799, 0.022423, 0.008893, 0.008317, 0.005401, -0.020923, -0.007466, 0.012005, 0.017001, -0.002136, -0.015182, -0.005745, 0.013719, 0.008295, 0.001176, 0.009355, 0.005882, -0.005260, -0.002908, 0.010056, -0.009662, 0.012986, 0.011305, 0.003803, -0.003433, 0.004537, 0.000067, 0.010801, 0.007846, 0.000285, 0.002437, 0.013732, -0.000136, 0.016116, -0.009388, -0.028867, 0.015769, -0.009709, 0.007497, 0.004109, 0.005993, -0.016153, 0.019901, -0.000855, -0.008279, 0.019159, -0.000092, -0.009310, -0.002351, -0.018273, 0.016507, -0.015380, 0.012349, 0.010352, -0.015944, -0.019335, -0.022712, 0.001215, 0.018114, 0.007969, 0.018223, 0.003234, 0.034172, 0.011018, -0.008155, -0.003510, -0.055344, -0.008135, 0.002642, -0.016795, 0.008877, -0.032482, 0.020419, -0.006696, -0.019295, 0.012625, -0.010185, 0.013550, 0.006444, 0.010389, -0.024405, 0.004697, -0.006485, -0.006578, 0.012772, -0.003774, 0.010779, -0.016861, 0.006165, 0.000787, 0.009266, -0.022250, 0.022082, 0.017113, 0.009719, 0.017075, 0.022148, -0.003797, 0.012725, 0.010799, -0.013215, 0.005114, -0.001655, 0.011833, 0.005005, -0.011579, 0.017267, 0.016029, -0.001224, -0.022074, -0.016803, -0.000424, -0.001202, -0.000810, 0.008266, -0.001548, -0.008027, -0.009839, -0.010535, 0.006481, 0.007259, 0.036436, -0.001130, -0.040727, 0.013779, 0.001592, -0.025086, 0.014909, 0.015090, 0.000607, -0.011528, -0.000361, -0.011167, 0.017288, 0.011351, 0.007367, 0.004935, 0.006236, 0.007734, 0.000988, 0.013061, -0.001021, -0.010461, -0.010896, -0.003681, 0.000912, 0.008134, 0.006001, -0.001354, -0.008708, 0.000808, 0.009372, 0.001674, 0.013925, -0.007881, -0.019357, 0.010177, 0.010353, 0.010437, 0.002275, 0.013243, 0.007057, -0.025082, -0.016305, -0.011859, 0.009043, -0.025206, -0.004534, 0.004452, -0.006219, -0.004143, 0.003171, 0.007724, 0.008987, 0.000169, 0.013302, 0.006299, 0.007297, 0.016530, -0.023153, 0.010610, -0.003937, 0.014075, 0.007115, -0.022596, 0.033165, -0.018001, 0.004484, -0.021658, -0.009335, 0.018308, 0.008660, -0.018094, -0.008676, 0.012363, 0.012372, -0.027894, 0.005208, -0.009708, -0.012411, 0.016168, -0.000753, 0.008323, -0.001104, -0.021953, 0.019505, 0.003769, -0.006093, -0.002687, -0.005854, -0.000456, 0.011746, 0.006733, -0.008888, 0.005915, 0.059429, -0.016033, 0.005489, 0.006918, 0.004521, -0.000074, -0.001815, -0.001053, -0.001441, 0.016997, -0.006283, -0.002506, -0.006677, -0.057522, -0.005242, 0.009781, 0.008025, 0.022570, 
-0.002895, 0.006240, -0.011600, 0.010416, 0.037326, 0.019760, -0.006887, -0.015520, -0.003295, 0.004990, -0.000308, 0.036522, 0.004534, 0.006574, 0.001989, 0.007110, 0.002790, -0.005056, -0.035095, 0.001534, -0.007598, 0.004227, 0.000755, 0.026881, 0.020648, 0.011147, -0.044771, -0.004757, -0.009377, 0.005339, -0.005997, -0.003715, 0.029131, 0.002203, 0.007175, -0.007573, -0.003617, -0.017187, -0.009481, 0.014506, 0.000882]
 }
diff --git a/integration/utils_test.go b/integration/utils_test.go
index c438aa93..c0bac5e1 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -248,12 +248,14 @@ var (
 		"zephyr",
 	}
 	libraryEmbedModels = []string{
+		"qwen3-embedding",
+		"embeddinggemma",
+		"nomic-embed-text",
 		"all-minilm",
 		"bge-large",
 		"bge-m3",
 		"granite-embedding",
 		"mxbai-embed-large",
-		"nomic-embed-text",
 		"paraphrase-multilingual",
 		"snowflake-arctic-embed",
 		"snowflake-arctic-embed2",
@@ -321,7 +323,7 @@ func GetTestEndpoint() (*api.Client, string) {
 		}
 	}
 
-	if os.Getenv("OLLAMA_TEST_EXISTING") == "" && port == defaultPort {
+	if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" && port == defaultPort {
 		port = FindPort()
 	}
 
@@ -335,15 +337,20 @@ func GetTestEndpoint() (*api.Client, string) {
 		http.DefaultClient), fmt.Sprintf("%s:%s", host, port)
 }
 
-var serverMutex sync.Mutex
-var serverReady bool
-var serverLogFile string
+// Server lifecycle management
+var (
+	serverMutex sync.Mutex
+	serverReady bool
+	serverLog   bytes.Buffer
+	serverDone  chan int
+	serverCmd   *exec.Cmd
+)
 
 func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
 	// Make sure the server has been built
 	CLIName, err := filepath.Abs("../ollama")
 	if err != nil {
-		return err
+		return fmt.Errorf("failed to get absolute path: %w", err)
 	}
 
 	if runtime.GOOS == "windows" {
@@ -351,72 +358,42 @@ func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
 	}
 	_, err = os.Stat(CLIName)
 	if err != nil {
-		return fmt.Errorf("CLI missing, did you forget to build first?  %w", err)
+		return fmt.Errorf("CLI missing, did you forget to 'go build .' first?  %w", err)
 	}
 	serverMutex.Lock()
 	defer serverMutex.Unlock()
 	if serverReady {
 		return nil
 	}
+	serverDone = make(chan int)
+	serverLog.Reset()
 
 	if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
 		slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
 		t.Setenv("OLLAMA_HOST", ollamaHost)
 	}
 
-	logDir := t.TempDir()
-	slog.Info("starting server", "url", ollamaHost)
-	done, err := SpawnServer(ctx, "../ollama", logDir)
-	if err != nil {
-		return fmt.Errorf("failed to start server: %w", err)
-	}
-
+	serverCmd = exec.Command(CLIName, "serve")
+	serverCmd.Stderr = &serverLog
+	serverCmd.Stdout = &serverLog
 	go func() {
-		<-ctx.Done()
-		serverMutex.Lock()
-		defer serverMutex.Unlock()
-		exitCode := <-done
-		if exitCode > 0 {
-			slog.Warn("server failure", "exit", exitCode)
-		}
-		serverReady = false
-	}()
-
-	// TODO wait only long enough for the server to be responsive...
-	time.Sleep(500 * time.Millisecond)
-
-	serverReady = true
-	return nil
-}
-
-func SpawnServer(ctx context.Context, command, logDir string) (chan int, error) {
-	done := make(chan int)
-	fp, err := os.CreateTemp(logDir, "ollama-server-*.log")
-	if err != nil {
-		return nil, fmt.Errorf("failed to create log file: %w", err)
-	}
-	serverLogFile = fp.Name()
-
-	cmd := exec.CommandContext(ctx, command, "serve")
-	cmd.Stderr = fp
-	cmd.Stdout = fp
-
-	go func() {
-		slog.Info("starting server...")
-		if err := cmd.Run(); err != nil {
-			// "signal: killed" expected
+		slog.Info("starting server", "url", ollamaHost)
+		if err := serverCmd.Run(); err != nil {
+			// "signal: killed" expected during normal shutdown
 			if !strings.Contains(err.Error(), "signal") {
 				slog.Info("failed to run server", "error", err)
 			}
 		}
 		var code int
-		if cmd.ProcessState != nil {
-			code = cmd.ProcessState.ExitCode()
+		if serverCmd.ProcessState != nil {
+			code = serverCmd.ProcessState.ExitCode()
 		}
 		slog.Info("server exited")
-		done <- code
+		serverDone <- code
 	}()
-	return done, nil
+
+	serverReady = true
+	return nil
 }
 
 func PullIfMissing(ctx context.Context, client *api.Client, modelName string) error {
@@ -477,52 +454,65 @@ var serverProcMutex sync.Mutex
 // Starts the server if needed
 func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, string, func()) {
 	client, testEndpoint := GetTestEndpoint()
-	if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-		serverProcMutex.Lock()
-		if err := startServer(t, ctx, testEndpoint); err != nil {
+	cleanup := func() {}
+	if os.Getenv("OLLAMA_TEST_EXISTING") == "" && runtime.GOOS != "windows" {
+		var err error
+		err = startServer(t, ctx, testEndpoint)
+		if err != nil {
 			t.Fatal(err)
 		}
+		cleanup = func() {
+			serverMutex.Lock()
+			defer serverMutex.Unlock()
+			serverReady = false
+
+			slog.Info("shutting down server")
+			serverCmd.Process.Signal(os.Interrupt)
+			slog.Info("waiting for server to exit")
+			<-serverDone
+			slog.Info("terminate complete")
+
+			if t.Failed() {
+				slog.Warn("SERVER LOG FOLLOWS")
+				io.Copy(os.Stderr, &serverLog)
+				slog.Warn("END OF SERVER")
+			}
+			slog.Info("cleanup complete", "failed", t.Failed())
+		}
 	}
 	// Make sure server is online and healthy before returning
-	listCtx, cancel := context.WithDeadlineCause(
-		ctx,
-		time.Now().Add(120*time.Second),
-		fmt.Errorf("list models took too long"),
-	)
-	defer cancel()
-	models, err := client.ListRunning(listCtx)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(models.Models) > 0 {
-		names := make([]string, len(models.Models))
-		for i, m := range models.Models {
-			names[i] = m.Name
+	for {
+		select {
+		case <-ctx.Done():
+			t.Fatalf("context done before server ready: %v", ctx.Err())
+			break
+		default:
 		}
-		slog.Info("currently loaded", "models", names)
+		listCtx, cancel := context.WithDeadlineCause(
+			ctx,
+			time.Now().Add(10*time.Second),
+			fmt.Errorf("list models took too long"),
+		)
+		defer cancel()
+		models, err := client.ListRunning(listCtx)
+		if err != nil {
+			if runtime.GOOS == "windows" {
+				t.Fatalf("did you forget to start the server: %v", err)
+			}
+			time.Sleep(10 * time.Millisecond)
+			continue
+		}
+		if len(models.Models) > 0 {
+			names := make([]string, len(models.Models))
+			for i, m := range models.Models {
+				names[i] = m.Name
+			}
+			slog.Info("currently loaded", "models", names)
+		}
+		break
 	}
 
-	return client, testEndpoint, func() {
-		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-			defer serverProcMutex.Unlock()
-			if t.Failed() {
-				fp, err := os.Open(serverLogFile)
-				if err != nil {
-					slog.Error("failed to open server log", "logfile", serverLogFile, "error", err)
-					return
-				}
-				defer fp.Close()
-				data, err := io.ReadAll(fp)
-				if err != nil {
-					slog.Error("failed to read server log", "logfile", serverLogFile, "error", err)
-					return
-				}
-				slog.Warn("SERVER LOG FOLLOWS")
-				os.Stderr.Write(data)
-				slog.Warn("END OF SERVER")
-			}
-		}
-	}
+	return client, testEndpoint, cleanup
 }
 
 func ChatTestHelper(ctx context.Context, t *testing.T, req api.ChatRequest, anyResp []string) {
diff --git a/kvcache/causal.go b/kvcache/causal.go
index 543a65a6..d804f3bf 100644
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -40,11 +40,6 @@ type Causal struct {
 
 	// ** current forward pass **
 
-	// curReserve indicates that this forward pass is only for
-	// memory reservation and we should not update our metadata
-	// based on it.
-	curReserve bool
-
 	// the active layer for Get and Put
 	curLayer int
 
@@ -206,13 +201,12 @@ func (c *Causal) Close() {
 }
 
 func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
-	c.curReserve = reserve
 	c.curBatchSize = len(batch.Positions)
 	c.curSequences = batch.Sequences
 	c.curPositions = batch.Positions
 	c.opts.Except = nil
 
-	if !c.curReserve {
+	if !reserve {
 		c.updateSlidingWindow()
 
 		var err error
@@ -379,10 +373,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 
 	length := c.curCellRange.max - c.curCellRange.min + 1
 
-	if c.curReserve {
-		return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
-	}
-
 	mask := make([]float32, batchSize*length)
 
 	for i := range c.curBatchSize {
@@ -403,7 +393,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 		mask[i] = float32(math.Inf(-1))
 	}
 
-	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
+	maskTensor := ctx.Input().FromFloats(mask, length, batchSize)
 
 	if c.config.MaskDType != ml.DTypeF32 {
 		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
@@ -735,7 +725,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		offsets = offsets[batchFirst : batchLast+1]
 
 		ctx := c.backend.NewContext()
-		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+		kShift := ctx.Input().FromInts(offsets, len(offsets))
 
 		for i, key := range c.keys {
 			if key == nil {
diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go
index 7e4fc3b1..dd0c0442 100644
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -477,7 +477,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			}
 
 			cache.SetLayer(0)
-			tensor := context.FromFloatSlice(test.in, test.inShape...)
+			tensor := context.FromFloats(test.in, test.inShape...)
 			cache.Put(context, tensor, tensor)
 
 			out, _, mask := cache.Get(context)
@@ -519,7 +519,7 @@ func TestCanResume(t *testing.T) {
 	}
 
 	cache.SetLayer(0)
-	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4, 5}, 1, 1, 5)
+	tensor := context.FromFloats([]float32{1, 2, 3, 4, 5}, 1, 1, 5)
 	cache.Put(context, tensor, tensor)
 
 	// with window size 4, nothing has slid out of the window yet
@@ -549,7 +549,7 @@ func TestCanResume(t *testing.T) {
 	}
 
 	cache.SetLayer(0)
-	tensor = context.FromFloatSlice([]float32{6}, 1, 1, 1)
+	tensor = context.FromFloats([]float32{6}, 1, 1, 1)
 	cache.Put(context, tensor, tensor)
 
 	// only the latest position has overlapping windows
@@ -594,7 +594,7 @@ func TestCanResumeSWAMem(t *testing.T) {
 	}
 
 	cache.SetLayer(0)
-	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4, 5, 6, 7}, 1, 1, 7)
+	tensor := context.FromFloats([]float32{1, 2, 3, 4, 5, 6, 7}, 1, 1, 7)
 	cache.Put(context, tensor, tensor)
 
 	// shift window by adding position 7
@@ -607,7 +607,7 @@ func TestCanResumeSWAMem(t *testing.T) {
 	}
 
 	cache.SetLayer(0)
-	tensor = context.FromFloatSlice([]float32{8}, 1, 1, 1)
+	tensor = context.FromFloats([]float32{8}, 1, 1, 1)
 	cache.Put(context, tensor, tensor)
 
 	// only the latest position has overlapping windows
@@ -670,7 +670,7 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	return c.Empty(dtype, shape...)
 }
 
-func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
+func (c *testContext) FromFloats(s []float32, shape ...int) ml.Tensor {
 	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
 
 	copy(t.data, s)
@@ -678,13 +678,13 @@ func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
 	return t
 }
 
-func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
+func (c *testContext) FromInts(s []int32, shape ...int) ml.Tensor {
 	f := make([]float32, len(s))
 	for i := range f {
 		f[i] = float32(s[i])
 	}
 
-	out := c.FromFloatSlice(f, shape...)
+	out := c.FromFloats(f, shape...)
 	out.(*testTensor).dtype = ml.DTypeI32
 
 	return out
@@ -696,7 +696,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
 		s = append(s, i)
 	}
 
-	out := c.FromFloatSlice(s, len(s))
+	out := c.FromFloats(s, len(s))
 	out.(*testTensor).dtype = dtype
 	return out
 }
diff --git a/llama/patches/0026-GPU-discovery-enhancements.patch b/llama/patches/0026-GPU-discovery-enhancements.patch
index 82513e34..807a4689 100644
--- a/llama/patches/0026-GPU-discovery-enhancements.patch
+++ b/llama/patches/0026-GPU-discovery-enhancements.patch
@@ -5,24 +5,33 @@ Subject: [PATCH] GPU discovery enhancements
 
 Expose more information about the devices through backend props, and leverage
 management libraries for more accurate VRAM usage reporting if available.
+
+vulkan: get GPU ID (ollama v0.11.5)
+
+Signed-off-by: Xiaodong Ye 
+
+Vulkan PCI and Memory
+
+fix vulkan PCI ID and ID handling
 ---
- ggml/include/ggml-backend.h        |  11 +
- ggml/src/CMakeLists.txt            |   2 +
- ggml/src/ggml-cuda/ggml-cuda.cu    |  74 +++++
- ggml/src/ggml-cuda/vendors/hip.h   |   3 +
- ggml/src/ggml-impl.h               |   8 +
- ggml/src/ggml-metal/ggml-metal.cpp |   2 +
- ggml/src/mem_hip.cpp               | 449 +++++++++++++++++++++++++++++
- ggml/src/mem_nvml.cpp              | 209 ++++++++++++++
- 8 files changed, 758 insertions(+)
+ ggml/include/ggml-backend.h          |   8 +
+ ggml/src/CMakeLists.txt              |   2 +
+ ggml/src/ggml-cuda/ggml-cuda.cu      |  65 ++++
+ ggml/src/ggml-cuda/vendors/hip.h     |   3 +
+ ggml/src/ggml-impl.h                 |   8 +
+ ggml/src/ggml-metal/ggml-metal.cpp   |   2 +
+ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++--
+ ggml/src/mem_hip.cpp                 | 452 +++++++++++++++++++++++++++
+ ggml/src/mem_nvml.cpp                | 209 +++++++++++++
+ 9 files changed, 931 insertions(+), 30 deletions(-)
  create mode 100644 ggml/src/mem_hip.cpp
  create mode 100644 ggml/src/mem_nvml.cpp
 
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index ba181d09d..094fc3c82 100644
+index ba181d09d..809835243 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
-@@ -169,6 +169,17 @@ extern "C" {
+@@ -169,6 +169,14 @@ extern "C" {
          const char * device_id;
          // device capabilities
          struct ggml_backend_dev_caps caps;
@@ -31,9 +40,6 @@ index ba181d09d..094fc3c82 100644
 +        int compute_major;
 +        int compute_minor;
 +        int integrated;
-+        int pci_bus_id;
-+        int pci_device_id;
-+        int pci_domain_id;
 +        const char *library;
 +        // number with which the devices are accessed (Vulkan)
 +        const char *numeric_id;
@@ -54,7 +60,7 @@ index 0609c6503..aefe43bdd 100644
  
  target_include_directories(ggml-base PRIVATE .)
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 87c6c34a4..816597d2f 100644
+index 87c6c34a4..b075a18be 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -86,7 +92,7 @@ index 87c6c34a4..816597d2f 100644
          GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
                          id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
                          ggml_cuda_parse_uuid(prop, id).c_str());
-@@ -3484,6 +3499,14 @@ struct ggml_backend_cuda_device_context {
+@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context {
      std::string description;
      std::string pci_bus_id;
      std::string id;
@@ -95,22 +101,19 @@ index 87c6c34a4..816597d2f 100644
 +    int driver_major;
 +    int driver_minor;
 +    int integrated;
-+    int pciBusID;
-+    int pciDeviceID;
-+    int pciDomainID;
  };
  
  static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -3504,6 +3527,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
  static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
      ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
      ggml_cuda_set_device(ctx->device);
 +
 +#if defined(GGML_USE_HIP)
 +    if (ggml_hip_mgmt_init() == 0) {
-+        int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
++        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
 +        if (status == 0) {
-+            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
++            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
 +            ggml_hip_mgmt_release();
 +            return;
 +        }
@@ -120,7 +123,7 @@ index 87c6c34a4..816597d2f 100644
 +    if (ggml_nvml_init() == 0) {
 +        int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
 +        if (status == 0) {
-+            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
++            GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
 +            ggml_nvml_release();
 +            return;
 +        }
@@ -130,7 +133,7 @@ index 87c6c34a4..816597d2f 100644
      CUDA_CHECK(cudaMemGetInfo(free, total));
  }
  
-@@ -3512,6 +3557,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
      return GGML_BACKEND_DEVICE_TYPE_GPU;
  }
  
@@ -138,7 +141,7 @@ index 87c6c34a4..816597d2f 100644
  static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
      ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
  
-@@ -3525,6 +3571,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
      // If you need the memory data, call ggml_backend_dev_memory() explicitly.
      props->memory_total = props->memory_free = 0;
  
@@ -153,15 +156,12 @@ index 87c6c34a4..816597d2f 100644
 +    props->driver_major = ctx->driver_major;
 +    props->driver_minor = ctx->driver_minor;
 +    props->integrated = ctx->integrated;
-+    props->pci_bus_id = ctx->pciBusID;
-+    props->pci_device_id = ctx->pciDeviceID;
-+    props->pci_domain_id = ctx->pciDomainID;
 +    props->library = GGML_CUDA_NAME;
 +
      bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
  #ifdef GGML_CUDA_NO_PEER_COPY
      bool events = false;
-@@ -4087,6 +4149,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
          std::lock_guard<std::mutex> lock(mutex);
          if (!initialized) {
              ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@@ -169,7 +169,7 @@ index 87c6c34a4..816597d2f 100644
  
              for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                  ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
-@@ -4102,6 +4165,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                  snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                  dev_ctx->pci_bus_id = pci_bus_id;
  
@@ -181,9 +181,6 @@ index 87c6c34a4..816597d2f 100644
 +                dev_ctx->driver_major = driverVersion / 1000;
 +                dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
 +                dev_ctx->integrated = prop.integrated;
-+                dev_ctx->pciBusID = prop.pciBusID;
-+                dev_ctx->pciDeviceID = prop.pciDeviceID;
-+                dev_ctx->pciDomainID = prop.pciDomainID;
                  ggml_backend_dev_t dev = new ggml_backend_device {
                      /* .iface   = */ ggml_backend_cuda_device_interface,
                      /* .reg     = */ ®,
@@ -209,7 +206,7 @@ index 1f06be80e..2f9ef2dc0 100644
  #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
  #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
-index d0fb3bcca..80597b6ea 100644
+index d0fb3bcca..b63edd0c1 100644
 --- a/ggml/src/ggml-impl.h
 +++ b/ggml/src/ggml-impl.h
 @@ -638,6 +638,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
@@ -221,7 +218,7 @@ index d0fb3bcca..80597b6ea 100644
 +GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
 +GGML_API void ggml_nvml_release();
 +GGML_API int ggml_hip_mgmt_init();
-+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
++GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
 +GGML_API void ggml_hip_mgmt_release();
 +
  #ifdef __cplusplus
@@ -247,12 +244,319 @@ index f2ff9f322..f356e4a0a 100644
      props->caps = {
          /* .async                 = */ true,
          /* .host_buffer           = */ false,
+diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+index ed83236f4..0bbcecd01 100644
+--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+@@ -231,6 +231,7 @@ class vk_memory_logger;
+ #endif
+ class vk_perf_logger;
+ static void ggml_vk_destroy_buffer(vk_buffer& buf);
++static std::string ggml_vk_get_device_id(int device);
+ 
+ static constexpr uint32_t mul_mat_vec_max_cols = 8;
+ static constexpr uint32_t p021_max_gqa_ratio = 8;
+@@ -11585,6 +11586,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
+     snprintf(description, description_size, "%s", props.deviceName.data());
+ }
+ 
++static std::string ggml_vk_get_device_id(int device) {
++    ggml_vk_instance_init();
++
++    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
++
++    vk::PhysicalDeviceProperties2 props;
++    vk::PhysicalDeviceIDProperties deviceIDProps;
++    props.pNext = &deviceIDProps;
++    devices[device].getProperties2(&props);
++
++    const auto& uuid = deviceIDProps.deviceUUID;
++    char id[64];
++    snprintf(id, sizeof(id),
++        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
++        uuid[0], uuid[1], uuid[2], uuid[3],
++        uuid[4], uuid[5],
++        uuid[6], uuid[7],
++        uuid[8], uuid[9],
++        uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
++    );
++    return std::string(id);
++}
++
+ // backend interface
+ 
+ #define UNUSED GGML_UNUSED
+@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
+     ggml_vk_get_device_description(dev_idx, description, description_size);
+ }
+ 
+-void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
++std::string ggml_backend_vk_get_device_id(int device) {
+     GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+-    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
++    int dev_idx = vk_instance.device_indices[device];
++    return ggml_vk_get_device_id(dev_idx);
++}
++
++//////////////////////////
++
++struct ggml_backend_vk_device_context {
++    size_t device;
++    std::string name;
++    std::string description;
++    bool is_integrated_gpu;
++    // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
++    std::string pci_id;
++    std::string id;
++    std::string uuid;
++    std::string numeric_id;
++    int major;
++    int minor;
++    int driver_major;
++    int driver_minor;
++};
++
++void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
++    GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
++    GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
++
++    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
+ 
+-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+-    vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
+-    vk::PhysicalDeviceMemoryProperties2 memprops = {};
+-    bool membudget_supported = vk_instance.device_supports_membudget[device];
++    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
++    vk::PhysicalDeviceProperties2 props2;
++    vkdev.getProperties2(&props2);
+ 
+-    if (membudget_supported) {
+-        memprops.pNext = &budgetprops;
++    if (!ctx->is_integrated_gpu)
++    {
++        // Use vendor specific management libraries for best VRAM reporting if available
++        switch (props2.properties.vendorID) {
++        case VK_VENDOR_ID_AMD:
++            if (ggml_hip_mgmt_init() == 0) {
++                int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
++                if (status == 0) {
++                    GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
++                    ggml_hip_mgmt_release();
++                    return;
++                }
++                ggml_hip_mgmt_release();
++            }
++            break;
++        case VK_VENDOR_ID_NVIDIA:
++            if (ggml_nvml_init() == 0) {
++                int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
++                if (status == 0) {
++                    GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
++                    ggml_nvml_release();
++                    return;
++                }
++                ggml_nvml_release();
++            }
++            break;
++        }
+     }
+-    vkdev.getMemoryProperties2(&memprops);
++    // else fallback to memory budget if supported
+ 
+-    for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
+-        const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
++    *total = 0;
++    *free = 0;
++    vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
++    vk::PhysicalDeviceMemoryProperties2 memprops2;
++    memprops2.pNext = &mem_budget_props;
++    vkdev.getMemoryProperties2(&memprops2);
++    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
++        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
++            *total += memprops2.memoryProperties.memoryHeaps[i].size;
++        } else if (ctx->is_integrated_gpu) {
++            // Include shared memory on iGPUs
++            *total += memprops2.memoryProperties.memoryHeaps[i].size;
++        }
++    }
++    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
++        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
++            *free += mem_budget_props.heapBudget[i];
++        } else if (ctx->is_integrated_gpu) {
++            *free += mem_budget_props.heapBudget[i];
++        }
++    }
++    if (*total > 0 && *free > 0) {
++        return;
++    } else if (*total > 0) {
++        *free = *total;
++        return;
++    }
+ 
++    // else just report the physical memory
++    for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
+         if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+             *total = heap.size;
+-
+-            if (membudget_supported && i < budgetprops.heapUsage.size()) {
+-                *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
+-            } else {
+-                *free = heap.size;
+-            }
++            *free = heap.size;
+             break;
+         }
+     }
+@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+         }
+     }
+ 
++    vk::PhysicalDeviceProperties2 props2;
+     if (!ext_support) {
+-        return "";
++        device.getProperties2(&props2);
++        if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
++            return "";
++        }
++        // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
+     }
+ 
+     vk::PhysicalDeviceProperties2 props = {};
+@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+ 
+     char pci_bus_id[16] = {};
+     snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
++    if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
++        return "";
++    }
+ 
+     return std::string(pci_bus_id);
+ }
+ 
+-//////////////////////////
+-
+-struct ggml_backend_vk_device_context {
+-    size_t device;
+-    std::string name;
+-    std::string description;
+-    bool is_integrated_gpu;
+-    std::string pci_bus_id;
+-};
++static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
++    if (id.empty()) return false;
++    unsigned int d = 0, b = 0, dev = 0, func = 0;
++    // Expected format: dddd:bb:dd.f (all hex)
++    int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
++    if (n < 4) return false;
++    if (domain) *domain = (int) d;
++    if (bus) *bus = (int) b;
++    if (device) *device = (int) dev;
++    return true;
++}
+ 
+ static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
+     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
+     return ctx->description.c_str();
+ }
+ 
++static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
++    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
++    return ctx->id.c_str();
++}
++
+ static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+-    ggml_backend_vk_get_device_memory(ctx->device, free, total);
++    ggml_backend_vk_get_device_memory(ctx, free, total);
+ }
+ 
+ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
+@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+ 
+     props->name        = ggml_backend_vk_device_get_name(dev);
+     props->description = ggml_backend_vk_device_get_description(dev);
++    props->id          = ggml_backend_vk_device_get_id(dev);
+     props->type        = ggml_backend_vk_device_get_type(dev);
+-    props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
++    props->device_id   = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
+     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
+     props->caps = {
+         /* .async                 = */ false,
+@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
+         /* .buffer_from_host_ptr  = */ false,
+         /* .events                = */ false,
+     };
++
++    props->compute_major = ctx->major;
++    props->compute_minor = ctx->minor;
++    props->driver_major = ctx->driver_major;
++    props->driver_minor = ctx->driver_minor;
++    props->integrated = ctx->is_integrated_gpu;
++    props->library = GGML_VK_NAME;
++    props->numeric_id = ctx->numeric_id.c_str();
+ }
+ 
+ static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
+@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+         static std::mutex mutex;
+         std::lock_guard<std::mutex> lock(mutex);
+         if (!initialized) {
++            std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
++
+             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
+                 char desc[256];
+@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+                 ctx->name = GGML_VK_NAME + std::to_string(i);
+                 ctx->description = desc;
+                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
+-                ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
++                ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
++                ctx->id = ggml_backend_vk_get_device_id(i);
+                 devices.push_back(new ggml_backend_device {
+                     /* .iface   = */ ggml_backend_vk_device_i,
+                     /* .reg     = */ reg,
+                     /* .context = */ ctx,
+                 });
++
++                // Gather additional information about the device
++                int dev_idx = vk_instance.device_indices[i];
++                vk::PhysicalDeviceProperties props1;
++                vk_devices[dev_idx].getProperties(&props1);
++                vk::PhysicalDeviceProperties2 props2;
++                vk::PhysicalDeviceIDProperties device_id_props;
++                vk::PhysicalDevicePCIBusInfoPropertiesEXT  pci_bus_props;
++                vk::PhysicalDeviceDriverProperties driver_props;
++                props2.pNext = &device_id_props;
++                device_id_props.pNext = &pci_bus_props;
++                pci_bus_props.pNext = &driver_props;
++                vk_devices[dev_idx].getProperties2(&props2);
++                std::ostringstream oss;
++                oss << std::hex << std::setfill('0');
++                int byteIdx = 0;
++                for (int i = 0; i < 16; ++i, ++byteIdx) {
++                    oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
++                    if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
++                        oss << '-';
++                    }
++                }
++                ctx->uuid = oss.str();
++                ctx->major = 0;
++                ctx->minor = 0;
++                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
++                ctx->driver_major = 0;
++                ctx->driver_minor = 0;
++                ctx->numeric_id = std::to_string(i);
+             }
+             initialized = true;
+         }
 diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
 new file mode 100644
-index 000000000..8ef19b8cf
+index 000000000..5a7f5d465
 --- /dev/null
 +++ b/ggml/src/mem_hip.cpp
-@@ -0,0 +1,449 @@
+@@ -0,0 +1,452 @@
 +#include "ggml.h"
 +
 +#ifdef _WIN32
@@ -586,7 +890,7 @@ index 000000000..8ef19b8cf
 +    if (gpus != NULL) gpus->pVtbl->Release(gpus); \
 +    if (gpu != NULL) gpu->pVtbl->Release(gpu)
 +
-+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
++int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
 +    std::lock_guard<std::mutex> lock(ggml_adlx_lock);
 +    if (adlx.handle == NULL) {
 +        GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -598,9 +902,13 @@ index 000000000..8ef19b8cf
 +    IADLXGPU* gpu = NULL;
 +    IADLXGPUMetrics *gpuMetrics = NULL;
 +    ADLX_RESULT status;
-+    // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs 
-+    adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
 +
++    uint32_t pci_domain, pci_bus, pci_device, pci_function;
++    if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
++        // TODO - parse other formats?
++        GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
++        return ADLX_NOT_FOUND;
++    }
 +    status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
 +    if (ADLX_FAILED(status)) {
 +        GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -623,16 +931,15 @@ index 000000000..8ef19b8cf
 +            GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
 +            continue;
 +        }
-+        adlx_int id;
-+        status = gpu->pVtbl->UniqueId(gpu, &id);
++        adlx_int uniqueID;
++        status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
 +        if (ADLX_FAILED(status)) {
 +            GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
 +            gpu->pVtbl->Release(gpu);
 +            gpu = NULL;
 +            continue;
 +        }
-+        if (id != target) {
-+            GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
++        if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
 +            gpu->pVtbl->Release(gpu);
 +            gpu = NULL;
 +            continue;
@@ -695,7 +1002,7 @@ index 000000000..8ef19b8cf
 +    return -1;
 +}
 +void ggml_hip_mgmt_release() {}
-+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
++int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
 +    return -1;
 +}
 +
diff --git a/llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch
similarity index 99%
rename from llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch
rename to llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch
index 9ba11168..ec3fdbaa 100644
--- a/llama/patches/0029-NVML-fallback-for-unified-memory-GPUs.patch
+++ b/llama/patches/0027-NVML-fallback-for-unified-memory-GPUs.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs
  1 file changed, 68 insertions(+), 3 deletions(-)
 
 diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
-index c9073cef..f473a2a2 100644
+index c9073cef0..f473a2a2c 100644
 --- a/ggml/src/mem_nvml.cpp
 +++ b/ggml/src/mem_nvml.cpp
 @@ -13,6 +13,7 @@
diff --git a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch b/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch
deleted file mode 100644
index 997dd386..00000000
--- a/llama/patches/0027-vulkan-get-GPU-ID-ollama-v0.11.5.patch
+++ /dev/null
@@ -1,95 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Xiaodong Ye 
-Date: Mon, 18 Aug 2025 12:48:07 +0800
-Subject: [PATCH] vulkan: get GPU ID (ollama v0.11.5)
-
-Signed-off-by: Xiaodong Ye 
----
- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 ++++++++++++++++++++++++++++
- 1 file changed, 37 insertions(+)
-
-diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 061cd078..adea7783 100644
---- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -11588,6 +11588,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
-     snprintf(description, description_size, "%s", props.deviceName.data());
- }
-
-+static std::string ggml_vk_get_device_id(int device) {
-+    ggml_vk_instance_init();
-+
-+    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
-+
-+    vk::PhysicalDeviceProperties2 props;
-+    vk::PhysicalDeviceIDProperties deviceIDProps;
-+    props.pNext = &deviceIDProps;
-+    devices[device].getProperties2(&props);
-+
-+    const auto& uuid = deviceIDProps.deviceUUID;
-+    char id[64];
-+    snprintf(id, sizeof(id),
-+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-+        uuid[0], uuid[1], uuid[2], uuid[3],
-+        uuid[4], uuid[5],
-+        uuid[6], uuid[7],
-+        uuid[8], uuid[9],
-+        uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]
-+    );
-+    return std::string(id);
-+}
-+
- // backend interface
-
- #define UNUSED GGML_UNUSED
-@@ -12394,6 +12417,12 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
-     ggml_vk_get_device_description(dev_idx, description, description_size);
- }
-
-+std::string ggml_backend_vk_get_device_id(int device) {
-+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-+    int dev_idx = vk_instance.device_indices[device];
-+    return ggml_vk_get_device_id(dev_idx);
-+}
-+
- void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-     GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-     GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-@@ -12481,6 +12510,7 @@ struct ggml_backend_vk_device_context {
-     std::string description;
-     bool is_integrated_gpu;
-     std::string pci_bus_id;
-+    std::string id;
- };
-
- static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
-@@ -12493,6 +12523,11 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
-     return ctx->description.c_str();
- }
-
-+static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
-+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-+    return ctx->id.c_str();
-+}
-+
- static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
-     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
-     ggml_backend_vk_get_device_memory(ctx->device, free, total);
-@@ -12519,6 +12554,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
-
-     props->name        = ggml_backend_vk_device_get_name(dev);
-     props->description = ggml_backend_vk_device_get_description(dev);
-+    props->id          = ggml_backend_vk_device_get_id(dev);
-     props->type        = ggml_backend_vk_device_get_type(dev);
-     props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
-     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
-@@ -12965,6 +13001,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-                 ctx->description = desc;
-                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
-                 ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
-+                ctx->id = ggml_backend_vk_get_device_id(i);
-                 devices.push_back(new ggml_backend_device {
-                     /* .iface   = */ ggml_backend_vk_device_i,
-                     /* .reg     = */ reg,
--- 
-2.51.0
\ No newline at end of file
diff --git a/llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
similarity index 97%
rename from llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
rename to llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
index c3c7fedf..f5861a8c 100644
--- a/llama/patches/0030-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
+++ b/llama/patches/0028-CUDA-Changing-the-CUDA-scheduling-strategy-to-spin-1.patch
@@ -28,7 +28,7 @@ Co-authored-by: Johannes Gäßler 
  1 file changed, 9 insertions(+)
 
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 6a278b5e9..87941f872 100644
+index b075a18be..d62f412d6 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -340,6 +340,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
diff --git a/llama/patches/0028-vulkan-pci-and-memory.patch b/llama/patches/0028-vulkan-pci-and-memory.patch
deleted file mode 100644
index c20ccf5c..00000000
--- a/llama/patches/0028-vulkan-pci-and-memory.patch
+++ /dev/null
@@ -1,254 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen 
-Date:   Fri Sep 5 08:25:03 2025 -0700
-Subject: [PATCH] Vulkan PCI and Memory
-
----
- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 176 ++++++++++++++++++++++-----
- 1 file changed, 145 insertions(+), 31 deletions(-)
-
-diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index adea7783..fb7204ce 100644
---- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -12423,31 +12423,99 @@ std::string ggml_backend_vk_get_device_id(int device) {
-     return ggml_vk_get_device_id(dev_idx);
- }
- 
--void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
--    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
--    GGML_ASSERT(device < (int) vk_instance.device_supports_membudget.size());
-+//////////////////////////
-+
-+struct ggml_backend_vk_device_context {
-+    size_t device;
-+    std::string name;
-+    std::string description;
-+    bool is_integrated_gpu;
-+    // Combined string id in the form "dddd:bb:dd.f" (domain:bus:device.function)
-+    std::string pci_id;
-+    std::string id;
-+    std::string uuid;
-+    int major;
-+    int minor;
-+    int driver_major;
-+    int driver_minor;
-+    int pci_bus_id;
-+    int pci_device_id;
-+    int pci_domain_id;
-+};
-+
-+void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
-+    GGML_ASSERT(ctx->device < (int) vk_instance.device_indices.size());
-+    GGML_ASSERT(ctx->device < (int) vk_instance.device_supports_membudget.size());
-+
-+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[ctx->device]];
- 
--    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
--    vk::PhysicalDeviceMemoryBudgetPropertiesEXT budgetprops;
--    vk::PhysicalDeviceMemoryProperties2 memprops = {};
--    bool membudget_supported = vk_instance.device_supports_membudget[device];
-+    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
-+    vk::PhysicalDeviceProperties2 props2;
-+    vkdev.getProperties2(&props2);
- 
--    if (membudget_supported) {
--        memprops.pNext = &budgetprops;
-+    if (!ctx->is_integrated_gpu)
-+    {
-+        // Use vendor specific management libraries for best VRAM reporting if available
-+        switch (props2.properties.vendorID) {
-+        case VK_VENDOR_ID_AMD:
-+            if (ggml_hip_mgmt_init() == 0) {
-+                int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
-+                if (status == 0) {
-+                    GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
-+                    ggml_hip_mgmt_release();
-+                    return;
-+                }
-+                ggml_hip_mgmt_release();
-+            }
-+            break;
-+        case VK_VENDOR_ID_NVIDIA:
-+            if (ggml_nvml_init() == 0) {
-+                int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
-+                if (status == 0) {
-+                    GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
-+                    ggml_nvml_release();
-+                    return;
-+                }
-+                ggml_nvml_release();
-+            }
-+            break;
-+        }
-     }
--    vkdev.getMemoryProperties2(&memprops);
-+    // else fallback to memory budget if supported
- 
--    for (uint32_t i = 0; i < memprops.memoryProperties.memoryHeapCount; ++i) {
--        const vk::MemoryHeap & heap = memprops.memoryProperties.memoryHeaps[i];
-+    *total = 0;
-+    *free = 0;
-+    vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
-+    vk::PhysicalDeviceMemoryProperties2 memprops2;
-+    memprops2.pNext = &mem_budget_props;
-+    vkdev.getMemoryProperties2(&memprops2);
-+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
-+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
-+        } else if (ctx->is_integrated_gpu) {
-+            // Include shared memory on iGPUs
-+            *total += memprops2.memoryProperties.memoryHeaps[i].size;
-+        }
-+    }
-+    for (int i = 0; i < memprops2.memoryProperties.memoryHeapCount; i++) {
-+        if (memprops2.memoryProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-+            *free += mem_budget_props.heapBudget[i];
-+        } else if (ctx->is_integrated_gpu) {
-+            *free += mem_budget_props.heapBudget[i];
-+        }
-+    }
-+    if (*total > 0 && *free > 0) {
-+        return;
-+    } else if (*total > 0) {
-+        *free = *total;
-+        return;
-+    }
- 
-+    // else just report the physical memory
-+    for (const vk::MemoryHeap& heap : memprops2.memoryProperties.memoryHeaps) {
-         if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-             *total = heap.size;
--
--            if (membudget_supported && i < budgetprops.heapUsage.size()) {
--                *free = budgetprops.heapBudget[i] - budgetprops.heapUsage[i];
--            } else {
--                *free = heap.size;
--            }
-+            *free = heap.size;
-             break;
-         }
-     }
-@@ -12502,16 +12570,17 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
-     return std::string(pci_bus_id);
- }
- 
--//////////////////////////
--
--struct ggml_backend_vk_device_context {
--    size_t device;
--    std::string name;
--    std::string description;
--    bool is_integrated_gpu;
--    std::string pci_bus_id;
--    std::string id;
--};
-+static bool ggml_backend_vk_parse_pci_bus_id(const std::string & id, int *domain, int *bus, int *device) {
-+    if (id.empty()) return false;
-+    unsigned int d = 0, b = 0, dev = 0, func = 0;
-+    // Expected format: dddd:bb:dd.f (all hex)
-+    int n = sscanf(id.c_str(), "%4x:%2x:%2x.%1x", &d, &b, &dev, &func);
-+    if (n < 4) return false;
-+    if (domain) *domain = (int) d;
-+    if (bus) *bus = (int) b;
-+    if (device) *device = (int) dev;
-+    return true;
-+}
- 
- static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
-     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-@@ -12530,7 +12599,7 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
- 
- static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
-     ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
--    ggml_backend_vk_get_device_memory(ctx->device, free, total);
-+    ggml_backend_vk_get_device_memory(ctx, free, total);
- }
- 
- static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
-@@ -12556,7 +12625,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
-     props->description = ggml_backend_vk_device_get_description(dev);
-     props->id          = ggml_backend_vk_device_get_id(dev);
-     props->type        = ggml_backend_vk_device_get_type(dev);
--    props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
-+    props->device_id   = ctx->pci_id.empty() ? nullptr : ctx->pci_id.c_str();
-     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
-     props->caps = {
-         /* .async                 = */ false,
-@@ -12564,6 +12633,17 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
-         /* .buffer_from_host_ptr  = */ false,
-         /* .events                = */ false,
-     };
-+
-+    props->compute_major = ctx->major;
-+    props->compute_minor = ctx->minor;
-+    props->driver_major = ctx->driver_major;
-+    props->driver_minor = ctx->driver_minor;
-+    props->integrated = ctx->is_integrated_gpu;
-+    props->pci_bus_id = ctx->pci_bus_id;
-+    props->pci_device_id = ctx->pci_device_id;
-+    props->pci_domain_id = ctx->pci_domain_id;
-+    props->library = GGML_VK_NAME;
-+    props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
- }
- 
- static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
-@@ -12992,6 +13071,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-         static std::mutex mutex;
-         std::lock_guard<std::mutex> lock(mutex);
-         if (!initialized) {
-+            std::vector<vk::PhysicalDevice> vk_devices = vk_instance.instance.enumeratePhysicalDevices();
-+
-             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
-                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
-                 char desc[256];
-@@ -13000,13 +13081,46 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
-                 ctx->name = GGML_VK_NAME + std::to_string(i);
-                 ctx->description = desc;
-                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
--                ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
-+                ctx->pci_id = ggml_backend_vk_get_device_pci_id(i);
-                 ctx->id = ggml_backend_vk_get_device_id(i);
-                 devices.push_back(new ggml_backend_device {
-                     /* .iface   = */ ggml_backend_vk_device_i,
-                     /* .reg     = */ reg,
-                     /* .context = */ ctx,
-                 });
-+
-+                // Gather additional information about the device
-+                int dev_idx = vk_instance.device_indices[i];
-+                vk::PhysicalDeviceProperties props1;
-+                vk_devices[dev_idx].getProperties(&props1);
-+                vk::PhysicalDeviceProperties2 props2;
-+                vk::PhysicalDeviceIDProperties device_id_props;
-+                vk::PhysicalDevicePCIBusInfoPropertiesEXT  pci_bus_props;
-+                vk::PhysicalDeviceDriverProperties driver_props;
-+                props2.pNext = &device_id_props;
-+                device_id_props.pNext = &pci_bus_props;
-+                pci_bus_props.pNext = &driver_props;
-+                vk_devices[dev_idx].getProperties2(&props2);
-+                std::ostringstream oss;
-+                oss << std::hex << std::setfill('0');
-+                oss << "GPU-";
-+                int byteIdx = 0;
-+                for (int i = 0; i < 16; ++i, ++byteIdx) {
-+                    oss << std::setw(2) << static_cast<uint32_t>(device_id_props.deviceUUID[i]);
-+                    if (byteIdx == 3 || byteIdx == 5 || byteIdx == 7 || byteIdx == 9) {
-+                        oss << '-';
-+                    }
-+                }
-+                ctx->uuid = oss.str();
-+                ctx->pci_bus_id = pci_bus_props.pciBus;
-+                ctx->pci_device_id = pci_bus_props.pciDevice;
-+                ctx->pci_domain_id = pci_bus_props.pciDomain;
-+                ctx->id = std::to_string(i);
-+                ctx->major = 0;
-+                ctx->minor = 0;
-+                // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
-+                ctx->driver_major = 0;
-+                ctx->driver_minor = 0;
-             }
-             initialized = true;
-         }
--- 
-2.51.0
\ No newline at end of file
diff --git a/llama/patches/0031-report-LoadLibrary-failures.patch b/llama/patches/0029-report-LoadLibrary-failures.patch
similarity index 100%
rename from llama/patches/0031-report-LoadLibrary-failures.patch
rename to llama/patches/0029-report-LoadLibrary-failures.patch
diff --git a/llm/memory.go b/llm/memory.go
index aa4927f1..15558109 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -4,27 +4,28 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
+	"slices"
 	"sort"
 	"strings"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
 )
 
 // pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
-func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
-	for _, gl := range gpus.ByLibrary() {
-		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
+func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
+	for _, gl := range ml.ByLibrary(gpus) {
+		sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl)))
 
 		if !envconfig.SchedSpread() {
 			// Try to pack into as few GPUs as possible, starting from 1 GPU
@@ -63,8 +64,8 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
-	byLibrary := gpus.ByLibrary()
+func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
+	byLibrary := ml.ByLibrary(gpus)
 	if len(byLibrary) <= 1 {
 		return gpus
 	}
@@ -81,10 +82,10 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 }
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
+func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
-	for _, gpus := range allGpus.ByLibrary() {
+	for _, gpus := range ml.ByLibrary(allGpus) {
 		var layerCount int
 		estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
 		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
@@ -97,14 +98,23 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
 				return true, estimatedVRAM
 			}
 		}
-
-		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
-			return true, estimatedVRAM
-		}
 	}
 	return false, estimatedVRAM
 }
 
+func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool {
+	estimate := estimateGPULayers(nil, f, projectors, opts, numParallel)
+	if estimate.TotalSize > systemInfo.FreeMemory {
+		return false
+	}
+	slog.Info("new model will fit in available system memory for CPU inference, loading",
+		"model", modelPath,
+		"parallel", numParallel,
+		"required", format.HumanBytes2(estimate.TotalSize),
+	)
+	return true
+}
+
 type MemoryEstimate struct {
 	// How many layers we predict we can load
 	Layers int
@@ -141,7 +151,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
+func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64
 
@@ -175,10 +185,17 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
 	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
+	libraries := []string{}
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+		if !slices.Contains(libraries, gpu.Library) {
+			libraries = append(libraries, gpu.Library)
+		}
 	}
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
+	if len(libraries) == 0 {
+		libraries = []string{"cpu"}
+	}
+	slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
 		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
@@ -196,7 +213,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
-		(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
+		ml.FlashAttentionSupported(gpus) &&
 		f.SupportsFlashAttention()
 
 	var kvct string
@@ -231,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	// on metal there's no partial offload overhead
-	if gpus[0].Library == "Metal" {
+	if len(gpus) > 0 && gpus[0].Library == "Metal" {
 		graphPartialOffload = graphFullOffload
 	} else if len(gpus) > 1 {
 		// multigpu should always use the partial graph size
@@ -256,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	gpuAllocations := make([]uint64, len(gpus))
 	type gs struct {
 		i int
-		g *discover.GpuInfo
+		g *ml.DeviceInfo
 	}
 	gpusWithSpace := []gs{}
 	for i := range gpus {
@@ -265,19 +282,11 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
-			var compute string
-			if gpus[i].Library == "ROCm" {
-				compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
-			} else {
-				compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
-			}
-
+		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers",
 				"id", gpus[i].ID,
 				"library", gpus[i].Library,
-				"variant", gpus[i].Variant,
-				"compute", compute,
+				"compute", gpus[i].Compute(),
 				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
 				"name", gpus[i].Name,
 				"total", format.HumanBytes2(gpus[i].TotalMemory),
@@ -291,7 +300,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
+		gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full
 	}
 
 	var gpuZeroID int
@@ -397,7 +406,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		VRAMSize:  0,
 		GPUSizes:  []uint64{},
 
-		inferenceLibrary:    gpus[0].Library,
+		inferenceLibrary:    strings.Join(libraries, ","),
 		layersRequested:     opts.NumGPU,
 		layersModel:         int(f.KV().BlockCount()) + 1,
 		availableList:       availableList,
@@ -411,7 +420,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		projectorGraph:      ollamaEngineProjectorGraph,
 	}
 
-	if gpus[0].Library == "cpu" {
+	if len(gpus) == 0 {
 		return estimate
 	}
 	if layerCount == 0 {
diff --git a/llm/memory_test.go b/llm/memory_test.go
index 553214b9..fce17b9c 100644
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
 )
@@ -54,13 +54,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 
 	// Simple CPU scenario
-	gpus := []discover.GpuInfo{
-		{
-			DeviceID: ml.DeviceID{
-				Library: "cpu",
-			},
-		},
-	}
+	gpus := []ml.DeviceInfo{}
 	projectors := []string{}
 	opts := api.DefaultOptions()
 	t.Run("cpu", func(t *testing.T) {
@@ -77,19 +71,17 @@ func TestEstimateGPULayers(t *testing.T) {
 	memoryLayerOutput := uint64(4)
 
 	// Dual CUDA scenario with asymmetry
-	gpuMinimumMemory := uint64(2048)
-	gpus = []discover.GpuInfo{
+	gpuMinimumMemory := uint64(457 * format.MebiByte)
+	gpus = []ml.DeviceInfo{
 		{
 			DeviceID: ml.DeviceID{
-				Library: "cuda",
+				Library: "CUDA",
 			},
-			MinimumMemory: gpuMinimumMemory,
 		},
 		{
 			DeviceID: ml.DeviceID{
-				Library: "cuda",
+				Library: "CUDA",
 			},
-			MinimumMemory: gpuMinimumMemory,
 		},
 	}
 	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
diff --git a/llm/server.go b/llm/server.go
index 6ba8f8d2..f8b232df 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -27,7 +27,6 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
@@ -66,7 +65,7 @@ func (e filteredEnv) LogValue() slog.Value {
 
 type LlamaServer interface {
 	ModelPath() string
-	Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
+	Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -115,7 +114,7 @@ type llamaServer struct {
 	llmServer
 
 	ggml     *ggml.GGML
-	gpus     discover.GpuInfoList // The set of GPUs covered by the memory estimate
+	gpus     []ml.DeviceInfo // The set of GPUs covered by the memory estimate
 	estimate MemoryEstimate
 }
 
@@ -146,7 +145,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 }
 
 // NewLlamaServer will run a server for the given GPUs
-func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var llamaModel *llama.Model
 	var textProcessor model.TextProcessor
 	var err error
@@ -179,7 +178,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 
 	loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
 
-	defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
+	defaultThreads := systemInfo.ThreadCount
 	if opts.NumThread > 0 {
 		loadRequest.NumThreads = opts.NumThread
 	} else if defaultThreads > 0 {
@@ -200,7 +199,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 
 	// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
 	// that can handle it.
-	if fa && !gpus.FlashAttentionSupported() {
+	if fa && !ml.FlashAttentionSupported(gpus) {
 		slog.Warn("flash attention enabled but not supported by gpu")
 		fa = false
 	}
@@ -227,218 +226,170 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
 	}
 
-	availableLibs := make(map[string]string)
-	if entries, err := os.ReadDir(discover.LibOllamaPath); err == nil {
-		for _, entry := range entries {
-			availableLibs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
-		}
+	gpuLibs := ml.LibraryPaths(gpus)
+	status := NewStatusWriter(os.Stderr)
+	cmd, port, err := StartRunner(
+		textProcessor != nil,
+		modelPath,
+		gpuLibs,
+		status,
+		ml.GetVisibleDevicesEnv(gpus),
+	)
+
+	s := llmServer{
+		port:           port,
+		cmd:            cmd,
+		status:         status,
+		options:        opts,
+		modelPath:      modelPath,
+		loadRequest:    loadRequest,
+		llamaModel:     llamaModel,
+		llamaModelLock: &sync.Mutex{},
+		textProcessor:  textProcessor,
+		numParallel:    numParallel,
+		sem:            semaphore.NewWeighted(int64(numParallel)),
+		totalLayers:    f.KV().BlockCount() + 1,
+		loadStart:      time.Now(),
+		done:           make(chan error, 1),
 	}
 
-	var gpuLibs []string
-	for _, gpu := range gpus {
-		gpuLibs = append(gpuLibs, gpu.RunnerName())
-	}
-
-	requested := envconfig.LLMLibrary()
-	if availableLibs[requested] != "" {
-		slog.Info("using requested gpu library", "requested", requested)
-		gpuLibs = []string{requested}
-	}
-
-	var compatible []string
-	for _, gpuLib := range gpuLibs {
-		var matchingLibs []string
-		for k := range availableLibs {
-			// exact match first
-			if k == gpuLib {
-				matchingLibs = append([]string{k}, matchingLibs...)
-				continue
-			}
-
-			// then match the family (e.g. 'cuda')
-			if strings.Split(k, "_")[0] == strings.Split(gpuLib, "_")[0] {
-				matchingLibs = append(matchingLibs, k)
-			}
-		}
-
-		if len(matchingLibs) > 0 {
-			compatible = append(compatible, matchingLibs[0])
-		}
-	}
-
-	exe, err := os.Executable()
 	if err != nil {
-		return nil, fmt.Errorf("unable to lookup executable path: %w", err)
+		var msg string
+		if s.status != nil && s.status.LastErrMsg != "" {
+			msg = s.status.LastErrMsg
+		}
+		err := fmt.Errorf("error starting runner: %v %s", err, msg)
+		if llamaModel != nil {
+			llama.FreeModel(llamaModel)
+		}
+		return nil, err
+	}
+
+	// reap subprocess when it exits
+	go func() {
+		err := s.cmd.Wait()
+		// Favor a more detailed message over the process exit status
+		if err != nil && s.status != nil && s.status.LastErrMsg != "" {
+			slog.Error("llama runner terminated", "error", err)
+			if strings.Contains(s.status.LastErrMsg, "unknown model") {
+				s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
+			}
+			s.done <- errors.New(s.status.LastErrMsg)
+		} else {
+			s.done <- err
+		}
+	}()
+
+	if textProcessor != nil {
+		return &ollamaServer{llmServer: s}, nil
+	} else {
+		return &llamaServer{llmServer: s, ggml: f}, nil
+	}
+}
+
+func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
+	var exe string
+	exe, err = os.Executable()
+	if err != nil {
+		return nil, 0, fmt.Errorf("unable to lookup executable path: %w", err)
 	}
 
 	if eval, err := filepath.EvalSymlinks(exe); err == nil {
 		exe = eval
 	}
 
-	// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
-	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
-	// without any LD_LIBRARY_PATH flags
-	for {
-		port := 0
-		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-			var l *net.TCPListener
-			if l, err = net.ListenTCP("tcp", a); err == nil {
-				port = l.Addr().(*net.TCPAddr).Port
-				l.Close()
-			}
-		}
-		if port == 0 {
-			slog.Debug("ResolveTCPAddr failed, using random port")
-			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-		}
-		params := []string{"runner"}
-		if textProcessor != nil {
-			// New engine
-			// TODO - if we have failure to load scenarios, add logic to retry with the old runner
-			params = append(params, "--ollama-engine")
-		}
-		params = append(params, "--model", modelPath)
-		params = append(params, "--port", strconv.Itoa(port))
-
-		var pathEnv string
-		switch runtime.GOOS {
-		case "windows":
-			pathEnv = "PATH"
-		case "darwin":
-			pathEnv = "DYLD_LIBRARY_PATH"
-		default:
-			pathEnv = "LD_LIBRARY_PATH"
-		}
-
-		// Note: we always put our dependency paths first
-		// since these are the exact version we compiled/linked against
-		libraryPaths := []string{discover.LibOllamaPath}
-		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
-		}
-
-		ggmlPaths := []string{discover.LibOllamaPath}
-		for _, c := range compatible {
-			if libpath, ok := availableLibs[c]; ok {
-				slog.Debug("adding gpu library", "path", libpath)
-				libraryPaths = append([]string{libpath}, libraryPaths...)
-				ggmlPaths = append(ggmlPaths, libpath)
-			}
-		}
-
-		for _, gpu := range gpus {
-			if gpu.DependencyPath != nil {
-				slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
-				libraryPaths = append(gpu.DependencyPath, libraryPaths...)
-				ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
-			}
-		}
-
-		// finally, add the root library path
-		libraryPaths = append(libraryPaths, discover.LibOllamaPath)
-
-		s := llmServer{
-			port:           port,
-			cmd:            exec.Command(exe, params...),
-			status:         NewStatusWriter(os.Stderr),
-			options:        opts,
-			modelPath:      modelPath,
-			loadRequest:    loadRequest,
-			llamaModel:     llamaModel,
-			llamaModelLock: &sync.Mutex{},
-			textProcessor:  textProcessor,
-			numParallel:    numParallel,
-			sem:            semaphore.NewWeighted(int64(numParallel)),
-			totalLayers:    f.KV().BlockCount() + 1,
-			loadStart:      time.Now(),
-			done:           make(chan error, 1),
-		}
-
-		s.cmd.Env = os.Environ()
-		s.cmd.Stdout = os.Stdout
-		s.cmd.Stderr = s.status
-		s.cmd.SysProcAttr = LlamaServerSysProcAttr
-
-		// Always filter down the set of GPUs in case there are any unsupported devices that might crash
-		envWorkarounds := gpus.GetVisibleDevicesEnv()
-		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-
-		// Update or add the path variable with our adjusted version
-		pathNeeded := true
-		ollamaPathNeeded := true
-		envWorkaroundDone := make([]bool, len(envWorkarounds))
-		for i := range s.cmd.Env {
-			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-			if strings.EqualFold(cmp[0], pathEnv) {
-				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-				pathNeeded = false
-			} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
-				s.cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ggmlPaths, string(filepath.ListSeparator))
-				ollamaPathNeeded = false
-			} else if len(envWorkarounds) != 0 {
-				for j, kv := range envWorkarounds {
-					tmp := strings.SplitN(kv, "=", 2)
-					if strings.EqualFold(cmp[0], tmp[0]) {
-						s.cmd.Env[i] = kv
-						envWorkaroundDone[j] = true
-					}
-				}
-			}
-		}
-		if pathNeeded {
-			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-		}
-		if ollamaPathNeeded {
-			s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
-		}
-		for i, done := range envWorkaroundDone {
-			if !done {
-				s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
-			}
-		}
-
-		slog.Info("starting runner", "cmd", s.cmd)
-		slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
-
-		if err = s.cmd.Start(); err != nil {
-			var msg string
-			if s.status != nil && s.status.LastErrMsg != "" {
-				msg = s.status.LastErrMsg
-			}
-			err := fmt.Errorf("error starting runner: %v %s", err, msg)
-			if len(compatible) == 0 {
-				if llamaModel != nil {
-					llama.FreeModel(llamaModel)
-				}
-				return nil, err
-			}
-
-			slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
-			compatible = compatible[1:]
-			continue
-		}
-
-		// reap subprocess when it exits
-		go func() {
-			err := s.cmd.Wait()
-			// Favor a more detailed message over the process exit status
-			if err != nil && s.status != nil && s.status.LastErrMsg != "" {
-				slog.Error("llama runner terminated", "error", err)
-				if strings.Contains(s.status.LastErrMsg, "unknown model") {
-					s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
-				}
-				s.done <- errors.New(s.status.LastErrMsg)
-			} else {
-				s.done <- err
-			}
-		}()
-
-		if textProcessor != nil {
-			return &ollamaServer{llmServer: s}, nil
-		} else {
-			return &llamaServer{llmServer: s, ggml: f}, nil
+	port = 0
+	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+		var l *net.TCPListener
+		if l, err = net.ListenTCP("tcp", a); err == nil {
+			port = l.Addr().(*net.TCPAddr).Port
+			l.Close()
 		}
 	}
+	if port == 0 {
+		slog.Debug("ResolveTCPAddr failed, using random port")
+		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+	}
+	params := []string{"runner"}
+	if ollamaEngine {
+		params = append(params, "--ollama-engine")
+	}
+	if modelPath != "" {
+		params = append(params, "--model", modelPath)
+	}
+	params = append(params, "--port", strconv.Itoa(port))
+
+	var pathEnv string
+	switch runtime.GOOS {
+	case "windows":
+		pathEnv = "PATH"
+	case "darwin":
+		pathEnv = "DYLD_LIBRARY_PATH"
+	default:
+		pathEnv = "LD_LIBRARY_PATH"
+	}
+
+	// Note: we always put our dependency paths first
+	// since these are the exact version we compiled/linked against
+	libraryPaths := append([]string{}, gpuLibs...)
+	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+	}
+
+	cmd = exec.Command(exe, params...)
+
+	cmd.Env = os.Environ()
+	cmd.Stdout = out
+	cmd.Stderr = out
+	cmd.SysProcAttr = LlamaServerSysProcAttr
+
+	// Always filter down the set of GPUs in case there are any unsupported devices that might crash
+	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+
+	// Update or add the path variable with our adjusted version
+	pathNeeded := true
+	ollamaPathNeeded := true
+	extraEnvsDone := map[string]bool{}
+	for k := range extraEnvs {
+		extraEnvsDone[k] = false
+	}
+	for i := range cmd.Env {
+		cmp := strings.SplitN(cmd.Env[i], "=", 2)
+		if strings.EqualFold(cmp[0], pathEnv) {
+			cmd.Env[i] = pathEnv + "=" + pathEnvVal
+			pathNeeded = false
+		} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
+			cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(gpuLibs, string(filepath.ListSeparator))
+			ollamaPathNeeded = false
+		} else if len(extraEnvs) != 0 {
+			for k, v := range extraEnvs {
+				if strings.EqualFold(cmp[0], k) {
+					cmd.Env[i] = k + "=" + v
+					extraEnvsDone[k] = true
+				}
+			}
+		}
+	}
+	if pathNeeded {
+		cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
+	}
+	if ollamaPathNeeded {
+		cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))
+	}
+	for k, done := range extraEnvsDone {
+		if !done {
+			cmd.Env = append(cmd.Env, k+"="+extraEnvs[k])
+		}
+	}
+
+	slog.Info("starting runner", "cmd", cmd)
+	slog.Debug("subprocess", "", filteredEnv(cmd.Env))
+
+	if err = cmd.Start(); err != nil {
+		return nil, 0, err
+	}
+	err = nil
+	return
 }
 
 func (s *llmServer) ModelPath() string {
@@ -497,47 +448,58 @@ type LoadResponse struct {
 
 var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
 
-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
-	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory := systemInfo.System.TotalMemory
-	systemFreeMemory := systemInfo.System.FreeMemory
-	systemSwapFreeMemory := systemInfo.System.FreeSwap
+func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
+	systemTotalMemory := systemInfo.TotalMemory
+	systemFreeMemory := systemInfo.FreeMemory
+	systemSwapFreeMemory := systemInfo.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 
-	g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
-	if g == nil {
-		if !requireFull {
-			g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
-		} else {
+	if len(gpus) == 0 || s.options.NumGPU == 0 {
+		if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) {
 			slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
-			return nil, ErrLoadRequiredFull
+			return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
 		}
+	} else {
+		g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
+		if g == nil {
+			if !requireFull {
+				g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
+			} else {
+				slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
+				return nil, ErrLoadRequiredFull
+			}
+		}
+		gpus = g
 	}
 
-	gpus = g
 	s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel)
 
-	if len(gpus) > 1 || gpus[0].Library != "cpu" {
+	if len(gpus) >= 1 {
 		switch {
-		case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+		case s.options.NumGPU == 0:
+			gpus = []ml.DeviceInfo{}
+		case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			s.options.NumGPU = 0
+			gpus = []ml.DeviceInfo{}
 		case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
-			gpus = discover.GpuInfoList{discover.GetCPUInfo()}
-		case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
+			gpus = []ml.DeviceInfo{}
+		case s.options.NumGPU < 0 && s.estimate.Layers > 0:
 			s.options.NumGPU = s.estimate.Layers
 		}
+	} else {
+		s.options.NumGPU = 0
 	}
 
 	// On linux and windows, over-allocating CPU memory will almost always result in an error
 	// Darwin has fully dynamic swap so has no direct concept of free swap space
 	if runtime.GOOS != "darwin" {
 		systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize
-		available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
+		available := systemInfo.FreeMemory + systemInfo.FreeSwap
 		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
+			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
 			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
 		}
 	}
@@ -564,10 +526,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		// Windows CUDA should not use mmap for best performance
 		// Linux  with a model larger than free space, mmap leads to thrashing
 		// For CPU loads we want the memory to be allocated, not FS cache
-		if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
-			(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
-			(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
-			(gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
+		if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
+			(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
+			(len(gpus) == 0 && s.options.UseMMap == nil) ||
+			(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
 			(s.options.UseMMap != nil && !*s.options.UseMMap) {
 			s.loadRequest.UseMmap = false
 		}
@@ -605,8 +567,8 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 
 // createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment
 // of particular layers onto GPUs
-func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.GpuInfoList, numGPU int) ml.GPULayersList {
-	if numGPU <= 0 {
+func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList {
+	if numGPU <= 0 || len(gpus) == 0 {
 		return nil
 	}
 
@@ -662,7 +624,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
 // allowing for faster iteration, but may return less information.
 //
 // Returns the list of GPU IDs that were used in the final allocation on success
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
+func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
 	var success bool
 	defer func() {
 		if !success {
@@ -675,24 +637,21 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
 
 	slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
 
-	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory := systemInfo.System.TotalMemory
-	systemFreeMemory := systemInfo.System.FreeMemory
-	systemSwapFreeMemory := systemInfo.System.FreeSwap
+	systemTotalMemory := systemInfo.TotalMemory
+	systemFreeMemory := systemInfo.FreeMemory
+	systemSwapFreeMemory := systemInfo.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 
-	if !(len(gpus) == 1 && gpus[0].Library == "cpu") {
-		for _, gpu := range gpus {
-			available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory
-			if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
-				available = 0
-			}
-			slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
-				"available", format.HumanBytes2(available),
-				"free", format.HumanBytes2(gpu.FreeMemory),
-				"minimum", format.HumanBytes2(gpu.MinimumMemory),
-				"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
+	for _, gpu := range gpus {
+		available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory()
+		if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() {
+			available = 0
 		}
+		slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
+			"available", format.HumanBytes2(available),
+			"free", format.HumanBytes2(gpu.FreeMemory),
+			"minimum", format.HumanBytes2(gpu.MinimumMemory()),
+			"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
 	}
 
 	pastAllocations := make(map[uint64]struct{})
@@ -762,7 +721,6 @@ nextOperation:
 						if err != nil {
 							return nil, err
 						}
-
 						slog.Debug("new layout created", "layers", newGPULayers)
 
 						s.loadRequest.GPULayers = newGPULayers
@@ -808,15 +766,12 @@ nextOperation:
 				// Memory allocation failed even though we created a layout that we thought should
 				// fit in available memory. This could happen if either our free memory reports
 				// are incorrect or if available memory is changing between layout and allocation
-				// time. Apply an exponential backoff to try to find the real amount of available
-				// space.
+				// time. Apply a backoff to try to find the real amount of available space.
 				if backoff > 1 {
 					slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
 					return nil, errors.New("memory layout cannot be allocated")
-				} else if backoff == 0 {
-					backoff = 0.01
 				} else {
-					backoff *= 2
+					backoff += 0.1
 				}
 
 				slog.Info("model layout did not fit, applying backoff", "backoff", fmt.Sprintf("%.2f", backoff))
@@ -864,20 +819,27 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
 // - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph
 // - Assigning layers
 // - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
-	if s.totalLayers == 0 || s.options.NumGPU == 0 || len(systemGPUs) == 0 || (len(systemGPUs) == 1 && systemGPUs[0].Library == "cpu") {
-		return ml.GPULayersList{}, nil
-	}
-
-	gpus := append(make(discover.GpuInfoList, 0, len(systemGPUs)), systemGPUs...)
-	sort.Sort(sort.Reverse(discover.ByFreeMemory(gpus)))
-
+func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
 	if memory == nil {
 		memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
 			Weights: make([]uint64, s.totalLayers),
 			Cache:   make([]uint64, s.totalLayers),
 		}}
 	}
+	gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff)
+	if err != nil {
+		return nil, err
+	}
+	err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	if err != nil {
+		return nil, err
+	}
+	return gpuLayers, nil
+}
+
+func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) {
+	gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
+	sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus)))
 
 	layers := make([]uint64, len(memory.CPU.Weights))
 	for i := range layers {
@@ -891,7 +853,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 	}
 
 	gpuLayers := ml.GPULayersList{}
-	for _, gl := range gpus.ByLibrary() {
+	for _, gl := range ml.ByLibrary(gpus) {
 		// If a GPU already has a graph allocated on it, then we should continue to use it.
 		// Otherwise, we lose information that we got from previous allocations, which can
 		// cause cycling. Plus, we get more information about required allocation from each
@@ -905,7 +867,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 						lastUsedGPU = i
 					}
 
-					reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
+					reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory() + envconfig.GpuOverhead() + memory.GPUs[j].Graph
 					if gl[i].FreeMemory > reserved {
 						gl[i].FreeMemory -= reserved
 					} else {
@@ -914,7 +876,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 
 					slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
 						"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
-						"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
+						"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory()),
 						"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
 						"graph", format.HumanBytes2(memory.GPUs[j].Graph))
 
@@ -933,7 +895,11 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 			gpuLayers = libraryGpuLayers
 		}
 	}
+	return gpuLayers, layers, nil
+}
 
+// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
+func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -961,24 +927,24 @@ nextLayer:
 
 	if requireFull {
 		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			return nil, ErrLoadRequiredFull
+			return ErrLoadRequiredFull
 		}
 
-		if cpuSize > systemInfo.System.FreeMemory {
-			return nil, ErrLoadRequiredFull
+		if cpuSize > systemInfo.FreeMemory {
+			return ErrLoadRequiredFull
 		}
 	}
 
 	// On linux and windows, over-allocating CPU memory will almost always result in an error
 	// Darwin has fully dynamic swap so has no direct concept of free swap space
 	if runtime.GOOS != "darwin" {
-		available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
+		available := systemInfo.FreeMemory + systemInfo.FreeSwap
 		if cpuSize > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available))
+			slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
+			return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available))
 		}
 	} else {
-		if vramSize > systemInfo.System.TotalMemory {
+		if vramSize > systemInfo.TotalMemory {
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			s.options.NumGPU = 0
@@ -990,11 +956,11 @@ nextLayer:
 		slog.Debug("insufficient VRAM to load any model layers")
 	}
 
-	return gpuLayers, nil
+	return nil
 }
 
 // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
-func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
+func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
 	// If we can't fit everything then prefer offloading layers other than the output layer
 	for range 2 {
 		// requestedLayers may be -1 if nothing was requested
@@ -1028,7 +994,7 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool,
 // findBestFit binary searches to find the smallest capacity factor that can fit
 // the max number of layers. The capacity factor is multiplied by the free space on
 // each GPU and a small one will force even balancing.
-func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
+func findBestFit(layers []uint64, gpus []ml.DeviceInfo, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
 	var high float32 = 1
 	var low float32 = 0
 
@@ -1053,12 +1019,11 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
 			low = mid
 		}
 	}
-
 	return bestAssignments
 }
 
 // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
-func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
+func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
 	device := len(gpus) - 1
 	gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
 	freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
@@ -1082,7 +1047,6 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
 			freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
 		}
 	}
-
 	return gpuLayers
 }
 
@@ -1814,7 +1778,7 @@ func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
 }
 
 func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
-	devices, err := discover.GetDevicesFromRunner(ctx, s)
+	devices, err := ml.GetDevicesFromRunner(ctx, s)
 	if err != nil {
 		if s.cmd != nil && s.cmd.ProcessState == nil {
 			// Still running but hit an error, log
diff --git a/llm/server_test.go b/llm/server_test.go
index bdedc960..2d3bf6be 100644
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -8,7 +8,6 @@ import (
 	"testing"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/ml"
 	"golang.org/x/sync/semaphore"
@@ -20,6 +19,8 @@ func TestLLMServerFitGPU(t *testing.T) {
 		free int
 	}
 
+	minMemory := 457 * format.MebiByte
+
 	tests := []struct {
 		name        string
 		gpus        []gpu
@@ -37,91 +38,91 @@ func TestLLMServerFitGPU(t *testing.T) {
 		},
 		{
 			name:     "Full single GPU",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Partial single GPU",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Single GPU with numGPU 1",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Single GPU with numGPU 0",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   0,
 			expected: ml.GPULayersList{},
 		},
 		{
 			name:     "Single GPU with numGPU 999",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   999,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
 		},
 		{
 			name:     "Multi GPU fits on one",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Multi GPU split",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Multi GPU partial",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 1",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 2",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   2,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 999",
-			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   999,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
 		},
 		{
 			name:     "Multi GPU different libraries",
-			gpus:     []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
 		},
 		{
 			name:        "requireFull",
-			gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:      -1,
 			requireFull: true,
@@ -139,12 +140,12 @@ func TestLLMServerFitGPU(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			var systemInfo discover.SystemInfo
-			systemInfo.System.TotalMemory = format.GibiByte
-			systemInfo.System.FreeMemory = 512 * format.MebiByte
-			systemInfo.System.FreeSwap = 256 * format.MebiByte
+			var systemInfo ml.SystemInfo
+			systemInfo.TotalMemory = format.GibiByte
+			systemInfo.FreeMemory = 512 * format.MebiByte
+			systemInfo.FreeSwap = 256 * format.MebiByte
 
-			gpus := make(discover.GpuInfoList, len(tt.gpus))
+			gpus := make([]ml.DeviceInfo, len(tt.gpus))
 			for i := range tt.gpus {
 				gpus[i].DeviceID = tt.gpus[i].id
 				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
diff --git a/middleware/openai.go b/middleware/openai.go
index 826a2111..b2e43f16 100644
--- a/middleware/openai.go
+++ b/middleware/openai.go
@@ -7,6 +7,7 @@ import (
 	"io"
 	"math/rand"
 	"net/http"
+	"strings"
 
 	"github.com/gin-gonic/gin"
 
@@ -44,7 +45,8 @@ type RetrieveWriter struct {
 
 type EmbedWriter struct {
 	BaseWriter
-	model string
+	model          string
+	encodingFormat string
 }
 
 func (w *BaseWriter) writeError(data []byte) (int, error) {
@@ -254,7 +256,7 @@ func (w *EmbedWriter) writeResponse(data []byte) (int, error) {
 	}
 
 	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	err = json.NewEncoder(w.ResponseWriter).Encode(openai.ToEmbeddingList(w.model, embedResponse))
+	err = json.NewEncoder(w.ResponseWriter).Encode(openai.ToEmbeddingList(w.model, embedResponse, w.encodingFormat))
 	if err != nil {
 		return 0, err
 	}
@@ -348,6 +350,14 @@ func EmbeddingsMiddleware() gin.HandlerFunc {
 			return
 		}
 
+		// Validate encoding_format parameter
+		if req.EncodingFormat != "" {
+			if !strings.EqualFold(req.EncodingFormat, "float") && !strings.EqualFold(req.EncodingFormat, "base64") {
+				c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, fmt.Sprintf("Invalid value for 'encoding_format' = %s. Supported values: ['float', 'base64'].", req.EncodingFormat)))
+				return
+			}
+		}
+
 		if req.Input == "" {
 			req.Input = []string{""}
 		}
@@ -371,8 +381,9 @@ func EmbeddingsMiddleware() gin.HandlerFunc {
 		c.Request.Body = io.NopCloser(&b)
 
 		w := &EmbedWriter{
-			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-			model:      req.Model,
+			BaseWriter:     BaseWriter{ResponseWriter: c.Writer},
+			model:          req.Model,
+			encodingFormat: req.EncodingFormat,
 		}
 
 		c.Writer = w
diff --git a/middleware/openai_encoding_format_test.go b/middleware/openai_encoding_format_test.go
new file mode 100644
index 00000000..52107d6e
--- /dev/null
+++ b/middleware/openai_encoding_format_test.go
@@ -0,0 +1,220 @@
+package middleware
+
+import (
+	"encoding/base64"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/gin-gonic/gin"
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/openai"
+)
+
+func TestEmbeddingsMiddleware_EncodingFormats(t *testing.T) {
+	testCases := []struct {
+		name           string
+		encodingFormat string
+		expectType     string // "array" or "string"
+		verifyBase64   bool
+	}{
+		{"float format", "float", "array", false},
+		{"base64 format", "base64", "string", true},
+		{"default format", "", "array", false},
+	}
+
+	gin.SetMode(gin.TestMode)
+
+	endpoint := func(c *gin.Context) {
+		resp := api.EmbedResponse{
+			Embeddings:      [][]float32{{0.1, -0.2, 0.3}},
+			PromptEvalCount: 5,
+		}
+		c.JSON(http.StatusOK, resp)
+	}
+
+	router := gin.New()
+	router.Use(EmbeddingsMiddleware())
+	router.Handle(http.MethodPost, "/api/embed", endpoint)
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			body := `{"input": "test", "model": "test-model"`
+			if tc.encodingFormat != "" {
+				body += `, "encoding_format": "` + tc.encodingFormat + `"`
+			}
+			body += `}`
+
+			req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(body))
+			req.Header.Set("Content-Type", "application/json")
+
+			resp := httptest.NewRecorder()
+			router.ServeHTTP(resp, req)
+
+			if resp.Code != http.StatusOK {
+				t.Fatalf("expected status 200, got %d", resp.Code)
+			}
+
+			var result openai.EmbeddingList
+			if err := json.Unmarshal(resp.Body.Bytes(), &result); err != nil {
+				t.Fatalf("failed to unmarshal response: %v", err)
+			}
+
+			if len(result.Data) != 1 {
+				t.Fatalf("expected 1 embedding, got %d", len(result.Data))
+			}
+
+			switch tc.expectType {
+			case "array":
+				if _, ok := result.Data[0].Embedding.([]interface{}); !ok {
+					t.Errorf("expected array, got %T", result.Data[0].Embedding)
+				}
+			case "string":
+				embStr, ok := result.Data[0].Embedding.(string)
+				if !ok {
+					t.Errorf("expected string, got %T", result.Data[0].Embedding)
+				} else if tc.verifyBase64 {
+					decoded, err := base64.StdEncoding.DecodeString(embStr)
+					if err != nil {
+						t.Errorf("invalid base64: %v", err)
+					} else if len(decoded) != 12 {
+						t.Errorf("expected 12 bytes, got %d", len(decoded))
+					}
+				}
+			}
+		})
+	}
+}
+
+func TestEmbeddingsMiddleware_BatchWithBase64(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	endpoint := func(c *gin.Context) {
+		resp := api.EmbedResponse{
+			Embeddings: [][]float32{
+				{0.1, 0.2},
+				{0.3, 0.4},
+				{0.5, 0.6},
+			},
+			PromptEvalCount: 10,
+		}
+		c.JSON(http.StatusOK, resp)
+	}
+
+	router := gin.New()
+	router.Use(EmbeddingsMiddleware())
+	router.Handle(http.MethodPost, "/api/embed", endpoint)
+
+	body := `{
+		"input": ["hello", "world", "test"],
+		"model": "test-model",
+		"encoding_format": "base64"
+	}`
+
+	req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(body))
+	req.Header.Set("Content-Type", "application/json")
+
+	resp := httptest.NewRecorder()
+	router.ServeHTTP(resp, req)
+
+	if resp.Code != http.StatusOK {
+		t.Fatalf("expected status 200, got %d", resp.Code)
+	}
+
+	var result openai.EmbeddingList
+	if err := json.Unmarshal(resp.Body.Bytes(), &result); err != nil {
+		t.Fatalf("failed to unmarshal response: %v", err)
+	}
+
+	if len(result.Data) != 3 {
+		t.Fatalf("expected 3 embeddings, got %d", len(result.Data))
+	}
+
+	// All should be base64 strings
+	for i := range 3 {
+		embeddingStr, ok := result.Data[i].Embedding.(string)
+		if !ok {
+			t.Errorf("embedding %d: expected string, got %T", i, result.Data[i].Embedding)
+			continue
+		}
+
+		// Verify it's valid base64
+		if _, err := base64.StdEncoding.DecodeString(embeddingStr); err != nil {
+			t.Errorf("embedding %d: invalid base64: %v", i, err)
+		}
+
+		// Check index
+		if result.Data[i].Index != i {
+			t.Errorf("embedding %d: expected index %d, got %d", i, i, result.Data[i].Index)
+		}
+	}
+}
+
+func TestEmbeddingsMiddleware_InvalidEncodingFormat(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	endpoint := func(c *gin.Context) {
+		c.Status(http.StatusOK)
+	}
+
+	router := gin.New()
+	router.Use(EmbeddingsMiddleware())
+	router.Handle(http.MethodPost, "/api/embed", endpoint)
+
+	testCases := []struct {
+		name           string
+		encodingFormat string
+		shouldFail     bool
+	}{
+		{"valid: float", "float", false},
+		{"valid: base64", "base64", false},
+		{"valid: FLOAT (uppercase)", "FLOAT", false},
+		{"valid: BASE64 (uppercase)", "BASE64", false},
+		{"valid: Float (mixed)", "Float", false},
+		{"valid: Base64 (mixed)", "Base64", false},
+		{"invalid: json", "json", true},
+		{"invalid: hex", "hex", true},
+		{"invalid: invalid_format", "invalid_format", true},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			body := `{
+				"input": "test",
+				"model": "test-model",
+				"encoding_format": "` + tc.encodingFormat + `"
+			}`
+
+			req, _ := http.NewRequest(http.MethodPost, "/api/embed", strings.NewReader(body))
+			req.Header.Set("Content-Type", "application/json")
+
+			resp := httptest.NewRecorder()
+			router.ServeHTTP(resp, req)
+
+			if tc.shouldFail {
+				if resp.Code != http.StatusBadRequest {
+					t.Errorf("expected status 400, got %d", resp.Code)
+				}
+
+				var errResp openai.ErrorResponse
+				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
+					t.Fatalf("failed to unmarshal error response: %v", err)
+				}
+
+				if errResp.Error.Type != "invalid_request_error" {
+					t.Errorf("expected error type 'invalid_request_error', got %q", errResp.Error.Type)
+				}
+
+				if !strings.Contains(errResp.Error.Message, "encoding_format") {
+					t.Errorf("expected error message to mention encoding_format, got %q", errResp.Error.Message)
+				}
+			} else {
+				if resp.Code != http.StatusOK {
+					t.Errorf("expected status 200, got %d: %s", resp.Code, resp.Body.String())
+				}
+			}
+		})
+	}
+}
diff --git a/ml/backend.go b/ml/backend.go
index 351942d5..bf390c01 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -98,8 +98,9 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) {
 type Context interface {
 	Empty(dtype DType, shape ...int) Tensor
 	Zeros(dtype DType, shape ...int) Tensor
-	FromFloatSlice(s []float32, shape ...int) Tensor
-	FromIntSlice(s []int32, shape ...int) Tensor
+	FromBytes(dtype DType, s []byte, shape ...int) Tensor
+	FromFloats(s []float32, shape ...int) Tensor
+	FromInts(s []int32, shape ...int) Tensor
 
 	// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
 	Arange(start, stop, step float32, dtype DType) Tensor
@@ -136,7 +137,9 @@ type Tensor interface {
 	Bytes() []byte
 	Floats() []float32
 
-	SetValueFromIntSlice(s []int32)
+	FromBytes([]byte)
+	FromFloats([]float32)
+	FromInts([]int32)
 
 	Neg(ctx Context) Tensor
 	Add(ctx Context, t2 Tensor) Tensor
@@ -158,6 +161,7 @@ type Tensor interface {
 
 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
+	Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
 
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
 
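
The rename from `FromFloatSlice`/`FromIntSlice` to `FromFloats`/`FromInts` is mechanical, but the interface also now separates construction from assignment: `Context.FromBytes` builds a tensor of any dtype from raw bytes, while the new `Tensor.FromBytes`/`FromFloats`/`FromInts` setters copy data into an already-allocated tensor (replacing `SetValueFromIntSlice`). A hedged usage sketch against this interface; the helper below is illustrative and not part of the change:

```go
package example

import "github.com/ollama/ollama/ml"

// positionsTensor allocates an int32 tensor once and then refills it in place,
// using the constructor/setter split introduced above.
func positionsTensor(ctx ml.Context, positions []int32) ml.Tensor {
	t := ctx.Input().Empty(ml.DTypeI32, len(positions))
	t.FromInts(positions) // copies the slice into the tensor's backing memory
	return t
}
```
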
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 88078d77..33401c30 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -12,6 +12,7 @@ import "C"
 
 import (
 	"context"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
@@ -724,7 +725,9 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo {
 		if props.library != nil {
 			info.Library = C.GoString(props.library)
 		}
-		info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
+		if props.device_id != nil {
+			info.PCIID = C.GoString(props.device_id)
+		}
 		info.LibraryPath = ggml.LibPaths()
 		if props.numeric_id != nil {
 			info.FilteredID = C.GoString(props.numeric_id)
@@ -871,7 +874,7 @@ func pad(length, pad C.size_t) C.size_t {
 	return ((length + pad - 1) / pad) * pad
 }
 
-func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
+func (c *Context) newTensor(dtype ml.DType, shape []int) *Tensor {
 	if c.buft == nil {
 		panic("set Input or Layer before creating tensors")
 	}
@@ -915,7 +918,7 @@ func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
 func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	t := c.newTensor(dtype, shape)
 	if c.b.allocMemory {
-		C.ggml_set_zero(t.(*Tensor).t)
+		C.ggml_set_zero(t.t)
 	}
 	return t
 }
@@ -936,25 +939,34 @@ func checkShape[S ~[]E, E any](s S, shape ...int) {
 	}
 }
 
-func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
-	checkShape(s, shape...)
-
-	t := c.newTensor(ml.DTypeF32, shape)
-
-	if c.b.allocMemory && len(s) > 0 {
-		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
+func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
+	// Unchecked to handle quantized types
+	t := c.newTensor(dtype, shape)
+	if c.b.allocMemory {
+		t.FromBytes(s)
 	}
 
 	return t
 }
 
-func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
+func (c *Context) FromFloats(s []float32, shape ...int) ml.Tensor {
+	checkShape(s, shape...)
+
+	t := c.newTensor(ml.DTypeF32, shape)
+
+	if c.b.allocMemory {
+		t.FromFloats(s)
+	}
+
+	return t
+}
+
+func (c *Context) FromInts(s []int32, shape ...int) ml.Tensor {
 	checkShape(s, shape...)
 
 	t := c.newTensor(ml.DTypeI32, shape)
-
-	if c.b.allocMemory && len(s) > 0 {
-		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
+	if c.b.allocMemory {
+		t.FromInts(s)
 	}
 
 	return t
@@ -975,7 +987,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
 			arange = append(arange, int32(i))
 		}
 
-		return c.Input().FromIntSlice(arange, len(arange))
+		return c.Input().FromInts(arange, len(arange))
 	default:
 		panic("unsupported dtype for arange")
 	}
@@ -1045,10 +1057,26 @@ func (t *Tensor) Floats() (data []float32) {
 	return
 }
 
-func (t *Tensor) SetValueFromIntSlice(s []int32) {
-	if len(s) > 0 {
-		C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.t))
+func tensorSet[S ~[]E, E byte | float32 | int32](t *Tensor, s S) {
+	if len(s) == 0 {
+		return
 	}
+	if int(C.ggml_nbytes(t.t)) != len(s)*binary.Size(s[0]) {
+		panic("data size does not match tensor size")
+	}
+	C.ggml_backend_tensor_set(t.t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.t))
+}
+
+func (t *Tensor) FromBytes(s []byte) {
+	tensorSet(t, s)
+}
+
+func (t *Tensor) FromFloats(s []float32) {
+	tensorSet(t, s)
+}
+
+func (t *Tensor) FromInts(s []int32) {
+	tensorSet(t, s)
 }
 
 func (t *Tensor) DType() ml.DType {
@@ -1154,6 +1182,10 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
 }
 
 func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
+	if slices.Contains(shape, -1) {
+		inferShape(t, shape)
+	}
+
 	switch len(shape) {
 	case 0:
 		return &Tensor{
@@ -1296,7 +1328,43 @@ func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }
 
+// inferShape updates shape in place to automatically set a single -1 dimension
+// based on the input tensor and the other dimensions
+func inferShape(t *Tensor, shape []int) {
+	total := 1
+	for _, dim := range t.Shape() {
+		total *= dim
+	}
+
+	dim := -1
+	for i := range shape {
+		switch shape[i] {
+		case -1:
+			if dim != -1 {
+				panic("only one dimension can be inferred")
+			}
+			dim = i
+		case 0:
+			panic("dimension cannot be zero")
+		default:
+			if total%shape[i] != 0 {
+				panic("cannot infer dimension")
+			}
+
+			total /= shape[i]
+		}
+	}
+
+	if dim != -1 {
+		shape[dim] = total
+	}
+}
+
 func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
+	if slices.Contains(shape, -1) {
+		inferShape(t, shape)
+	}
+
 	switch len(shape) {
 	case 1:
 		return &Tensor{
@@ -1509,6 +1577,16 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 	}
 }
 
+func (t *Tensor) Conv3D(ctx ml.Context, t2 ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
+	var tt ml.Tensor = &Tensor{
+		b: t.b,
+		t: C.ggml_conv_3d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int64_t(c), C.int(s0), C.int(s1), C.int(s2), C.int(p0), C.int(p1), C.int(p2), C.int(d0), C.int(d1), C.int(d2)),
+	}
+
+	tt = tt.Reshape(ctx, t.Dim(3)/c, t2.Dim(3)/c)
+	return tt
+}
+
 func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1622,13 +1700,3 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
 		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
 	}
 }
-
-func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
-	// Unchecked to handle quantized types
-	t := c.newTensor(dtype, shape)
-	if c.b.allocMemory && len(s) > 0 {
-		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
-	}
-
-	return t
-}
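
`inferShape` gives `Reshape` and `Contiguous` a NumPy-style `-1` dimension: the known dimensions must divide the element count, and the single `-1` entry takes the remaining factor. A standalone sketch of just that arithmetic (the validation the real implementation performs is omitted here):

```go
package main

import "fmt"

// inferDim resolves a single -1 entry in shape so the product of all
// dimensions equals elements, mirroring inferShape above.
func inferDim(elements int, shape []int) []int {
	total, infer := elements, -1
	for i, d := range shape {
		if d == -1 {
			infer = i
			continue
		}
		total /= d
	}
	if infer >= 0 {
		shape[infer] = total
	}
	return shape
}

func main() {
	// A 2x3x4 tensor has 24 elements, so Reshape(ctx, 2, -1) resolves to 2x12
	// and Reshape(ctx, -1, 3, 4) resolves to 2x3x4.
	fmt.Println(inferDim(24, []int{2, -1}))    // [2 12]
	fmt.Println(inferDim(24, []int{-1, 3, 4})) // [2 3 4]
}
```
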
diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h
index 094fc3c8..80983524 100644
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -174,9 +174,6 @@ extern "C" {
         int compute_major;
         int compute_minor;
         int integrated;
-        int pci_bus_id;
-        int pci_device_id;
-        int pci_domain_id;
         const char *library;
         // number with which the devices are accessed (Vulkan)
         const char *numeric_id;
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index f9cf2d4f..d62f412d 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3513,9 +3513,6 @@ struct ggml_backend_cuda_device_context {
     int driver_major;
     int driver_minor;
     int integrated;
-    int pciBusID;
-    int pciDeviceID;
-    int pciDomainID;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3539,9 +3536,9 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
 
 #if defined(GGML_USE_HIP)
     if (ggml_hip_mgmt_init() == 0) {
-        int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+        int status = ggml_hip_get_device_memory(ctx->pci_bus_id.c_str(), free, total);
         if (status == 0) {
-            GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_bus_id.c_str(), *free, *total);
             ggml_hip_mgmt_release();
             return;
         }
@@ -3551,7 +3548,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
     if (ggml_nvml_init() == 0) {
         int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
         if (status == 0) {
-            GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->id.c_str(), *free, *total);
             ggml_nvml_release();
             return;
         }
@@ -3591,9 +3588,6 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     props->driver_major = ctx->driver_major;
     props->driver_minor = ctx->driver_minor;
     props->integrated = ctx->integrated;
-    props->pci_bus_id = ctx->pciBusID;
-    props->pci_device_id = ctx->pciDeviceID;
-    props->pci_domain_id = ctx->pciDomainID;
     props->library = GGML_CUDA_NAME;
 
     bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -4182,9 +4176,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 dev_ctx->driver_major = driverVersion / 1000;
                 dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
                 dev_ctx->integrated = prop.integrated;
-                dev_ctx->pciBusID = prop.pciBusID;
-                dev_ctx->pciDeviceID = prop.pciDeviceID;
-                dev_ctx->pciDomainID = prop.pciDomainID;
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface   = */ ggml_backend_cuda_device_interface,
                     /* .reg     = */ &reg,
diff --git a/ml/backend/ggml/ggml/src/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h
index 80597b6e..b63edd0c 100644
--- a/ml/backend/ggml/ggml/src/ggml-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-impl.h
@@ -643,7 +643,7 @@ GGML_API int ggml_nvml_init();
 GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
 GGML_API void ggml_nvml_release();
 GGML_API int ggml_hip_mgmt_init();
-GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
 GGML_API void ggml_hip_mgmt_release();
 
 #ifdef __cplusplus
diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 564bc4a7..0bbcecd0 100644
--- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -231,6 +231,7 @@ class vk_memory_logger;
 #endif
 class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static std::string ggml_vk_get_device_id(int device);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -11598,7 +11599,7 @@ static std::string ggml_vk_get_device_id(int device) {
     const auto& uuid = deviceIDProps.deviceUUID;
     char id[64];
     snprintf(id, sizeof(id),
-        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
         uuid[0], uuid[1], uuid[2], uuid[3],
         uuid[4], uuid[5],
         uuid[6], uuid[7],
@@ -12431,13 +12432,11 @@ struct ggml_backend_vk_device_context {
     std::string pci_id;
     std::string id;
     std::string uuid;
+    std::string numeric_id;
     int major;
     int minor;
     int driver_major;
     int driver_minor;
-    int pci_bus_id;
-    int pci_device_id;
-    int pci_domain_id;
 };
 
 void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size_t * free, size_t * total) {
@@ -12456,9 +12455,9 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
         switch (props2.properties.vendorID) {
         case VK_VENDOR_ID_AMD:
             if (ggml_hip_mgmt_init() == 0) {
-                int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+                int status = ggml_hip_get_device_memory(ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), free, total);
                 if (status == 0) {
-                    GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                    GGML_LOG_DEBUG("%s device %s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, ctx->pci_id != "" ? ctx->pci_id.c_str() : ctx->uuid.c_str(), *free, *total);
                     ggml_hip_mgmt_release();
                     return;
                 }
@@ -12469,7 +12468,7 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
             if (ggml_nvml_init() == 0) {
                 int status = ggml_nvml_get_device_memory(ctx->uuid.c_str(), free, total);
                 if (status == 0) {
-                    GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+                    GGML_LOG_DEBUG("%s device %s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, ctx->uuid.c_str(), *free, *total);
                     ggml_nvml_release();
                     return;
                 }
@@ -12545,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
         }
     }
 
+    vk::PhysicalDeviceProperties2 props2;
     if (!ext_support) {
-        return "";
+        device.getProperties2(&props2);
+        if (props2.properties.vendorID != VK_VENDOR_ID_AMD) {
+            return "";
+        }
+        // AMD doesn't claim to support PCI ID, but actually does, so try anyway and check for non-zero
     }
 
     vk::PhysicalDeviceProperties2 props = {};
@@ -12563,6 +12567,9 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
 
     char pci_bus_id[16] = {};
     snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+    if (pci_domain == 0 && pci_bus == 0 && pci_device == 0 && pci_function == 0) {
+        return "";
+    }
 
     return std::string(pci_bus_id);
 }
@@ -12636,11 +12643,8 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->driver_major = ctx->driver_major;
     props->driver_minor = ctx->driver_minor;
     props->integrated = ctx->is_integrated_gpu;
-    props->pci_bus_id = ctx->pci_bus_id;
-    props->pci_device_id = ctx->pci_device_id;
-    props->pci_domain_id = ctx->pci_domain_id;
     props->library = GGML_VK_NAME;
-    props->numeric_id = ctx->id.empty() ? nullptr : ctx->id.c_str();
+    props->numeric_id = ctx->numeric_id.c_str();
 }
 
 static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -13101,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                 vk_devices[dev_idx].getProperties2(&props2);
                 std::ostringstream oss;
                 oss << std::hex << std::setfill('0');
-                oss << "GPU-";
                 int byteIdx = 0;
                 for (int i = 0; i < 16; ++i, ++byteIdx) {
                     oss << std::setw(2) << static_cast<int>(device_id_props.deviceUUID[i]);
@@ -13110,15 +13113,12 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                     }
                 }
                 ctx->uuid = oss.str();
-                ctx->pci_bus_id = pci_bus_props.pciBus;
-                ctx->pci_device_id = pci_bus_props.pciDevice;
-                ctx->pci_domain_id = pci_bus_props.pciDomain;
-                ctx->id = std::to_string(i);
                 ctx->major = 0;
                 ctx->minor = 0;
                 // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
                 ctx->driver_major = 0;
                 ctx->driver_minor = 0;
+                ctx->numeric_id = std::to_string(i);
             }
             initialized = true;
         }
diff --git a/ml/backend/ggml/ggml/src/mem_hip.cpp b/ml/backend/ggml/ggml/src/mem_hip.cpp
index 8ef19b8c..5a7f5d46 100644
--- a/ml/backend/ggml/ggml/src/mem_hip.cpp
+++ b/ml/backend/ggml/ggml/src/mem_hip.cpp
@@ -331,7 +331,7 @@ void ggml_hip_mgmt_release() {
     if (gpus != NULL) gpus->pVtbl->Release(gpus); \
     if (gpu != NULL) gpu->pVtbl->Release(gpu)
 
-int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
     std::lock_guard lock(ggml_adlx_lock);
     if (adlx.handle == NULL) {
         GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
@@ -343,9 +343,13 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
     IADLXGPU* gpu = NULL;
     IADLXGPUMetrics *gpuMetrics = NULL;
     ADLX_RESULT status;
-    // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs 
-    adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
 
+    uint32_t pci_domain, pci_bus, pci_device, pci_function;
+    if (sscanf(id, "%04x:%02x:%02x.%x", &pci_domain, &pci_bus, &pci_device, &pci_function) != 4) {
+        // TODO - parse other formats?
+        GGML_LOG_DEBUG("%s device ID was not a PCI ID %s\n", __func__, id);
+        return ADLX_NOT_FOUND;
+    }
     status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
     if (ADLX_FAILED(status)) {
         GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
@@ -368,16 +372,15 @@ int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free,
             GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
             continue;
         }
-        adlx_int id;
-        status = gpu->pVtbl->UniqueId(gpu, &id);
+        adlx_int uniqueID;
+        status = gpu->pVtbl->UniqueId(gpu, &uniqueID);
         if (ADLX_FAILED(status)) {
             GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
             gpu->pVtbl->Release(gpu);
             gpu = NULL;
             continue;
         }
-        if (id != target) {
-            GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+        if ((((uniqueID >> 8) & 0xff) != pci_bus) || ((uniqueID & 0xff) != pci_device)) {
             gpu->pVtbl->Release(gpu);
             gpu = NULL;
             continue;
@@ -440,7 +443,7 @@ int ggml_hip_mgmt_init() {
     return -1;
 }
 void ggml_hip_mgmt_release() {}
-int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
     return -1;
 }
 
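
`ggml_hip_get_device_memory` now takes the canonical `domain:bus:device.function` string and matches it against ADLX's `UniqueID`, which packs the PCI bus into bits 8-15 and the device into bits 0-7. A small Go sketch of that matching rule; the helper is illustrative, the C implementation above is authoritative:

```go
package main

import "fmt"

// matchesADLXUniqueID reports whether a PCI ID string such as "0000:03:00.0"
// refers to the same device as an ADLX UniqueID, which encodes (bus << 8) | device.
func matchesADLXUniqueID(id string, uniqueID int) (bool, error) {
	var domain, bus, device, function uint32
	if _, err := fmt.Sscanf(id, "%04x:%02x:%02x.%x", &domain, &bus, &device, &function); err != nil {
		return false, err
	}
	return uint32((uniqueID>>8)&0xff) == bus && uint32(uniqueID&0xff) == device, nil
}

func main() {
	ok, _ := matchesADLXUniqueID("0000:03:00.0", 0x0300)
	fmt.Println(ok) // true
}
```
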
diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go
new file mode 100644
index 00000000..4717ea90
--- /dev/null
+++ b/ml/backend/ggml/ggml_test.go
@@ -0,0 +1,126 @@
+package ggml
+
+import (
+	"errors"
+	"os"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
+)
+
+func setup(tb testing.TB) ml.Context {
+	tb.Helper()
+
+	f, err := os.CreateTemp(tb.TempDir(), "*.bin")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer f.Close()
+
+	if err := ggml.WriteGGUF(f, ggml.KV{"general.architecture": "test"}, nil); err != nil {
+		tb.Fatal(err)
+	}
+
+	b, err := ml.NewBackend(f.Name(), ml.BackendParams{})
+	if err != nil {
+		tb.Fatal(err)
+	}
+
+	ctx := b.NewContext().Input()
+
+	tb.Cleanup(func() {
+		ctx.Close()
+		b.Close()
+	})
+
+	return ctx
+}
+
+func TestInferShape(t *testing.T) {
+	cases := []struct {
+		name  string
+		input []int
+		want  []int
+		err   error
+	}{
+		{
+			name:  "no inferred shape",
+			input: []int{2, 3, 4},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "infer begin",
+			input: []int{-1, 3, 4},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "infer mid",
+			input: []int{2, -1, 4},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "infer end",
+			input: []int{2, 3, -1},
+			want:  []int{2, 3, 4},
+		},
+		{
+			name:  "too many inferred dims",
+			input: []int{-1, 3, -1},
+			err:   errors.New("only one dimension can be inferred"),
+		},
+		{
+			name:  "infer gather",
+			input: []int{2, -1},
+			want:  []int{2, 12},
+		},
+		{
+			name:  "infer gather all",
+			input: []int{-1},
+			want:  []int{24},
+		},
+		{
+			name:  "infer split",
+			input: []int{2, -1, 3, 2},
+			want:  []int{2, 2, 3, 2},
+		},
+		{
+			name:  "indivisible infer",
+			input: []int{2, -1, 2, 4},
+			err:   errors.New("cannot infer dimension"),
+		},
+		{
+			name:  "infer zero dim",
+			input: []int{2, 0, 4},
+			err:   errors.New("dimension cannot be zero"),
+		},
+	}
+
+	ctx := setup(t)
+	tensor, ok := ctx.Empty(ml.DTypeF32, 2, 3, 4).(*Tensor)
+	if !ok {
+		t.Fatal("expected *Tensor")
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			defer func() {
+				if r := recover(); r == nil && tt.err == nil {
+					// all good
+				} else if r != nil && tt.err == nil {
+					t.Errorf("unexpected panic: %v", r)
+				} else if r == nil && tt.err != nil {
+					t.Errorf("expected panic but did not get one: %v", tt.err)
+				} else if errStr, ok := r.(string); ok && errStr != tt.err.Error() {
+					t.Errorf("expected panic %q but got %q", tt.err.Error(), errStr)
+				}
+			}()
+
+			inferShape(tensor, tt.input)
+			if diff := cmp.Diff(tt.want, tt.input); diff != "" {
+				t.Errorf("%s: shape mismatch (-want +got):\n%s", tt.name, diff)
+			}
+		})
+	}
+}
diff --git a/ml/device.go b/ml/device.go
index 6569d87b..57c3976b 100644
--- a/ml/device.go
+++ b/ml/device.go
@@ -3,15 +3,21 @@ package ml
 import (
 	"context"
 	"encoding/binary"
+	"encoding/json"
 	"fmt"
 	"hash/maphash"
+	"io"
 	"log/slog"
+	"net/http"
+	"runtime"
 	"slices"
 	"sort"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/logutil"
 )
 
 // GPULayers is a set of layers to be allocated on a single GPU
@@ -282,6 +288,20 @@ type DeviceInfo struct {
 	LibraryPath []string
 }
 
+type SystemInfo struct {
+	// ThreadCount is the optimal number of threads to use for inference
+	ThreadCount int `json:"threads,omitempty"`
+
+	// TotalMemory is the total amount of system memory
+	TotalMemory uint64 `json:"total_memory,omitempty"`
+
+	// FreeMemory is the amount of memory currently available on the system for loading models
+	FreeMemory uint64 `json:"free_memory,omitempty"`
+
+	// FreeSwap is the amount of system swap space reported as available
+	FreeSwap uint64 `json:"free_swap,omitempty"`
+}
+
 func (d DeviceInfo) Compute() string {
 	// AMD gfx is encoded into the major minor in hex form
 	if strings.EqualFold(d.Library, "ROCm") {
@@ -294,6 +314,71 @@ func (d DeviceInfo) Driver() string {
 	return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor)
 }
 
+// MinimumMemory reports the amount of memory that should be set aside
+// on the device for overhead (e.g. VRAM consumed by context structures independent
+// of model allocations)
+func (d DeviceInfo) MinimumMemory() uint64 {
+	if d.Library == "Metal" {
+		return 512 * format.MebiByte
+	}
+	return 457 * format.MebiByte
+}
+
+// ByFreeMemory sorts devices by free memory in ascending order.
+// iGPUs sort first, so Reverse() yields the largest discrete GPU first.
+type ByFreeMemory []DeviceInfo
+
+func (a ByFreeMemory) Len() int      { return len(a) }
+func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByFreeMemory) Less(i, j int) bool {
+	if a[i].Integrated && !a[j].Integrated {
+		return true
+	} else if !a[i].Integrated && a[j].Integrated {
+		return false
+	}
+	return a[i].FreeMemory < a[j].FreeMemory
+}
+
+func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
+	resp := [][]DeviceInfo{}
+	libs := []string{}
+	for _, info := range l {
+		found := false
+		requested := info.Library
+		for i, lib := range libs {
+			if lib == requested {
+				resp[i] = append(resp[i], info)
+				found = true
+				break
+			}
+		}
+		if !found {
+			libs = append(libs, requested)
+			resp = append(resp, []DeviceInfo{info})
+		}
+	}
+	return resp
+}
+
+func LibraryPaths(l []DeviceInfo) []string {
+	var gpuLibs []string
+	for _, gpu := range l {
+		for _, dir := range gpu.LibraryPath {
+			needed := true
+			for _, existing := range gpuLibs {
+				if dir == existing {
+					needed = false
+					break
+				}
+			}
+			if needed {
+				gpuLibs = append(gpuLibs, dir)
+			}
+		}
+	}
+	return gpuLibs
+}
+
 type DeviceComparison int
 
 const (
@@ -306,6 +391,10 @@ func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison {
 	if a.PCIID != b.PCIID {
 		return UniqueDevice
 	}
+	// If PCIID is empty, we have to use ID + library for uniqueness
+	if a.PCIID == "" && a.DeviceID != b.DeviceID {
+		return UniqueDevice
+	}
 	if a.Library == b.Library {
 		return SameBackendDevice
 	}
@@ -336,3 +425,133 @@ func (a DeviceInfo) IsBetter(b DeviceInfo) bool {
 	sort.Sort(sort.Reverse(sort.StringSlice(cmp)))
 	return cmp[0] == bLibSplit[1]
 }
+
+// FlashAttentionSupported reports whether every GPU in the list supports flash attention
+func FlashAttentionSupported(l []DeviceInfo) bool {
+	for _, gpu := range l {
+		supportsFA := gpu.Library == "cpu" ||
+			gpu.Name == "Metal" || gpu.Library == "Metal" ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
+			gpu.Library == "ROCm"
+
+		if !supportsFA {
+			return false
+		}
+	}
+	return true
+}
+
+// GetVisibleDevicesEnv returns the visible devices environment variables
+// for the list of GPUs this instantiation is targeted for
+func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
+	if len(l) == 0 {
+		return nil
+	}
+	env := map[string]string{}
+	for _, d := range l {
+		d.updateVisibleDevicesEnv(env)
+	}
+	return env
+}
+
+func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
+	var envVar string
+	switch d.Library {
+	case "ROCm":
+		// ROCm must be filtered as it can crash the runner on unsupported devices
+		envVar = "ROCR_VISIBLE_DEVICES"
+		if runtime.GOOS != "linux" {
+			envVar = "HIP_VISIBLE_DEVICES"
+		}
+	default:
+		// CUDA and Vulkan are not filtered via env var, but via scheduling decisions
+		return
+	}
+	v, existing := env[envVar]
+	if existing {
+		v = v + ","
+	}
+	if d.FilteredID != "" {
+		v = v + d.FilteredID
+	} else {
+		v = v + d.ID
+	}
+	env[envVar] = v
+}
+
+type BaseRunner interface {
+	// GetPort returns the localhost port number the runner is running on
+	GetPort() int
+
+	// HasExited indicates if the runner is no longer running.  This can be used during
+	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
+	HasExited() bool
+}
+
+type RunnerDiscovery interface {
+	BaseRunner
+
+	// GetDeviceInfos will perform a query of the underlying device libraries
+	// for device identification and free VRAM information
+	// During bootstrap scenarios, this routine may take seconds to complete
+	GetDeviceInfos(ctx context.Context) []DeviceInfo
+}
+
+type FilteredRunnerDiscovery interface {
+	RunnerDiscovery
+
+	// GetActiveDeviceIDs returns the filtered set of devices actively in
+	// use by this runner for running models.  If the runner is a bootstrap runner, no devices
+	// will be active yet so no device IDs are returned.
+	// This routine will not query the underlying device and will return immediately
+	GetActiveDeviceIDs() []DeviceID
+}
+
+func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]DeviceInfo, error) {
+	var moreDevices []DeviceInfo
+	port := runner.GetPort()
+	tick := time.Tick(10 * time.Millisecond)
+	for {
+		select {
+		case <-ctx.Done():
+			return nil, fmt.Errorf("failed to finish discovery before timeout")
+		case <-tick:
+			r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
+			if err != nil {
+				return nil, fmt.Errorf("failed to create request: %w", err)
+			}
+			r.Header.Set("Content-Type", "application/json")
+
+			resp, err := http.DefaultClient.Do(r)
+			if err != nil {
+				// slog.Warn("failed to send request", "error", err)
+				if runner.HasExited() {
+					return nil, fmt.Errorf("runner crashed")
+				}
+				continue
+			}
+			defer resp.Body.Close()
+
+			if resp.StatusCode == http.StatusNotFound {
+				// old runner, fall back to bootstrapping model
+				return nil, fmt.Errorf("llamarunner free vram reporting not supported")
+			}
+
+			body, err := io.ReadAll(resp.Body)
+			if err != nil {
+				slog.Warn("failed to read response", "error", err)
+				continue
+			}
+			if resp.StatusCode != 200 {
+				logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
+				return nil, fmt.Errorf("runner error: %s", string(body))
+			}
+
+			if err := json.Unmarshal(body, &moreDevices); err != nil {
+				slog.Warn("failed to unmarshal response", "error", err)
+				continue
+			}
+			return moreDevices, nil
+		}
+	}
+}
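
Taken together, the helpers above let the scheduler order candidate GPUs and derive the runner environment. A hedged composition sketch; `visibleDevicesForROCm` is hypothetical and the field semantics follow the code above:

```go
package example

import (
	"sort"

	"github.com/ollama/ollama/ml"
)

// visibleDevicesForROCm orders candidate GPUs so the largest discrete device
// comes first, then derives the ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES
// value for the runner's environment. CUDA and Vulkan devices contribute
// nothing here since they are filtered by scheduling decisions, not env vars.
func visibleDevicesForROCm(devices []ml.DeviceInfo) map[string]string {
	sort.Sort(sort.Reverse(ml.ByFreeMemory(devices)))
	return ml.GetVisibleDevicesEnv(devices)
}
```
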
diff --git a/ml/nn/convolution.go b/ml/nn/convolution.go
index 8e015c73..2954de00 100644
--- a/ml/nn/convolution.go
+++ b/ml/nn/convolution.go
@@ -4,8 +4,27 @@ import "github.com/ollama/ollama/ml"
 
 type Conv2D struct {
 	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
 }
 
 func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
-	return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
+	t = m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
+	if m.Bias != nil {
+		// Bias shape is (out_channels,) while t shape is (width, height, out_channels, batch)
+		t = t.Add(ctx, m.Bias.Reshape(ctx, 1, 1, -1))
+	}
+	return t
+}
+
+type Conv3D struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+func (m *Conv3D) Forward(ctx ml.Context, t ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
+	t = m.Weight.Conv3D(ctx, t, c, s0, s1, s2, p0, p1, p2, d0, d1, d2)
+	if m.Bias != nil {
+		t = t.Add(ctx, m.Bias)
+	}
+	return t
 }
diff --git a/model/models/bert/embed.go b/model/models/bert/embed.go
index 166c11e1..2d78710f 100644
--- a/model/models/bert/embed.go
+++ b/model/models/bert/embed.go
@@ -30,7 +30,7 @@ type Model struct {
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 	hiddenStates = hiddenStates.Add(ctx, m.TypeEmbedding.Weight.View(ctx, 0, m.hiddenSize))
-	hiddenStates = hiddenStates.Add(ctx, m.PositionEmbedding.Forward(ctx, ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))))
+	hiddenStates = hiddenStates.Add(ctx, m.PositionEmbedding.Forward(ctx, ctx.Input().FromInts(batch.Positions, len(batch.Positions))))
 	hiddenStates = m.TokenEmbeddingNorm.Forward(ctx, hiddenStates, m.eps)
 
 	for _, layer := range m.Layers {
diff --git a/model/models/deepseek2/model.go b/model/models/deepseek2/model.go
index 7e57f72d..cfd579ca 100644
--- a/model/models/deepseek2/model.go
+++ b/model/models/deepseek2/model.go
@@ -302,7 +302,7 @@ func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 
diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go
index 2b16dc62..06c71fc3 100644
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -175,7 +175,7 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
diff --git a/model/models/gemma3/embed.go b/model/models/gemma3/embed.go
index 52554776..9251111c 100644
--- a/model/models/gemma3/embed.go
+++ b/model/models/gemma3/embed.go
@@ -2,7 +2,6 @@ package gemma3
 
 import (
 	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/ml/nn/pooling"
@@ -53,10 +52,5 @@ func newEmbedModel(c fs.Config) (model.Model, error) {
 		poolingType: pooling.Type(c.Uint("pooling_type", 0)),
 	}
 
-	m.Cache = kvcache.NewWrapperCache(
-		kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),
-		kvcache.NewCausalCache(m.Shift),
-	)
-
 	return m, nil
 }
diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go
index 27da889e..62f51074 100644
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@@ -101,7 +101,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}
 
-	pixelValues := ctx.Input().FromFloatSlice(f32s,
+	pixelValues := ctx.Input().FromFloats(f32s,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.numChannels,
diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go
index 631baecc..8d1a1be6 100644
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -163,7 +163,7 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,
 }
 
 func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) ml.Tensor {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextConfig.hiddenSize)))
@@ -182,16 +182,18 @@ func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cac
 	for i, layer := range m.Layers {
 		// gemma alternates between the sliding window (local) and causal (global)
 		// kv cache every 6 layers
-		cacheType := cacheTypeSWA
-		if (i+1)%gemmaGlobalCacheCount == 0 {
-			cacheType = cacheTypeCausal
-		}
-		cache.SetLayer(i)
-		wc := cache.(*kvcache.WrapperCache)
-		wc.SetLayerType(cacheType)
+		if cache != nil {
+			cacheType := cacheTypeSWA
+			if (i+1)%gemmaGlobalCacheCount == 0 {
+				cacheType = cacheTypeCausal
+			}
+			cache.SetLayer(i)
+			wc := cache.(*kvcache.WrapperCache)
+			wc.SetLayerType(cacheType)
 
-		if causal, ok := wc.UnderlyingCache().(*kvcache.Causal); ok {
-			causal.SetCausal(ctx, kvcache.CausalOptions{Except: except})
+			if causal, ok := wc.UnderlyingCache().(*kvcache.Causal); ok {
+				causal.SetCausal(ctx, kvcache.CausalOptions{Except: except})
+			}
 		}
 
 		var lastLayerOutputs ml.Tensor
diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go
index 1333151b..ec038a28 100644
--- a/model/models/gemma3n/model_text.go
+++ b/model/models/gemma3n/model_text.go
@@ -29,9 +29,9 @@ type TextModel struct {
 }
 
 func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 	// Create a tensor of a single float32 value of 1.0 to use for altup correction
-	one := ctx.Input().FromFloatSlice([]float32{1.0}, 1)
+	one := ctx.Input().FromFloats([]float32{1.0}, 1)
 
 	inputs := m.TokenEmbedding.Forward(ctx, batch.Inputs, math.Sqrt(float64(m.hiddenSize)))
 	inputsPerLayer := m.PerLayerProjector.Forward(ctx, batch, inputs, &m.TextOptions)
diff --git a/model/models/gptoss/model.go b/model/models/gptoss/model.go
index 6a327065..08bf753d 100644
--- a/model/models/gptoss/model.go
+++ b/model/models/gptoss/model.go
@@ -30,9 +30,9 @@ type Transformer struct {
 // Forward implements model.Model.
 func (m *Transformer) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
-	one := ctx.Input().FromFloatSlice([]float32{1}, 1)
+	one := ctx.Input().FromFloats([]float32{1}, 1)
 	for i, block := range m.TransformerBlocks {
 		m.Cache.SetLayer(i)
 		if c, ok := m.Cache.(*kvcache.WrapperCache); ok {
diff --git a/model/models/llama/model.go b/model/models/llama/model.go
index c03f04a0..52c66ba5 100644
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -179,7 +179,7 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tenso
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 
diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go
index e80fbaed..5eeac07c 100644
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@@ -76,7 +76,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}
 
-	tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
+	tilesLocal := ctx.Input().FromFloats(pixelsLocal, size.X, size.Y, m.numChannels)
 
 	ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
 
@@ -87,7 +87,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 	pixelValues := tilesLocal
 
 	if len(pixelsGlobal) > 0 {
-		tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
+		tilesGlobal := ctx.Input().FromFloats(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
 		pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
 	}
 
@@ -174,7 +174,7 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache), nil
 }
 
diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go
index e056391f..96b5d24d 100644
--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@@ -211,7 +211,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
 		}
 
-		attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
+		attentionScales = ctx.Input().FromFloats(scales, 1, 1, len(scales))
 	}
 
 	for i, layer := range m.Layers {
diff --git a/model/models/llama4/model_vision.go b/model/models/llama4/model_vision.go
index dc6f82b8..1aa50aec 100644
--- a/model/models/llama4/model_vision.go
+++ b/model/models/llama4/model_vision.go
@@ -245,7 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
 		}
 	}
 
-	ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
+	ropeFreqs := ctx.Input().FromFloats(freqs, freqDim/2, numPatches, 2)
 
 	ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go
index 5c46615e..e071d71a 100644
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -114,7 +114,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}
 
-	pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
+	pixelValues := ctx.Input().FromFloats(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
 
 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
@@ -158,7 +158,7 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache), nil
 }
diff --git a/model/models/mistral3/model_vision.go b/model/models/mistral3/model_vision.go
index 3bfb8c90..ce3110c7 100644
--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -110,8 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
 		}
 	}
 
-	h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
-	w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
+	h := ctx.Input().FromFloats(frequenciesHeight, maxPatchesPerSide, frequencies/2)
+	w := ctx.Input().FromFloats(frequenciesWidth, maxPatchesPerSide, frequencies/2)
 
 	h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 	w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
@@ -144,7 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		}
 	}
 
-	positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
+	positionIDs := ctx.Input().FromInts(positions, len(positions))
 
 	positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
 	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go
index 76974369..58fd5adc 100644
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -80,8 +80,8 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
 	}
 
-	pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
-	aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
+	pixelValues := ctx.Input().FromFloats(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
+	aspectRatio := ctx.Input().FromInts([]int32{int32(ratio.rank)}, 1)
 
 	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
 	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
@@ -106,7 +106,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
 	}
 
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	// TODO: attention mask, cross attention mask
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
diff --git a/model/models/models.go b/model/models/models.go
index 0cda615a..deefeb58 100644
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -14,4 +14,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
+	_ "github.com/ollama/ollama/model/models/qwen3vl"
 )
diff --git a/model/models/qwen2/model.go b/model/models/qwen2/model.go
index 2e234710..10a1e65c 100644
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -102,7 +102,7 @@ type Model struct {
 
 // Forward implements model.Model.
 func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 
diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index 6898e38c..13fa3fee 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -69,7 +69,7 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width
 
-	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+	pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
 
 	return pixelValues, grid, nil
 }
@@ -139,7 +139,7 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache)
 }
diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go
index 3dd60e3b..88b2c005 100644
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -43,7 +43,7 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int
 		}
 	}
 
-	mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
+	mask := ctx.Input().FromFloats(flat, seqLength, seqLength)
 
 	// Reshape to match [seqLength, seqLength, 1] for broadcasting
 	mask = mask.Reshape(ctx, seqLength, seqLength, 1)
@@ -299,7 +299,7 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)
 		}
 	}
 
-	t := ctx.Input().FromIntSlice(index, len(index))
+	t := ctx.Input().FromInts(index, len(index))
 
 	return t, bounds
 }
@@ -319,7 +319,7 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 			freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
 		}
 	}
-	freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
+	freqs := ctx.Input().FromFloats(freqVals, freq, maxGridSize)
 
 	// Create position coordinates (y,x pairs) for the grid
 	// In PyTorch: Equivalent to generating position ids with torch.arange()
@@ -329,7 +329,7 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 			coords = append(coords, int32(y), int32(x))
 		}
 	}
-	pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
+	pos := ctx.Input().FromInts(coords, 2, grid.Width, grid.Height)
 
 	// Reshape and permute positions to match spatial merging pattern
 	pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
diff --git a/model/models/qwen3/model.go b/model/models/qwen3/model.go
index 9fd6e313..483439ac 100644
--- a/model/models/qwen3/model.go
+++ b/model/models/qwen3/model.go
@@ -3,6 +3,7 @@ package qwen3
 import (
 	"cmp"
 	"math"
+	"strings"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -181,7 +182,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 
 // Forward implements model.Model.
 func (m *Model) forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
 
 	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 
@@ -210,7 +211,7 @@ var _ model.Model = (*Model)(nil)
 func New(c fs.Config) (model.Model, error) {
 	layers := make([]Layer, c.Uint("block_count"))
 	for i := range layers {
-		if c.String("general.architecture") == "qwen3moe" {
+		if strings.HasSuffix(c.String("general.architecture"), "moe") {
 			layers[i].MLP = &sparse{}
 		} else {
 			layers[i].MLP = &dense{}
diff --git a/model/models/qwen3vl/imageprocessor.go b/model/models/qwen3vl/imageprocessor.go
new file mode 100644
index 00000000..621167f5
--- /dev/null
+++ b/model/models/qwen3vl/imageprocessor.go
@@ -0,0 +1,194 @@
+package qwen3vl
+
+import (
+	"fmt"
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor contains configuration for the Qwen 3 VL image processing
+type ImageProcessor struct {
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	mergeSize         int
+	shortestEdge      int
+	longestEdge       int
+	factor            int
+	rescaleFactor     float32
+	imageMean         []float32
+	imageStd          []float32
+}
+
+// newImageProcessor creates a new image processor with default values
+func newImageProcessor(c fs.Config) ImageProcessor {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+
+	return ImageProcessor{
+		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		patchSize:         patchSize,
+		temporalPatchSize: 2,
+		mergeSize:         mergeSize,
+		shortestEdge:      int(c.Uint("vision.shortest_edge", 64<<10)),
+		// FIXME(mxyng): the model defined longest edge (16M) is too large for the default
+		// context length of 8K and will panic. Adjusting to 2M for now.
+		// longestEdge:   int(c.Uint("vision.longest_edge", 16<<20)),
+		longestEdge:   2 << 20,
+		factor:        patchSize * mergeSize,
+		rescaleFactor: 1.0 / 255.0,
+		imageMean:     c.Floats("vision.image_mean", imageproc.ImageNetStandardMean[:]),
+		imageStd:      c.Floats("vision.image_std", imageproc.ImageNetStandardSTD[:]),
+	}
+}
+
+// SmartResize implements the smart resize algorithm
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+
+	if height < factor || width < factor {
+		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
+	} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
+		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
+	}
+
+	round := func(x float64) int { return int(math.RoundToEven(x)) }
+
+	hBar := round(float64(height)/float64(factor)) * factor
+	wBar := round(float64(width)/float64(factor)) * factor
+
+	if hBar*wBar > p.longestEdge {
+		beta := math.Sqrt(float64(height*width) / float64(p.longestEdge))
+
+		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
+		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
+	} else if hBar*wBar < p.shortestEdge {
+		beta := math.Sqrt(float64(p.shortestEdge) / float64(height*width))
+
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+type Grid struct {
+	Height   int
+	Width    int
+	Temporal int
+}
+
+func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) {
+	origWidth := img.Bounds().Dx()
+	origHeight := img.Bounds().Dy()
+
+	// Calculate smart resize dimensions
+	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
+
+	// Resize image using existing functions
+	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
+
+	normalizedPixels := imageproc.Normalize(
+		resizedImg,
+		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
+		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		true, // rescale
+		true, // channelFirst
+	)
+
+	// Calculate grid dimensions
+	grid := &Grid{
+		Height:   resizedHeight / p.patchSize,
+		Width:    resizedWidth / p.patchSize,
+		Temporal: 1, // For single images, temporal dimension is 1
+	}
+
+	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create patches: %v", err)
+	}
+
+	patchDim := p.numChannels * p.temporalPatchSize *
+		p.patchSize * p.patchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+
+	pixelValues := ctx.Input().FromFloats(patches, patchDim, numPatches)
+
+	// Return patches and grid dimensions
+	return pixelValues, grid, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
+	channels := p.numChannels
+	patchSize := p.patchSize
+	mergeSize := p.mergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	// Calculate output dimensions
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
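+	// With the defaults here (3 channels, temporal patch size 2, patch size 14), each patch holds 3*2*14*14 = 1176 floats.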
+
+	result := make([]float32, numPatches*patchDim)
+	patchIndex := 0
+
+	// Single temporal frame handling (copies to all frames)
+	for range grid.Temporal {
+		for h := 0; h < grid.Height; h += mergeSize {
+			for w := 0; w < grid.Width; w += mergeSize {
+				// Handle the mergeSize x mergeSize block of merged patches (2x2 by default)
+				for mh := range mergeSize {
+					for mw := range mergeSize {
+						baseOffset := patchIndex * patchDim
+
+						// Extract patch data for first temporal frame
+						for c := range channels {
+							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+
+							for py := range patchSize {
+								for px := range patchSize {
+									// Calculate source pixel coordinates
+									y := (h+mh)*patchSize + py
+									x := (w+mw)*patchSize + px
+
+									// Source index in input tensor (CHW format)
+									srcIdx := c*height*width + y*width + x
+
+									// Destination index in first temporal frame
+									dstIdx := channelOffset + (py * patchSize) + px
+
+									if srcIdx < len(pixels) && dstIdx < len(result) {
+										result[dstIdx] = pixels[srcIdx]
+									}
+								}
+							}
+						}
+
+						// Copy first temporal frame to all other frames
+						if temporalPatchSize > 1 {
+							for c := range channels {
+								channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+								firstFrameOffset := channelOffset
+								frameSize := patchSize * patchSize
+
+								// Copy first frame to all other frames
+								for tp := 1; tp < temporalPatchSize; tp++ {
+									currentFrameOffset := channelOffset + (tp * frameSize)
+									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
+										result[firstFrameOffset:firstFrameOffset+frameSize])
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil
+}
diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go
new file mode 100644
index 00000000..08beb37c
--- /dev/null
+++ b/model/models/qwen3vl/model.go
@@ -0,0 +1,204 @@
+package qwen3vl
+
+import (
+	"bytes"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Model struct {
+	model.Base
+	model.TextProcessor
+
+	*TextModel
+	*VisionModel `gguf:"v"`
+
+	ImageProcessor
+
+	positionCache []int32
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	pixelValues, grid, err := m.ProcessImage(ctx, img)
+	if err != nil {
+		return nil, err
+	}
+
+	// Calculate tensor dimensions
+	visionOutputs, deepstackVisualEmbeds := m.VisionModel.Forward(ctx, pixelValues, grid)
+	mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
+	for i := range deepstackVisualEmbeds {
+		mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
+	}
+
+	return mm, nil
+}
+
+var (
+	tokenVision      int32 = 151655
+	tokenVisionStart int32 = 151652
+	tokenVisionEnd   int32 = 151653
+)
+
+type modelInput struct {
+	*input.Input
+	position int32
+}
+
+// PostTokenize arranges Qwen 3 VL's inputs for the forward pass
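+// Each image expands into a vision-start token, one vision placeholder token per merged patch (the first of
+// these carries the image tensor and a SameBatch count so the whole group runs in a single batch), and a
+// vision-end token.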
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+	m.positionCache = m.positionCache[:0]
+	return slices.Collect(func(yield func(*input.Input) bool) {
+		for i := range inputs {
+			s := []modelInput{{Input: inputs[i]}}
+			if mm := inputs[i].Multimodal; mm != nil {
+				t := mm[0].Tensor
+				s = slices.Repeat([]modelInput{
+					{
+						position: int32(i + 1),
+						Input:    &input.Input{Token: tokenVision},
+					},
+				}, t.Dim(1)+1+1)
+
+				s[0] = modelInput{
+					Input:    &input.Input{Token: tokenVisionStart},
+					position: int32(i),
+				}
+
+				s[len(s)-1] = modelInput{
+					Input:    &input.Input{Token: tokenVisionEnd},
+					position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
+				}
+
+				s[1] = modelInput{
+					Input: &input.Input{
+						Token:          tokenVision,
+						Multimodal:     inputs[i].Multimodal,
+						MultimodalHash: inputs[i].MultimodalHash,
+						SameBatch:      t.Dim(1),
+					},
+					position: int32(i + 1),
+				}
+			}
+
+			for _, e := range s {
+				position := e.position
+				if position == 0 && len(m.positionCache) > 0 {
+					position = m.positionCache[len(m.positionCache)-1] + 1
+				}
+
+				m.positionCache = append(m.positionCache, position)
+				if !yield(e.Input) {
+					return
+				}
+			}
+		}
+	}), nil
+}
+
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
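+	// M-RoPE uses three position rows (temporal, height, width). Text tokens carry the same value in all
+	// three rows; image tokens get per-row/per-column offsets added below.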
+	positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions)))
+	for i, id := range batch.Positions {
+		if id < int32(len(m.positionCache)) {
+			id = m.positionCache[id]
+		} else if len(m.positionCache) > 0 {
+			id = id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
+		}
+
+		positionSlice[0][i] = id
+		positionSlice[1][i] = id
+		positionSlice[2][i] = id
+	}
+
+	hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
+
+	var deepstackVisualEmbeds []ml.Tensor
+	for _, mi := range batch.Multimodal {
+		visionOutputs := mi.Multimodal[0].Tensor
+		ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+
+		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
+			for i := range visionOutputs.Dim(1) {
+				w := grid.Width / m.spatialMergeSize
+				positionSlice[1][mi.Index+i] += int32(i / w)
+				positionSlice[2][mi.Index+i] += int32(i % w)
+			}
+		}
+
+		deepstackVisualEmbeds = make([]ml.Tensor, len(mi.Multimodal[1:]))
+		for i, mm := range mi.Multimodal[1:] {
+			deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
+			ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
+		}
+	}
+
+	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice))
+	cos, sin := m.rotaryEmbedding(ctx, positions)
+	for i, layer := range m.TextModel.Layers {
+		if m.Cache != nil {
+			m.Cache.SetLayer(i)
+		}
+
+		var outputs ml.Tensor
+		if i == len(m.TextModel.Layers)-1 {
+			outputs = batch.Outputs
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options)
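+		// Deepstack: the earliest text layers also receive vision features captured at selected depths of the vision encoder.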
+		if i < len(deepstackVisualEmbeds) {
+			hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
+		}
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, 1e-06)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+func New(c fs.Config) (model.Model, error) {
+	m := Model{
+		TextProcessor: model.NewBytePairEncoding(
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		),
+		TextModel:      newTextModel(c),
+		VisionModel:    newVisionModel(c),
+		ImageProcessor: newImageProcessor(c),
+	}
+
+	m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) {
+		m.positionCache = nil
+		return nil, kvcache.ErrNotSupported
+	})
+	return &m, nil
+}
+
+func init() {
+	model.Register("qwen3vl", New)
+	model.Register("qwen3vlmoe", New)
+}
diff --git a/model/models/qwen3vl/model_text.go b/model/models/qwen3vl/model_text.go
new file mode 100644
index 00000000..14e7d7dc
--- /dev/null
+++ b/model/models/qwen3vl/model_text.go
@@ -0,0 +1,229 @@
+package qwen3vl
+
+import (
+	"cmp"
+	"math"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type TextOptions struct {
+	hiddenSize,
+	numHeads,
+	numKVHeads,
+	keyLength,
+	valueLength int
+
+	eps,
+	ropeBase,
+	ropeScale float32
+	mropeSections []int
+
+	numExperts, numExpertsUsed int
+	normTopKProb               bool
+
+	inverseFrequenciesCache []float32
+}
+
+func (o TextOptions) headDim() int {
+	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
+}
+
+type TextAttention struct {
+	Query     *nn.Linear  `gguf:"attn_q"`
+	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
+	Key       *nn.Linear  `gguf:"attn_k"`
+	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
+	Value     *nn.Linear  `gguf:"attn_v"`
+	Output    *nn.Linear  `gguf:"attn_output"`
+}
+
+func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+
+	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
+	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
+	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
+
+	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
+	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
+
+	query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
+	key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
+
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
+	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
+	return sa.Output.Forward(ctx, attention)
+}
+
+type TextMLP interface {
+	Forward(ml.Context, ml.Tensor, *TextOptions) ml.Tensor
+}
+
+type sparse struct {
+	Router *nn.Linear      `gguf:"ffn_gate_inp"`
+	Gate   *nn.LinearBatch `gguf:"ffn_gate_exps"`
+	Up     *nn.LinearBatch `gguf:"ffn_up_exps"`
+	Down   *nn.LinearBatch `gguf:"ffn_down_exps"`
+}
+
+func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
+	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
+	routerLogits := mlp.Router.Forward(ctx, hiddenStates)
+
+	routingWeights := routerLogits.Softmax(ctx)
+	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
+	if opts.normTopKProb {
+		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
+		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
+		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
+	}
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
+
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates, selectedExperts))
+
+	experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
+	experts = experts.Mul(ctx, routingWeights)
+
+	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
+	for i := 1; i < opts.numExpertsUsed; i++ {
+		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
+	}
+
+	return nextStates
+}
+
+type dense struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *TextOptions) ml.Tensor {
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+type TextLayer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	*TextAttention
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	TextMLP
+}
+
+func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts)
+
+	if outputs != nil {
+		hiddenStates = hiddenStates.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.TextMLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type TextModel struct {
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	Layers []TextLayer `gguf:"blk"`
+
+	Options *TextOptions
+}
+
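+// rotaryEmbedding builds the cos/sin tables for multimodal RoPE: a shared inverse-frequency table is applied to
+// the three position rows, and the height and width rows are interleaved into the temporal row per mropeSections.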
+func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) {
+	positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1))
+	if len(m.Options.inverseFrequenciesCache) == 0 {
+		m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2)
+		for i := range m.Options.inverseFrequenciesCache {
+			frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim())))
+			m.Options.inverseFrequenciesCache[i] = 1 / frequency
+		}
+	}
+
+	inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache))
+
+	positions = positions.Cast(ctx, ml.DTypeF32)
+	frequencies := inverseFrequencies.Mulmat(ctx, positions)
+
+	interleaved := frequencies.View(ctx,
+		0, frequencies.Dim(0),
+		frequencies.Stride(1), frequencies.Dim(1),
+	)
+
+	for _, i := range []int{1, 2} {
+		args := []int{
+			i * frequencies.Stride(0), 1,
+			3 * frequencies.Stride(0), m.Options.mropeSections[i],
+			frequencies.Stride(1), frequencies.Dim(1),
+		}
+
+		ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...).
+			Copy(ctx, interleaved.View(ctx, args[0], args[1:]...)))
+	}
+
+	interleaved = interleaved.Concat(ctx, interleaved, 0)
+	interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2))
+	return interleaved.Cos(ctx), interleaved.Sin(ctx)
+}
+
+var _ model.Model = (*Model)(nil)
+
+func newTextModel(c fs.Config) *TextModel {
+	layers := make([]TextLayer, c.Uint("block_count"))
+	for i := range layers {
+		if strings.HasSuffix(c.String("general.architecture"), "moe") {
+			layers[i].TextMLP = &sparse{}
+		} else {
+			layers[i].TextMLP = &dense{}
+		}
+	}
+
+	m := TextModel{
+		Layers: layers,
+		Options: &TextOptions{
+			hiddenSize:     int(c.Uint("embedding_length")),
+			numHeads:       int(c.Uint("attention.head_count")),
+			numKVHeads:     int(c.Uint("attention.head_count_kv")),
+			keyLength:      int(c.Uint("attention.key_length")),
+			valueLength:    int(c.Uint("attention.value_length")),
+			eps:            c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:       c.Float("rope.freq_base"),
+			ropeScale:      c.Float("rope.scaling.factor", 1),
+			numExperts:     int(c.Uint("expert_count")),
+			numExpertsUsed: int(c.Uint("expert_used_count")),
+			normTopKProb:   c.Bool("norm_top_k_prob", true),
+			mropeSections: slices.Collect(func(yield func(int) bool) {
+				for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
+					if !yield(int(section)) {
+						return
+					}
+				}
+			}),
+		},
+	}
+
+	return &m
+}
diff --git a/model/models/qwen3vl/model_vision.go b/model/models/qwen3vl/model_vision.go
new file mode 100644
index 00000000..69118666
--- /dev/null
+++ b/model/models/qwen3vl/model_vision.go
@@ -0,0 +1,268 @@
+package qwen3vl
+
+import (
+	"iter"
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+type VisionAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_out"`
+}
+
+func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
+	x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
+	x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
+	return x2.Scale(ctx, -1).Concat(ctx, x1, 0)
+}
+
+func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
+	return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
+}
+
+func (sa *VisionAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
+	query := sa.Query.Forward(ctx, hiddenStates)
+	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, query.Dim(1))
+	query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
+
+	key := sa.Key.Forward(ctx, hiddenStates)
+	key = key.Reshape(ctx, opts.headDim(), opts.numHeads, key.Dim(1))
+	key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
+
+	value := sa.Value.Forward(ctx, hiddenStates)
+	value = value.Reshape(ctx, opts.headDim(), opts.numHeads, value.Dim(1))
+
+	attention := nn.Attention(ctx, query, key, value, math.Pow(float64(opts.headDim()), -0.5), nil)
+	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2))
+	return sa.Output.Forward(ctx, attention)
+}
+
+type VisionMLP struct {
+	FC1 *nn.Linear `gguf:"linear_fc1"`
+	FC2 *nn.Linear `gguf:"linear_fc2"`
+}
+
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts VisionOptions) ml.Tensor {
+	return mlp.FC2.Forward(ctx, mlp.FC1.Forward(ctx, hiddenStates).GELU(ctx))
+}
+
+type VisionEncoderLayer struct {
+	Norm1     *nn.LayerNorm `gguf:"norm1"`
+	Attention *VisionAttention
+	Norm2     *nn.LayerNorm `gguf:"norm2"`
+	MLP       *VisionMLP    `gguf:"mlp"`
+}
+
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = e.Attention.Forward(ctx, hiddenStates, cos, sin, opts)
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type VisionOptions struct {
+	hiddenSize,
+	numHeads,
+	patchSize,
+	numChannels,
+	spatialMergeSize,
+	temporalPatchSize,
+	gridPerSide int
+
+	eps,
+	ropeTheta float32
+
+	deepstackVisualIndexes []int32
+	mropeSections          []int
+}
+
+func (o VisionOptions) headDim() int {
+	return o.hiddenSize / o.numHeads
+}
+
+type VisionPatchMerger struct {
+	Norm *nn.LayerNorm `gguf:"norm"`
+	FC1  *nn.Linear    `gguf:"linear_fc1"`
+	FC2  *nn.Linear    `gguf:"linear_fc2"`
+}
+
+func (m *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, postshuffleNorm bool, opts VisionOptions) ml.Tensor {
+	hiddenSize := opts.hiddenSize * opts.spatialMergeSize * opts.spatialMergeSize
+	if postshuffleNorm {
+		visionOutputs = visionOutputs.Reshape(ctx, hiddenSize, -1)
+	}
+
+	visionOutputs = m.Norm.Forward(ctx, visionOutputs, opts.eps)
+	visionOutputs = visionOutputs.Reshape(ctx, hiddenSize, -1)
+	return m.FC2.Forward(ctx, m.FC1.Forward(ctx, visionOutputs).GELU(ctx))
+}
+
+type VisionPositionEmbedding struct {
+	PositionEmbedding *nn.Embedding `gguf:"pos_embed"`
+}
+
+func makeSlice2D[T int32 | float32](n0, n1 int) iter.Seq[[]T] {
+	return func(yield func([]T) bool) {
+		for range n0 {
+			if !yield(make([]T, n1)) {
+				return
+			}
+		}
+	}
+}
+
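+// Forward bilinearly interpolates the learned gridPerSide x gridPerSide position embeddings onto the image's
+// patch grid, reorders them to match the spatial-merge layout, and adds them to the hidden states.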
+func (m *VisionPositionEmbedding) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts VisionOptions) ml.Tensor {
+	indexSlice := slices.Collect(makeSlice2D[int32](4, grid.Height*grid.Width))
+	weightSlice := slices.Collect(makeSlice2D[float32](4, grid.Height*grid.Width))
+
+	stepHeight := float32(opts.gridPerSide-1) / float32(grid.Height-1)
+	stepWidth := float32(opts.gridPerSide-1) / float32(grid.Width-1)
+
+	var i int
+	for h := range grid.Height {
+		for w := range grid.Width {
+			y, x := float32(h)*stepHeight, float32(w)*stepWidth
+
+			floorY, floorX := int32(y), int32(x)
+			ceilY, ceilX := min(floorY+1, int32(opts.gridPerSide-1)), min(floorX+1, int32(opts.gridPerSide-1))
+
+			indexSlice[0][i] = floorY*int32(opts.gridPerSide) + floorX
+			indexSlice[1][i] = floorY*int32(opts.gridPerSide) + ceilX
+			indexSlice[2][i] = ceilY*int32(opts.gridPerSide) + floorX
+			indexSlice[3][i] = ceilY*int32(opts.gridPerSide) + ceilX
+
+			weightSlice[0][i] = (1 - (y - float32(floorY))) * (1 - (x - float32(floorX)))
+			weightSlice[1][i] = (1 - (y - float32(floorY))) * (x - float32(floorX))
+			weightSlice[2][i] = (y - float32(floorY)) * (1 - (x - float32(floorX)))
+			weightSlice[3][i] = (y - float32(floorY)) * (x - float32(floorX))
+
+			i++
+		}
+	}
+
+	indices := ctx.Input().FromInts(slices.Concat(indexSlice...), grid.Height*grid.Width*4)
+	weights := ctx.Input().FromFloats(slices.Concat(weightSlice...), 1, grid.Height*grid.Width*4)
+
+	n := hiddenStates.Dim(0)
+	positionEmbeds := m.PositionEmbedding.Forward(ctx, indices)
+	positionEmbeds = positionEmbeds.Mul(ctx, weights)
+	positionEmbeds = positionEmbeds.Reshape(ctx, n, -1, 4)
+
+	positionEmbeds = positionEmbeds.View(ctx, 0, n, positionEmbeds.Stride(1), grid.Height*grid.Width).
+		Add(ctx, positionEmbeds.View(ctx, 1*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
+		Add(ctx, positionEmbeds.View(ctx, 2*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
+		Add(ctx, positionEmbeds.View(ctx, 3*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width))
+
+	positionEmbeds = positionEmbeds.Reshape(ctx, -1, grid.Width/opts.spatialMergeSize, opts.spatialMergeSize, grid.Height/opts.spatialMergeSize)
+	positionEmbeds = positionEmbeds.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, n, -1)
+	return hiddenStates.Add(ctx, positionEmbeds)
+}
+
+type VisionModel struct {
+	PatchEmbedding    *nn.Conv3D `gguf:"patch_embed"`
+	PositionEmbedding *VisionPositionEmbedding
+	Layers            []VisionEncoderLayer `gguf:"blk"`
+	PatchMerger       *VisionPatchMerger   `gguf:"merger"`
+	DeepstackMerger   []*VisionPatchMerger `gguf:"deepstack_merger"`
+
+	VisionOptions
+}
+
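+// positions builds interleaved (row, column) indices for every patch in spatial-merge order and returns the
+// cos/sin tables for the vision encoder's 2D rotary embedding.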
+func (m *VisionModel) positions(ctx ml.Context, grid *Grid) (_, _ ml.Tensor) {
+	indices := ctx.Input().FromInts(slices.Collect(func(yield func(int32) bool) {
+		for y := range grid.Height {
+			for x := range grid.Width {
+				if !yield(int32(y)) {
+					return
+				}
+				if !yield(int32(x)) {
+					return
+				}
+			}
+		}
+	}), grid.Width*grid.Height*2)
+
+	indices = indices.Reshape(ctx, -1, grid.Width/m.spatialMergeSize, m.spatialMergeSize, grid.Height/m.spatialMergeSize)
+	indices = indices.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	indices = indices.Reshape(ctx, -1)
+
+	halfDim := m.headDim() / 2
+	maxGrid := max(grid.Height, grid.Width)
+	frequencies := ctx.Input().FromFloats(slices.Collect(func(yield func(float32) bool) {
+		ropeTheta := float64(m.ropeTheta)
+		for i := range maxGrid {
+			for j := range halfDim / 2 {
+				if !yield(float32(i) / float32(math.Pow(ropeTheta, float64(j*2)/float64(halfDim)))) {
+					return
+				}
+			}
+		}
+	}), halfDim/2, maxGrid)
+
+	embeds := frequencies.Rows(ctx, indices)
+	embeds = embeds.Reshape(ctx, halfDim, 1, -1)
+	embeds = embeds.Concat(ctx, embeds, 0)
+	return embeds.Cos(ctx), embeds.Sin(ctx)
+}
+
+// Forward computes the vision model for an input tensor
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) (ml.Tensor, []ml.Tensor) {
+	pixelValues = pixelValues.Reshape(ctx, m.patchSize, m.patchSize, m.temporalPatchSize, -1)
+	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.numChannels, m.patchSize, m.patchSize, m.temporalPatchSize, 0, 0, 0, 1, 1, 1)
+	hiddenStates = m.PositionEmbedding.Forward(ctx, hiddenStates, grid, m.VisionOptions)
+
+	cos, sin := m.positions(ctx, grid)
+
+	deepstackStates := make([]ml.Tensor, len(m.deepstackVisualIndexes))
+	for i, layer := range m.Layers {
+		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, m.VisionOptions)
+		if i := slices.Index(m.deepstackVisualIndexes, int32(i)); i >= 0 {
+			deepstackStates[i] = m.DeepstackMerger[i].Forward(ctx, hiddenStates, true, m.VisionOptions)
+		}
+	}
+
+	hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, false, m.VisionOptions)
+	return hiddenStates, deepstackStates
+}
+
+// newVisionModel creates a new instance of the Qwen 3 VL vision model
+func newVisionModel(c fs.Config) *VisionModel {
+	deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
+	model := &VisionModel{
+		Layers:          make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
+		DeepstackMerger: make([]*VisionPatchMerger, len(deepstackVisualIndexes)),
+		VisionOptions: VisionOptions{
+			hiddenSize:        int(c.Uint("vision.embedding_length", 1280)),
+			numHeads:          int(c.Uint("vision.attention.head_count", 16)),
+			patchSize:         int(c.Uint("vision.patch_size", 14)),
+			numChannels:       int(c.Uint("vision.num_channels", 3)),
+			eps:               c.Float("vision.attention.layer_norm_epsilon", 1e-6),
+			ropeTheta:         c.Float("vision.rope.freq_base", 10000.0),
+			spatialMergeSize:  int(c.Uint("vision.spatial_merge_size", 2)),
+			temporalPatchSize: int(c.Uint("vision.temporal_patch_size", 2)),
+			gridPerSide:       int(math.Sqrt(float64(c.Uint("vision.num_positional_embeddings", 2304)))),
+			mropeSections: slices.Collect(func(yield func(int) bool) {
+				for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
+					if !yield(int(section)) {
+						return
+					}
+				}
+			}),
+			deepstackVisualIndexes: deepstackVisualIndexes,
+		},
+	}
+
+	return model
+}
diff --git a/model/parsers/qwen3vl.go b/model/parsers/qwen3vl.go
index a8e7376c..87f49e89 100644
--- a/model/parsers/qwen3vl.go
+++ b/model/parsers/qwen3vl.go
@@ -16,6 +16,8 @@ const (
 	CollectingThinkingContent qwenParserState = iota
 	CollectingContent
 	CollectingToolContent
+	ThinkingDoneEatingWhitespace
+	ToolCallDoneEatingWhitespace
 )
 
 const (
@@ -111,17 +113,28 @@ func (p *Qwen3VLParser) parseEvents() []qwenEvent {
 	return all
 }
 
-func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent {
+func splitAtTag(p *Qwen3VLParser, tag string, trimAfter bool) (string, string) {
 	split := strings.SplitN(p.buffer.String(), tag, 2)
 	before := split[0]
 	before = strings.TrimRightFunc(before, unicode.IsSpace)
-	if len(before) > 0 {
-		events = append(events, qwenEventContent{content: before})
-	}
 	after := split[1]
+	if trimAfter {
+		after = strings.TrimLeftFunc(after, unicode.IsSpace)
+	}
 	p.buffer.Reset()
 	p.buffer.WriteString(after)
-	return events
+	return before, after
+}
+
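+// eatLeadingWhitespaceAndTransitionTo drops whitespace that may stream in across several chunks after a closing
+// tag. Once non-whitespace arrives, the parser switches to nextState and reprocesses the remaining buffer.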
+func (p *Qwen3VLParser) eatLeadingWhitespaceAndTransitionTo(nextState qwenParserState) ([]qwenEvent, bool) {
+	trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
+	p.buffer.Reset()
+	if trimmed == "" {
+		return nil, false
+	}
+	p.state = nextState
+	p.buffer.WriteString(trimmed)
+	return nil, true
 }
 
 func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
@@ -130,7 +143,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 	switch p.state {
 	case CollectingContent:
 		if strings.Contains(p.buffer.String(), toolOpenTag) {
-			events = emitContentBeforeTag(p, events, toolOpenTag)
+			before, _ := splitAtTag(p, toolOpenTag, false)
+			if len(before) > 0 {
+				events = append(events, qwenEventContent{content: before})
+			}
 			p.state = CollectingToolContent
 			return events, true
 		} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 {
@@ -167,27 +184,26 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 				slog.Warn("qwen tool call closing tag found but no content before it")
 			}
 
-			after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
+			after := split[1]
 			events = append(events, qwenEventRawToolCall{raw: before})
 			p.buffer.Reset()
 			p.buffer.WriteString(after)
-			p.state = CollectingContent
+			p.state = ToolCallDoneEatingWhitespace
 			return events, true
 		} else {
 			return events, false
 		}
 	case CollectingThinkingContent:
 		if strings.Contains(p.buffer.String(), thinkingCloseTag) {
-			split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2)
-			// before := split[0]
-			before := strings.TrimRightFunc(split[0], unicode.IsSpace)
-			after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
-			if len(before) > 0 {
-				events = append(events, qwenEventThinkingContent{content: before})
+			thinking, remaining := splitAtTag(p, thinkingCloseTag, true)
+			if len(thinking) > 0 {
+				events = append(events, qwenEventThinkingContent{content: thinking})
+			}
+			if remaining == "" {
+				p.state = ThinkingDoneEatingWhitespace
+			} else {
+				p.state = CollectingContent
 			}
-			p.buffer.Reset()
-			p.buffer.WriteString(after)
-			p.state = CollectingContent
 			return events, true
 		} else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 {
 			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
@@ -215,6 +231,10 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 			}
 			return events, false
 		}
+	case ThinkingDoneEatingWhitespace:
+		return p.eatLeadingWhitespaceAndTransitionTo(CollectingContent)
+	case ToolCallDoneEatingWhitespace:
+		return p.eatLeadingWhitespaceAndTransitionTo(CollectingContent)
 	default:
 		panic("unreachable")
 	}
diff --git a/model/parsers/qwen3vl_nonthinking_test.go b/model/parsers/qwen3vl_nonthinking_test.go
index 74392946..e0b9a02b 100644
--- a/model/parsers/qwen3vl_nonthinking_test.go
+++ b/model/parsers/qwen3vl_nonthinking_test.go
@@ -653,3 +653,189 @@ func TestQwen3VLNonThinkingToolParser(t *testing.T) {
 		}
 	}
 }
+
+func TestQwen3VLNonThinkingToolCallWhitespaceHandling(t *testing.T) {
+	type step struct {
+		input      string
+		wantEvents []qwenEvent
+	}
+
+	cases := []struct {
+		desc  string
+		steps []step
+		only  bool
+	}{
+		{
+			desc: "whitespace inside tool call preserves trailing space",
+			steps: []step{
+				{
+					input: "before   tool content   after",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "before"},
+						qwenEventRawToolCall{raw: "   tool content   "},
+						qwenEventContent{content: "after"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace inside tool call preserves trailing space",
+			steps: []step{
+				{
+					input: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t        tool content    \n\n\n\n\n\n\n after",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh"},
+						qwenEventRawToolCall{raw: "   tool content   "},
+						qwenEventContent{content: "after"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace inside tool call preserves trailing space",
+			steps: []step{
+				{
+					input: "   tool content               ",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "   tool content   "},
+					},
+				},
+				{
+					input: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t        anotha one    \n\n\n\n\n\n\n after \n\n\n\n\n\n blep",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "blahhhhhhhhhh blahhhh blahhhh"},
+						qwenEventRawToolCall{raw: "   anotha one   "},
+						qwenEventContent{content: "after \n\n\n\n\n\n blep"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace between content and tool call",
+			steps: []step{
+				{
+					input: "content   \n  tool  \n  more content",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "content"},
+						qwenEventRawToolCall{raw: "tool"},
+						qwenEventContent{content: "more content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "consecutive tool calls with whitespace",
+			steps: []step{
+				{
+					input: "first  \n  second  \n  third",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "first"},
+						qwenEventRawToolCall{raw: "second"},
+						qwenEventRawToolCall{raw: "third"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace before and after tool open tag",
+			steps: []step{
+				{
+					input: "text   \n   content",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "text"},
+						qwenEventRawToolCall{raw: "content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "unicode whitespace around tool calls",
+			steps: []step{
+				{
+					input: "text\u00a0\u3000content\u00a0\u3000text",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "text"},
+						qwenEventRawToolCall{raw: "content"},
+						qwenEventContent{content: "text"},
+					},
+				},
+			},
+		},
+		{
+			desc: "empty tool call with surrounding whitespace",
+			steps: []step{
+				{
+					input: "before    after",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "before"},
+						qwenEventRawToolCall{raw: ""},
+						qwenEventContent{content: "after"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace in tool call split across chunks",
+			steps: []step{
+				{
+					input:      "before  ",
+					wantEvents: []qwenEvent{qwenEventContent{content: "before"}},
+				},
+				{
+					input:      "tool",
+					wantEvents: []qwenEvent{},
+				},
+				{
+					input: "  after",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "  tool  "},
+						qwenEventContent{content: "after"},
+					},
+				},
+			},
+		},
+		{
+			desc: "mixed whitespace types between tool calls",
+			steps: []step{
+				{
+					input: "first \t\n\r second",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "first"},
+						qwenEventRawToolCall{raw: "second"},
+					},
+				},
+			},
+		},
+	}
+
+	anyOnlies := false
+	for _, tc := range cases {
+		if tc.only {
+			anyOnlies = true
+		}
+	}
+
+	for _, tc := range cases {
+		if anyOnlies && !tc.only {
+			continue
+		}
+
+		t.Run(tc.desc, func(t *testing.T) {
+			parser := Qwen3VLParser{hasThinkingSupport: false}
+			parser.Init([]api.Tool{}, nil)
+
+			for i, step := range tc.steps {
+				parser.buffer.WriteString(step.input)
+				gotEvents := parser.parseEvents()
+
+				if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
+					continue
+				}
+
+				if !reflect.DeepEqual(gotEvents, step.wantEvents) {
+					t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
+				}
+			}
+		})
+	}
+}
diff --git a/model/parsers/qwen3vl_thinking_test.go b/model/parsers/qwen3vl_thinking_test.go
index d85a60fd..04b2a7db 100644
--- a/model/parsers/qwen3vl_thinking_test.go
+++ b/model/parsers/qwen3vl_thinking_test.go
@@ -546,3 +546,333 @@ func TestQwen3VLThinkingParserStreamingAssistantPrefillContent(t *testing.T) {
 		}
 	}
 }
+
+func TestQwen3VLThinkingWhitespaceHandling(t *testing.T) {
+	type step struct {
+		input      string
+		wantEvents []qwenEvent
+	}
+
+	cases := []struct {
+		desc  string
+		steps []step
+		only  bool
+	}{
+		{
+			desc: "whitespace after thinking tag is trimmed",
+			steps: []step{
+				{
+					input: "thinking content   \n\t  content starts here",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "thinking content"},
+						qwenEventContent{content: "content starts here"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace after thinking tag split across chunks",
+			steps: []step{
+				{
+					input:      "thinking content   ",
+					wantEvents: []qwenEvent{qwenEventThinkingContent{content: "thinking content"}},
+				},
+				{
+					input:      "  \n\t",
+					wantEvents: []qwenEvent{},
+				},
+				{
+					input: "content",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "only whitespace after thinking tag",
+			steps: []step{
+				{
+					input:      "thinking content   \n\t  ",
+					wantEvents: []qwenEvent{qwenEventThinkingContent{content: "thinking content"}},
+				},
+			},
+		},
+		{
+			desc: "multiple spaces and tabs after thinking",
+			steps: []step{
+				{
+					input: "think     \t\t\n\n   text",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "think"},
+						qwenEventContent{content: "text"},
+					},
+				},
+			},
+		},
+		{
+			desc: "trailing whitespace before thinking tag is preserved in content",
+			steps: []step{
+				{
+					input: "thinking with spaces   text",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "thinking with spaces"},
+						qwenEventContent{content: "text"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace between thinking and tool call",
+			steps: []step{
+				{
+					input: "thinking  \n  {\"name\":\"test\"}",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "thinking"},
+						qwenEventRawToolCall{raw: "{\"name\":\"test\"}"},
+					},
+				},
+			},
+		},
+		{
+			desc: "no whitespace after thinking tag",
+			steps: []step{
+				{
+					input: "thinkingcontent",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "thinking"},
+						qwenEventContent{content: "content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "unicode whitespace after thinking tag",
+			steps: []step{
+				{
+					input: "thinking\u00a0\u3000content",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "thinking"},
+						qwenEventContent{content: "content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace split with partial thinking tag",
+			steps: []step{
+				{
+					input:      "thinking  \n",
+					wantEvents: []qwenEvent{},
+				},
+				{
+					input: "  content",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "empty thinking tag with whitespace after",
+			steps: []step{
+				{
+					input: "   \ncontent",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "content"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace inside tool call preserves trailing space",
+			steps: []step{
+				{
+					input: "bruh \n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t        tool content    \n\n\n\n\n\n\n after",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "bruh"},
+						qwenEventContent{content: "blahhhhhhhhhh blahhhh blahhhh"},
+						qwenEventRawToolCall{raw: "   tool content   "},
+						qwenEventContent{content: "after"},
+					},
+				},
+			},
+		},
+		{
+			desc: "whitespace inside tool call preserves trailing space",
+			steps: []step{
+				{
+					input: "bruh          shdjfhksdhfj  ",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "bruh"},
+						qwenEventContent{content: "shdjfhksdhfj"},
+					},
+				},
+				{
+					input: "another word  ",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "  another word"},
+					},
+				},
+				{
+					input: "   tool content               ",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "   tool content   "},
+					},
+				},
+				{
+					input: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t        anotha one    \n\n\n\n\n\n\n after \n\n\n\n\n\n blep",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "blahhhhhhhhhh blahhhh blahhhh"},
+						qwenEventRawToolCall{raw: "   anotha one   "},
+						qwenEventContent{content: "after \n\n\n\n\n\n blep"},
+					},
+				},
+			},
+		},
+	}
+
+	anyOnlies := false
+	for _, tc := range cases {
+		if tc.only {
+			anyOnlies = true
+		}
+	}
+
+	for _, tc := range cases {
+		if anyOnlies && !tc.only {
+			continue
+		}
+
+		t.Run(tc.desc, func(t *testing.T) {
+			parser := Qwen3VLParser{hasThinkingSupport: true}
+			parser.Init([]api.Tool{}, nil)
+
+			for i, step := range tc.steps {
+				parser.buffer.WriteString(step.input)
+				gotEvents := parser.parseEvents()
+
+				if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
+					continue
+				}
+
+				if !reflect.DeepEqual(gotEvents, step.wantEvents) {
+					t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
+				}
+			}
+		})
+	}
+}
+
+func TestQwen3VLToolCallWhitespaceHandling(t *testing.T) {
+	type step struct {
+		input      string
+		wantEvents []qwenEvent
+	}
+
+	cases := []struct {
+		desc       string
+		steps      []step
+		only       bool
+		prefillMsg *api.Message // allows starting in content mode instead of thinking mode
+	}{
+		{
+			desc:       "whitespace inside tool call is fully preserved (with content prefill)",
+			prefillMsg: &api.Message{Role: "assistant", Content: "prefill"},
+			steps: []step{
+				{
+					input: "before   tool content     \n  after",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "before"},
+						qwenEventRawToolCall{raw: "   tool content   "},
+						qwenEventContent{content: "after"},
+					},
+				},
+			},
+		},
+		{
+			desc:       "whitespace after tool call trimmed across chunks (with content prefill)",
+			prefillMsg: &api.Message{Role: "assistant", Content: "prefill"},
+			steps: []step{
+				{
+					input: "beforetool   ",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "before"},
+						qwenEventRawToolCall{raw: "tool"},
+					},
+				},
+				{
+					input:      "\n\t",
+					wantEvents: []qwenEvent{},
+				},
+				{
+					input: "after \n this is a song",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "after \n this is a song"},
+					},
+				},
+			},
+		},
+		{
+			desc:       "multiple tool calls with whitespace between (with content prefill)",
+			prefillMsg: &api.Message{Role: "assistant", Content: "prefill"},
+			steps: []step{
+				{
+					input: "first  \n  second",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "first"},
+						qwenEventRawToolCall{raw: "second"},
+					},
+				},
+			},
+		},
+		{
+			desc: "thinking with whitespace then tool call",
+			steps: []step{
+				{
+					input: "thinking   \n   tool   \n   content",
+					wantEvents: []qwenEvent{
+						qwenEventThinkingContent{content: "thinking"},
+						qwenEventRawToolCall{raw: "tool"},
+						qwenEventContent{content: "content"},
+					},
+				},
+			},
+		},
+	}
+
+	anyOnlies := false
+	for _, tc := range cases {
+		if tc.only {
+			anyOnlies = true
+		}
+	}
+
+	for _, tc := range cases {
+		if anyOnlies && !tc.only {
+			continue
+		}
+
+		t.Run(tc.desc, func(t *testing.T) {
+			parser := Qwen3VLParser{hasThinkingSupport: true}
+			parser.Init([]api.Tool{}, tc.prefillMsg)
+
+			for i, step := range tc.steps {
+				parser.buffer.WriteString(step.input)
+				gotEvents := parser.parseEvents()
+
+				if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
+					continue
+				}
+
+				if !reflect.DeepEqual(gotEvents, step.wantEvents) {
+					t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
+				}
+			}
+		})
+	}
+}
diff --git a/openai/openai.go b/openai/openai.go
index 23e9522f..650514cf 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -2,7 +2,9 @@
 package openai
 
 import (
+	"bytes"
 	"encoding/base64"
+	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -73,9 +75,10 @@ type JsonSchema struct {
 }
 
 type EmbedRequest struct {
-	Input      any    `json:"input"`
-	Model      string `json:"model"`
-	Dimensions int    `json:"dimensions,omitempty"`
+	Input          any    `json:"input"`
+	Model          string `json:"model"`
+	Dimensions     int    `json:"dimensions,omitempty"`
+	EncodingFormat string `json:"encoding_format,omitempty"` // "float" or "base64"
 }
 
 type StreamOptions struct {
@@ -181,9 +184,9 @@ type Model struct {
 }
 
 type Embedding struct {
-	Object    string    `json:"object"`
-	Embedding []float32 `json:"embedding"`
-	Index     int       `json:"index"`
+	Object    string `json:"object"`
+	Embedding any    `json:"embedding"` // Can be []float32 (float format) or string (base64 format)
+	Index     int    `json:"index"`
 }
 
 type ListCompletion struct {
@@ -377,13 +380,21 @@ func ToListCompletion(r api.ListResponse) ListCompletion {
 }
 
 // ToEmbeddingList converts an api.EmbedResponse to EmbeddingList
-func ToEmbeddingList(model string, r api.EmbedResponse) EmbeddingList {
+// encodingFormat can be "float", "base64", or empty (defaults to "float")
+func ToEmbeddingList(model string, r api.EmbedResponse, encodingFormat string) EmbeddingList {
 	if r.Embeddings != nil {
 		var data []Embedding
 		for i, e := range r.Embeddings {
+			var embedding any
+			if strings.EqualFold(encodingFormat, "base64") {
+				embedding = floatsToBase64(e)
+			} else {
+				embedding = e
+			}
+
 			data = append(data, Embedding{
 				Object:    "embedding",
-				Embedding: e,
+				Embedding: embedding,
 				Index:     i,
 			})
 		}
@@ -402,6 +413,13 @@ func ToEmbeddingList(model string, r api.EmbedResponse) EmbeddingList {
 	return EmbeddingList{}
 }
 
+// floatsToBase64 encodes a []float32 to a base64 string
+func floatsToBase64(floats []float32) string {
+	var buf bytes.Buffer
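+	// binary.Write into a bytes.Buffer cannot fail for fixed-size data such as []float32, so the error is safe to ignore.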
+	binary.Write(&buf, binary.LittleEndian, floats)
+	return base64.StdEncoding.EncodeToString(buf.Bytes())
+}
+
 // ToModel converts an api.ShowResponse to Model
 func ToModel(r api.ShowResponse, m string) Model {
 	return Model{
diff --git a/openai/openai_encoding_format_test.go b/openai/openai_encoding_format_test.go
new file mode 100644
index 00000000..0fd781fd
--- /dev/null
+++ b/openai/openai_encoding_format_test.go
@@ -0,0 +1,139 @@
+package openai
+
+import (
+	"encoding/base64"
+	"math"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+)
+
+func TestToEmbeddingList(t *testing.T) {
+	testCases := []struct {
+		name         string
+		embeddings   [][]float32
+		format       string
+		expectType   string // "float" or "base64"
+		expectBase64 []string
+		expectCount  int
+		promptEval   int
+	}{
+		{"float format", [][]float32{{0.1, -0.2, 0.3}}, "float", "float", nil, 1, 10},
+		{"base64 format", [][]float32{{0.1, -0.2, 0.3}}, "base64", "base64", []string{"zczMPc3MTL6amZk+"}, 1, 5},
+		{"default to float", [][]float32{{0.1, -0.2, 0.3}}, "", "float", nil, 1, 0},
+		{"invalid defaults to float", [][]float32{{0.1, -0.2, 0.3}}, "invalid", "float", nil, 1, 0},
+		{"multiple embeddings", [][]float32{{0.1, 0.2}, {0.3, 0.4}, {0.5, 0.6}}, "base64", "base64", []string{"zczMPc3MTD4=", "mpmZPs3MzD4=", "AAAAP5qZGT8="}, 3, 0},
+		{"empty embeddings", nil, "float", "", nil, 0, 0},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			resp := api.EmbedResponse{
+				Embeddings:      tc.embeddings,
+				PromptEvalCount: tc.promptEval,
+			}
+
+			result := ToEmbeddingList("test-model", resp, tc.format)
+
+			if tc.expectCount == 0 {
+				if len(result.Data) != 0 {
+					t.Errorf("expected 0 embeddings, got %d", len(result.Data))
+				}
+				return
+			}
+
+			if len(result.Data) != tc.expectCount {
+				t.Fatalf("expected %d embeddings, got %d", tc.expectCount, len(result.Data))
+			}
+
+			if result.Model != "test-model" {
+				t.Errorf("expected model 'test-model', got %q", result.Model)
+			}
+
+			// Check type of first embedding
+			switch tc.expectType {
+			case "float":
+				if _, ok := result.Data[0].Embedding.([]float32); !ok {
+					t.Errorf("expected []float32, got %T", result.Data[0].Embedding)
+				}
+			case "base64":
+				for i, data := range result.Data {
+					embStr, ok := data.Embedding.(string)
+					if !ok {
+						t.Errorf("embedding %d: expected string, got %T", i, data.Embedding)
+						continue
+					}
+
+					// Verify it's valid base64
+					if _, err := base64.StdEncoding.DecodeString(embStr); err != nil {
+						t.Errorf("embedding %d: invalid base64: %v", i, err)
+					}
+
+					// Compare against expected base64 string if provided
+					if tc.expectBase64 != nil && i < len(tc.expectBase64) {
+						if embStr != tc.expectBase64[i] {
+							t.Errorf("embedding %d: expected base64 %q, got %q", i, tc.expectBase64[i], embStr)
+						}
+					}
+				}
+			}
+
+			// Check indices
+			for i := range result.Data {
+				if result.Data[i].Index != i {
+					t.Errorf("embedding %d: expected index %d, got %d", i, i, result.Data[i].Index)
+				}
+			}
+
+			if tc.promptEval > 0 && result.Usage.PromptTokens != tc.promptEval {
+				t.Errorf("expected %d prompt tokens, got %d", tc.promptEval, result.Usage.PromptTokens)
+			}
+		})
+	}
+}
+
+func TestFloatsToBase64(t *testing.T) {
+	floats := []float32{0.1, -0.2, 0.3, -0.4, 0.5}
+
+	result := floatsToBase64(floats)
+
+	// Verify it's valid base64
+	decoded, err := base64.StdEncoding.DecodeString(result)
+	if err != nil {
+		t.Fatalf("failed to decode base64: %v", err)
+	}
+
+	// Check length
+	expectedBytes := len(floats) * 4
+	if len(decoded) != expectedBytes {
+		t.Errorf("expected %d bytes, got %d", expectedBytes, len(decoded))
+	}
+
+	// Decode and verify values
+	for i, expected := range floats {
+		offset := i * 4
+		bits := uint32(decoded[offset]) |
+			uint32(decoded[offset+1])<<8 |
+			uint32(decoded[offset+2])<<16 |
+			uint32(decoded[offset+3])<<24
+		decodedFloat := math.Float32frombits(bits)
+
+		if math.Abs(float64(decodedFloat-expected)) > 1e-6 {
+			t.Errorf("float[%d]: expected %f, got %f", i, expected, decodedFloat)
+		}
+	}
+}
+
+func TestFloatsToBase64_EmptySlice(t *testing.T) {
+	result := floatsToBase64([]float32{})
+
+	// Should return valid base64 for empty slice
+	decoded, err := base64.StdEncoding.DecodeString(result)
+	if err != nil {
+		t.Fatalf("failed to decode base64: %v", err)
+	}
+
+	if len(decoded) != 0 {
+		t.Errorf("expected 0 bytes, got %d", len(decoded))
+	}
+}
diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go
index a5e7eb33..87b43256 100644
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -384,6 +384,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()
 
 	var batch *llama.Batch
+	var numOutputs int
 
 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -446,7 +447,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}
 
-			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
+			output := i+1 == len(seq.inputs)
+			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), output, seq.cache.Id)
+			if output {
+				numOutputs++
+			}
+
 			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
 		}
@@ -463,6 +469,10 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
+	if numOutputs > 0 {
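+		// Wait for the backend only when this batch produced output logits; prompt-only batches skip the sync.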
+		s.lc.Synchronize()
+	}
+
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -476,10 +486,10 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 		// don't sample prompt processing
 		if len(seq.inputs) != 0 {
+			seq.processingDuration += time.Since(t)
 			continue
 		}
 
-		s.lc.Synchronize()
 		seq.numDecoded++
 		if seq.numDecoded > 1 {
 			seq.generationDuration += time.Since(t)
diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go
index a3ffc3bd..faab1b22 100644
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -235,15 +235,28 @@ func countCommonPrefix(a []*input.Input, b []*input.Input) int32 {
 	return count
 }
 
-// TODO(jessegross): If we need to reprocess the inputs we should ensure that
-// we don't split up a SameBatch
-func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
-	targetFree := (c.numCtx - numKeep) / 2
-	targetFree = max(targetFree, 1)
+// ShiftDiscard computes how many inputs should be discarded from the cache (after numKeep) to free roughly
+// half of the context window. Inputs that belong to the same batch (SameBatch) are discarded together so a
+// group is never split.
+func (c *InputCache) ShiftDiscard(inputs []*input.Input, numKeep int32) int32 {
+	targetFree := max((c.numCtx-numKeep)/2, 1)
+	currentFree := c.numCtx - int32(len(inputs))
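+	// Discard inputs after numKeep until enough of the context is free; once a SameBatch run starts it is
+	// discarded in full. For example, numCtx=2048, numKeep=5 and a full window of plain inputs gives
+	// targetFree=1021, so 1021 inputs are discarded (see the "Shift" test case).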
 
-	currentFree := c.numCtx - inputLen
+	var discard, sameBatch int32
+	for _, input := range inputs[numKeep:] {
+		if sameBatch <= 0 && currentFree >= targetFree {
+			break
+		}
 
-	return max(targetFree-currentFree, 0)
+		sameBatch--
+		currentFree++
+		discard++
+
+		if input.SameBatch > 0 {
+			sameBatch = int32(input.SameBatch)
+		}
+	}
+
+	return discard
 }
 
 type ErrReprocessInputs struct {
@@ -264,7 +277,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
 	}
 
 	inputLen := int32(len(slot.Inputs))
-	discard := c.ShiftDiscard(inputLen, numKeep)
+	discard := c.ShiftDiscard(slot.Inputs, numKeep)
 
 	if discard <= 0 {
 		return nil
diff --git a/runner/ollamarunner/cache_test.go b/runner/ollamarunner/cache_test.go
index c0693e83..d78727e7 100644
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -3,6 +3,7 @@ package ollamarunner
 import (
 	"errors"
 	"fmt"
+	"slices"
 	"testing"
 	"time"
 
@@ -238,59 +239,137 @@ func TestShiftDiscard(t *testing.T) {
 		name     string
 		numCtx   int32
 		numKeep  int32
-		inputLen int32
+		inputs   []*input.Input
 		expected int32
 	}{
 		{
 			name:     "Shift",
 			numCtx:   2048,
 			numKeep:  5,
-			inputLen: 2048,
+			inputs:   slices.Repeat([]*input.Input{{}}, 2048),
 			expected: 1021,
 		},
 		{
 			name:     "Max Keep",
 			numCtx:   2048,
 			numKeep:  2047,
-			inputLen: 2048,
+			inputs:   slices.Repeat([]*input.Input{{}}, 2048),
 			expected: 1,
 		},
 		{
 			name:     "No Keep",
 			numCtx:   2048,
 			numKeep:  0,
-			inputLen: 2048,
+			inputs:   slices.Repeat([]*input.Input{{}}, 2048),
 			expected: 1024,
 		},
 		{
 			name:     "Truncate",
 			numCtx:   2048,
 			numKeep:  5,
-			inputLen: 5000,
+			inputs:   slices.Repeat([]*input.Input{{}}, 5000),
 			expected: 3973,
 		},
 		{
 			name:     "Truncate Keep",
 			numCtx:   2048,
 			numKeep:  2047,
-			inputLen: 5000,
+			inputs:   slices.Repeat([]*input.Input{{}}, 5000),
 			expected: 2953,
 		},
 		{
 			name:     "No Op",
 			numCtx:   2048,
 			numKeep:  5,
-			inputLen: 512,
+			inputs:   slices.Repeat([]*input.Input{{}}, 512),
 			expected: 0,
 		},
+		{
+			name:    "Same Batch",
+			numCtx:  2048,
+			numKeep: 5,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for range 1024 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+
+				if !yield(&input.Input{SameBatch: 512 - 1}) {
+					return
+				}
+
+				for range 2048 - 1024 - 1 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+			}),
+			expected: 1531,
+		},
+		{
+			name:    "Same Batch Near Start",
+			numCtx:  2048,
+			numKeep: 5,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for range 10 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+
+				if !yield(&input.Input{SameBatch: 512 - 1}) {
+					return
+				}
+
+				for range 2048 - 10 - 1 {
+					if !yield(&input.Input{}) {
+						return
+					}
+				}
+			}),
+			expected: 1021,
+		},
+		{
+			name:   "Consecutive Same Batch",
+			numCtx: 32,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for i := range 32 {
+					input := input.Input{}
+					if i%10 == 0 {
+						input.SameBatch = 10 - 1
+					}
+					if !yield(&input) {
+						return
+					}
+				}
+			}),
+			expected: 20,
+		},
+		{
+			name:   "Overlapping Same Batch",
+			numCtx: 32,
+			inputs: slices.Collect(func(yield func(*input.Input) bool) {
+				for i := range 32 {
+					input := input.Input{}
+					if slices.Contains([]int{4, 8, 14}, i) {
+						input.SameBatch = 10 - 1
+					}
+					if !yield(&input) {
+						return
+					}
+				}
+			}),
+			expected: 24,
+		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			c := InputCache{numCtx: tt.numCtx}
-			result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
+			result := c.ShiftDiscard(tt.inputs, tt.numKeep)
 			if result != tt.expected {
-				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
+				t.Errorf("shiftDiscard(ctx: %v, keep: %v inputs: %v): have %v; want %v", tt.numCtx, tt.numKeep, len(tt.inputs), result, tt.expected)
 			}
 		})
 	}
diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go
index fbdc7d72..78ceb771 100644
--- a/runner/ollamarunner/multimodal.go
+++ b/runner/ollamarunner/multimodal.go
@@ -102,7 +102,7 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten
 	for i, t := range entry.mm {
 		if in == t.Tensor {
 			if !reserve {
-				return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
+				return ctx.Input().FromFloats(entry.data[i], t.Tensor.Shape()...), nil
 			} else {
 				return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
 			}
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 7b72bf92..153a3e57 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -214,7 +214,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
 		parts = []string{prompt}
 	}
 
-	postTokenize := false
 	for i, part := range parts {
 		// text - tokenize
 		tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
@@ -257,11 +256,10 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
 			mmStore.addMultimodal(imageEmbeddings)
 
 			inputs = append(inputs, &input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
-			postTokenize = true
 		}
 	}
 
-	if visionModel && postTokenize {
+	if visionModel {
 		var err error
 		inputs, err = multimodalProcessor.PostTokenize(inputs)
 		if err != nil {
@@ -599,7 +597,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
 
 	// Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute
 	batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs))
-	batch.Outputs = nextBatch.ctx.Input().FromIntSlice(batchOutputs, len(batchOutputs))
+	batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs))
 	nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch)
 	if err != nil {
 		err = fmt.Errorf("failed to build graph: %w", err)
@@ -692,7 +690,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
 	// At this point the seqs are ready for forwardBatch to move forward so unblock
 	s.mu.Unlock()
 
-	activeBatch.batch.Inputs.SetValueFromIntSlice(batchInputs)
+	activeBatch.batch.Inputs.FromInts(batchInputs)
 	activeBatch.ctx.ComputeWithNotify(
 		func() {
 			logutil.Trace("computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
@@ -1089,7 +1087,7 @@ func (s *Server) reserveWorstCaseGraph() error {
 		batch.Positions[i] = int32(i)
 	}
 
-	batch.Inputs = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))
+	batch.Inputs = ctx.Input().FromInts(batchInputs, len(batchInputs))
 	batch.Outputs = ctx.Input().Empty(ml.DTypeI32, s.parallel)
 
 	cache := s.model.Config().Cache
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index f1cd3fea..548545cb 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -84,11 +84,11 @@ function buildCPU() {
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
         New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
 
-        & cmake --fresh --preset CPU --install-prefix $script:DIST_DIR
+        & cmake -B build\cpu --preset CPU --install-prefix $script:DIST_DIR
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset CPU  --config Release --parallel $script:JOBS
+        & cmake --build build\cpu --target ggml-cpu --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component CPU --strip
+        & cmake --install build\cpu --component CPU --strip
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
 }
@@ -105,11 +105,11 @@ function buildCUDA11() {
             $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x}  }}
             write-host "Building CUDA v11 backend libraries $cuda"
             $env:CUDAToolkit_ROOT=$cuda
-            & cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
+            & cmake -B build\cuda_v11 --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 11"  --config Release --parallel $script:JOBS
+            & cmake --build build\cuda_v11 --target ggml-cuda --config Release --parallel $script:JOBS
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "CUDA" --strip
+            & cmake --install build\cuda_v11 --component "CUDA" --strip
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         }
     }
@@ -124,11 +124,11 @@ function buildCUDA12() {
             $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x}  }}
             write-host "Building CUDA v12 backend libraries $cuda"
             $env:CUDAToolkit_ROOT=$cuda
-            & cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
+            & cmake -B build\cuda_v12 --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 12"  --config Release --parallel $script:JOBS
+            & cmake --build build\cuda_v12 --target ggml-cuda --config Release --parallel $script:JOBS
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "CUDA" --strip
+            & cmake --install build\cuda_v12 --component "CUDA" --strip
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         }
     }
@@ -143,11 +143,11 @@ function buildCUDA13() {
             $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x}  }}
             $env:CUDAToolkit_ROOT=$cuda
             write-host "Building CUDA v13 backend libraries $cuda"
-            & cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
+            & cmake -B build\cuda_v13 --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 13"  --config Release --parallel $script:JOBS
+            & cmake --build build\cuda_v13 --target ggml-cuda --config Release --parallel $script:JOBS
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "CUDA" --strip
+            & cmake --install build\cuda_v13 --component "CUDA" --strip
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         }
     }
@@ -165,7 +165,7 @@ function buildROCm() {
             $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
             $env:HIP_PLATFORM="amd"
             $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-            & cmake --fresh --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
+            & cmake --fresh -B build\rocm --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
                 -DCMAKE_C_COMPILER=clang `
                 -DCMAKE_CXX_COMPILER=clang++ `
                 -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
@@ -175,9 +175,9 @@ function buildROCm() {
             $env:HIPCXX=""
             $env:HIP_PLATFORM=""
             $env:CMAKE_PREFIX_PATH=""
-            & cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS
+            & cmake --build build\rocm --target ggml-hip --config Release --parallel $script:JOBS
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --install build --component "HIP" --strip
+            & cmake --install build\rocm --component "HIP" --strip
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
             Remove-Item -Path $script:DIST_DIR\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
         }
diff --git a/server/create.go b/server/create.go
index 19f24ec8..4fdf4104 100644
--- a/server/create.go
+++ b/server/create.go
@@ -119,6 +119,27 @@ func (s *Server) CreateHandler(c *gin.Context) {
 				if err != nil {
 					ch <- gin.H{"error": err.Error()}
 				}
+
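+				// Inherit renderer and parser from the base model's config when the new model doesn't set them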
+				if err == nil && !remote && (config.Renderer == "" || config.Parser == "") {
+					manifest, mErr := ParseNamedManifest(fromName)
+					if mErr == nil && manifest.Config.Digest != "" {
+						configPath, pErr := GetBlobsPath(manifest.Config.Digest)
+						if pErr == nil {
+							if cfgFile, fErr := os.Open(configPath); fErr == nil {
+								var baseConfig ConfigV2
+								if decErr := json.NewDecoder(cfgFile).Decode(&baseConfig); decErr == nil {
+									if config.Renderer == "" {
+										config.Renderer = baseConfig.Renderer
+									}
+									if config.Parser == "" {
+										config.Parser = baseConfig.Parser
+									}
+								}
+								cfgFile.Close()
+							}
+						}
+					}
+				}
 			}
 		} else if r.Files != nil {
 			baseLayers, err = convertModelFromFiles(r.Files, baseLayers, false, fn)
diff --git a/server/routes.go b/server/routes.go
index 80c00cb6..5b4d5f5d 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -142,7 +142,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 
 	// This model is much more capable with a larger context, so set that
 	// unless it would penalize performance too much
-	if !s.lowVRAM && slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
+	if !s.lowVRAM && slices.Contains([]string{
+		"gptoss", "gpt-oss",
+		"qwen3vl", "qwen3vlmoe",
+	}, model.Config.ModelFamily) {
 		opts.NumCtx = max(opts.NumCtx, 8192)
 	}
 
@@ -289,6 +292,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			return
 		}
 
+		contentType := "application/json; charset=utf-8"
+		if req.Stream != nil && *req.Stream {
+			contentType = "application/x-ndjson"
+		}
+		c.Header("Content-Type", contentType)
+
 		return
 	}
 
@@ -1874,10 +1883,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			req.Options = map[string]any{}
 		}
 
-		msgs := append(m.Messages, req.Messages...)
-		if req.Messages[0].Role != "system" && m.System != "" {
-			msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
+		var msgs []api.Message
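+		// Only assemble the prompt history when messages were provided; an empty chat request (used to just load the model) skips it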
+		if len(req.Messages) > 0 {
+			msgs = append(m.Messages, req.Messages...)
+			if req.Messages[0].Role != "system" && m.System != "" {
+				msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
+			}
 		}
+
 		msgs = filterThinkTags(msgs, m)
 		req.Messages = msgs
 
@@ -1928,6 +1941,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			return
 		}
 
+		contentType := "application/json; charset=utf-8"
+		if req.Stream != nil && *req.Stream {
+			contentType = "application/x-ndjson"
+		}
+		c.Header("Content-Type", contentType)
+
 		return
 	}
 
diff --git a/server/routes_create_test.go b/server/routes_create_test.go
index f4f7b76c..909ebfe5 100644
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -188,6 +188,72 @@ func TestCreateFromModel(t *testing.T) {
 	})
 }
 
+func TestCreateFromModelInheritsRendererParser(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	p := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", p)
+	var s Server
+
+	const (
+		renderer = "custom-renderer"
+		parser   = "custom-parser"
+	)
+
+	_, digest := createBinFile(t, nil, nil)
+
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		Name:     "base",
+		Files:    map[string]string{"base.gguf": digest},
+		Renderer: renderer,
+		Parser:   parser,
+		Stream:   &stream,
+	})
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status code 200, actual %d", w.Code)
+	}
+
+	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+		Name:   "child",
+		From:   "base",
+		Stream: &stream,
+	})
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status code 200, actual %d", w.Code)
+	}
+
+	manifest, err := ParseNamedManifest(model.ParseName("child"))
+	if err != nil {
+		t.Fatalf("parse manifest: %v", err)
+	}
+	if manifest.Config.Digest == "" {
+		t.Fatalf("unexpected empty config digest for child manifest")
+	}
+
+	configPath, err := GetBlobsPath(manifest.Config.Digest)
+	if err != nil {
+		t.Fatalf("config blob path: %v", err)
+	}
+
+	cfgFile, err := os.Open(configPath)
+	if err != nil {
+		t.Fatalf("open config blob: %v", err)
+	}
+	defer cfgFile.Close()
+
+	var cfg ConfigV2
+	if err := json.NewDecoder(cfgFile).Decode(&cfg); err != nil {
+		t.Fatalf("decode config: %v", err)
+	}
+
+	if cfg.Renderer != renderer {
+		t.Fatalf("expected renderer %q, got %q", renderer, cfg.Renderer)
+	}
+	if cfg.Parser != parser {
+		t.Fatalf("expected parser %q, got %q", parser, cfg.Parser)
+	}
+}
+
 func TestCreateRemovesLayers(t *testing.T) {
 	gin.SetMode(gin.TestMode)
 
diff --git a/server/routes_debug_test.go b/server/routes_debug_test.go
index 466951a1..bf822c68 100644
--- a/server/routes_debug_test.go
+++ b/server/routes_debug_test.go
@@ -9,9 +9,9 @@ import (
 
 	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )
 
 func TestGenerateDebugRenderOnly(t *testing.T) {
@@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
diff --git a/server/routes_generate_renderer_test.go b/server/routes_generate_renderer_test.go
index ea18b1e5..e6473e08 100644
--- a/server/routes_generate_renderer_test.go
+++ b/server/routes_generate_renderer_test.go
@@ -12,9 +12,9 @@ import (
 	"github.com/google/go-cmp/cmp"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )
 
 // TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
@@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 					llama: &mock,
@@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 					llama: &mock,
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 75d4f012..4c6b934b 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -6,6 +6,8 @@ import (
 	"encoding/json"
 	"io"
 	"net/http"
+	"net/http/httptest"
+	"net/url"
 	"strings"
 	"sync"
 	"testing"
@@ -15,9 +17,9 @@ import (
 	"github.com/google/go-cmp/cmp"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )
 
 type mockRunner struct {
@@ -46,12 +48,92 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }
 
-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
 
+func TestGenerateChatRemote(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	rs := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != http.MethodPost {
+			t.Errorf("Expected POST request, got %s", r.Method)
+		}
+		if r.URL.Path != "/api/chat" {
+			t.Errorf("Expected path '/api/chat', got %s", r.URL.Path)
+		}
+
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		resp := api.ChatResponse{
+			Model:      "test",
+			Done:       true,
+			DoneReason: "load",
+		}
+		if err := json.NewEncoder(w).Encode(&resp); err != nil {
+			t.Fatal(err)
+		}
+	}))
+	defer rs.Close()
+
+	p, err := url.Parse(rs.URL)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	t.Setenv("OLLAMA_REMOTES", p.Hostname())
+	s := Server{}
+	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		Model:      "test-cloud",
+		RemoteHost: rs.URL,
+		From:       "test",
+		Info: map[string]any{
+			"capabilities": []string{"completion", "thinking"},
+		},
+		Stream: &stream,
+	})
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status 200, got %d", w.Code)
+	}
+
+	t.Run("missing messages", func(t *testing.T) {
+		w := createRequest(t, s.ChatHandler, api.ChatRequest{
+			Model: "test-cloud",
+		})
+		if w.Code != http.StatusOK {
+			t.Errorf("expected status 200, got %d", w.Code)
+		}
+
+		var actual api.ChatResponse
+		if err := json.NewDecoder(w.Body).Decode(&actual); err != nil {
+			t.Fatal(err)
+		}
+
+		if actual.Model != "test-cloud" {
+			t.Errorf("expected model test-cloud, got %s", actual.Model)
+		}
+
+		if actual.RemoteModel != "test" {
+			t.Errorf("expected remote model test, got %s", actual.RemoteModel)
+		}
+
+		if actual.RemoteHost != rs.URL {
+			t.Errorf("expected remote host '%s', got %s", rs.URL, actual.RemoteHost)
+		}
+
+		if !actual.Done {
+			t.Errorf("expected done true, got false")
+		}
+
+		if actual.DoneReason != "load" {
+			t.Errorf("expected done reason load, got %s", actual.DoneReason)
+		}
+	})
+}
+
 func TestGenerateChat(t *testing.T) {
 	gin.SetMode(gin.TestMode)
 
@@ -75,9 +157,9 @@ func TestGenerateChat(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -686,9 +768,9 @@ func TestGenerate(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -1111,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 				loaded:          make(map[string]*runnerRef),
 				newServerFn:     newMockServer(mock),
 				getGpuFn:        getGpuFn,
-				getCpuFn:        getCpuFn,
+				getSystemInfoFn: getSystemInfoFn,
 				waitForRecovery: 250 * time.Millisecond,
-				loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+				loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 					time.Sleep(time.Millisecond)
 					req.successCh <- &runnerRef{llama: mock}
 					return false
diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go
index caf2cf6d..8e58ad96 100644
--- a/server/routes_harmony_streaming_test.go
+++ b/server/routes_harmony_streaming_test.go
@@ -14,9 +14,9 @@ import (
 
 	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )
 
 func getTestTools() []api.Tool {
@@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
 					loaded:          make(map[string]*runnerRef),
 					newServerFn:     newMockServer(&mock),
 					getGpuFn:        getGpuFn,
-					getCpuFn:        getCpuFn,
+					getSystemInfoFn: getSystemInfoFn,
 					waitForRecovery: 100 * time.Millisecond,
-					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 						req.successCh <- &runnerRef{
 							llama: &mock,
 						}
@@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
 			loaded:          make(map[string]*runnerRef),
 			newServerFn:     newMockServer(&mock),
 			getGpuFn:        getGpuFn,
-			getCpuFn:        getCpuFn,
+			getSystemInfoFn: getSystemInfoFn,
 			waitForRecovery: 100 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
@@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
 					loaded:          make(map[string]*runnerRef),
 					newServerFn:     newMockServer(&mock),
 					getGpuFn:        getGpuFn,
-					getCpuFn:        getCpuFn,
+					getSystemInfoFn: getSystemInfoFn,
 					waitForRecovery: 250 * time.Millisecond,
-					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+					loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 						req.successCh <- &runnerRef{
 							llama: &mock,
 						}
diff --git a/server/sched.go b/server/sched.go
index 7c639953..1c04047e 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -5,12 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"reflect"
-	"runtime"
 	"slices"
 	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -52,12 +49,10 @@ type Scheduler struct {
 	activeLoading llm.LlamaServer
 	loaded        map[string]*runnerRef
 
-	loadFn      func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
-	newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn    func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList
-	getCpuFn    func() discover.GpuInfo
-
-	// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model
+	loadFn          func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
+	newServerFn     func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn        func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
+	getSystemInfoFn func() ml.SystemInfo
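+	// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model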
 	waitForRecovery time.Duration
 }
 
@@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:      make(chan any, maxQueue),
 		loaded:          make(map[string]*runnerRef),
 		newServerFn:     llm.NewLlamaServer,
-		getGpuFn:        discover.GetGPUInfo,
-		getCpuFn:        discover.GetCPUInfo,
+		getGpuFn:        discover.GPUDevices,
+		getSystemInfoFn: discover.GetSystemInfo,
 		waitForRecovery: 5 * time.Second,
 	}
 	sched.loadFn = sched.load
@@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) {
 }
 
 func (s *Scheduler) processPending(ctx context.Context) {
+	maxRunners := envconfig.MaxRunners()
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				s.loadedMu.Lock()
 				runner := s.loaded[pending.model.ModelPath]
 				loadedCount := len(s.loaded)
-				runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
+				runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
 				for _, r := range s.loaded {
 					runnersSnapshot = append(runnersSnapshot, r)
 				}
@@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
+				} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload()
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					var gpus discover.GpuInfoList
+					var gpus []ml.DeviceInfo
 					if pending.opts.NumGPU == 0 {
-						gpus = discover.GpuInfoList{s.getCpuFn()}
+						gpus = []ml.DeviceInfo{}
 					} else {
 						gpus = s.getGpuFn(ctx, runnersSnapshot)
 					}
-
-					if envconfig.MaxRunners() <= 0 {
-						// No user specified MaxRunners, so figure out what automatic setting to use
-						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
-						// if any GPU has unreliable free memory reporting, 1x the number of GPUs
-						allReliable := true
-						for _, gpu := range gpus {
-							if gpu.UnreliableFreeMemory {
-								allReliable = false
-								break
-							}
-						}
-						if allReliable {
-							// HACK
-							os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
-							slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus))
+					systemInfo := s.getSystemInfoFn()
+					if maxRunners <= 0 {
+						// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
+						if pending.opts.NumGPU == 0 {
+							// Need to get actual GPU list to set the correct default max models
+							g := s.getGpuFn(ctx, runnersSnapshot)
+							maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
 						} else {
-							// HACK
-							os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
-							slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+							maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1))
 						}
+						slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
 					}
 
 					// Load model for fitting
@@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						s.loadFn(pending, ggml, gpus, false)
+						s.loadFn(pending, ggml, systemInfo, gpus, false)
 						break
 					}
 
 					// More than one loaded model, so we have to see if the
 					// new one fits
 
-					needEvict := s.loadFn(pending, ggml, gpus, true)
+					needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
 					if !needEvict {
 						slog.Debug("new model fits with existing models, loading")
 						break
@@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 				runner.refMu.Unlock()
 			} else {
 				slog.Debug("starting background wait for VRAM recovery", "runner", runner)
-				runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
+				runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
 				for _, r := range s.loaded {
 					runnersSnapshot = append(runnersSnapshot, r)
 				}
@@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 
 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
-func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
+func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
 	numParallel := max(int(envconfig.NumParallel()), 1)
 
 	// Embedding models should always be loaded with parallel=1
@@ -403,11 +390,11 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		numParallel = 1
 	}
 
-	// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+	// `mllama`, `qwen3vl`, and `qwen3vlmoe` are snowflakes and use an encoder cache which cannot be used with num_parallel > 1
 	// ref: https://github.com/ollama/ollama/issues/4165
-	if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
+	if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe"}, req.model.Config.ModelFamily) && numParallel != 1 {
 		numParallel = 1
-		slog.Warn("mllama does not currently support parallel requests")
+		slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
 	}
 
 	sessionDuration := envconfig.KeepAlive()
@@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 
 	if llama == nil {
 		var err error
-		llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
+		llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 
 	s.loadedMu.Unlock()
 
-	gpuIDs, err := llama.Load(req.ctx, gpus, requireFull)
+	gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
 	if err != nil {
 		if errors.Is(err, llm.ErrLoadRequiredFull) {
+			if !requireFull {
+				// No other models loaded, yet we still don't fit, so report an error
+				slog.Info("model is too large for system memory", "requireFull", requireFull)
+				s.activeLoading.Close()
+				s.activeLoading = nil
+				req.errCh <- err
+			}
 			return true
 		}
 
@@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		return false
 	}
 
+	// Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown
+	discreteGPUs := false
+iGPUScan:
+	for _, devid := range gpuIDs {
+		for _, dev := range gpus {
+			if dev.DeviceID == devid {
+				if !dev.Integrated {
+					discreteGPUs = true
+					break iGPUScan
+				}
+			}
+		}
+	}
+
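+	// sameBatch counts the remaining inputs that must be discarded along with the current batch so a batch is never split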
 	runner := &runnerRef{
 		model:           req.model,
 		modelPath:       req.model.ModelPath,
@@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		Options:         &req.opts,
 		sessionDuration: sessionDuration,
 		gpus:            gpuIDs,
+		discreteGPUs:    discreteGPUs,
 		vramSize:        llama.VRAMSize(),
 		totalSize:       llama.TotalSize(),
 		loading:         true,
@@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 	return false
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
+	if len(allGpus) == 0 {
+		return
+	}
 	predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners
 	s.loadedMu.Lock()
 	runners := make([]*runnerRef, 0, len(s.loaded))
@@ -554,12 +566,13 @@ type runnerRef struct {
 	refMu    sync.Mutex
 	refCount uint // prevent unloading if > 0
 
-	llama     llm.LlamaServer
-	pid       int
-	loading   bool          // True only during initial load, then false forever
-	gpus      []ml.DeviceID // Recorded at time of provisioning
-	vramSize  uint64
-	totalSize uint64
+	llama        llm.LlamaServer
+	pid          int
+	loading      bool          // True only during initial load, then false forever
+	gpus         []ml.DeviceID // Recorded at time of provisioning
+	discreteGPUs bool          // True if any assigned device is a discrete GPU - used to skip VRAM recovery checks for iGPU-only loads
+	vramSize     uint64
+	totalSize    uint64
 
 	sessionDuration time.Duration
 	expireTimer     *time.Timer
@@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 // a before and after GPU memory allocation.  The returned channel
 // will be notified when we're done waiting, or have timed out and should
 // proceed anyway
-func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.FilteredRunnerDiscovery) chan any {
+func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any {
 	finished := make(chan any, 1)
 
-	// CPU or Metal don't need checking, so no waiting required
-	// windows can page VRAM, only cuda currently can report accurate used vram usage
-	if len(runner.gpus) == 0 ||
-		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "Metal")) ||
-		(runtime.GOOS == "windows" && runner.gpus[0].Library != "CUDA") {
+	// CPU, Metal and iGPUs don't need checking, so no waiting required
+	if len(runner.gpus) == 0 || !runner.discreteGPUs ||
+		(len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") {
 		finished <- struct{}{}
 		slog.Debug("no need to wait for VRAM recovery", "runner", runner)
 		return finished
@@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi
 					totalMemoryNow += gpu.TotalMemory
 					freeMemoryNow += gpu.FreeMemory
 				}
-				logutil.Trace("gpu VRAM convergence", "percent", int(max(float32(freeMemoryNow-freeMemoryBefore), 0.0)/float32(runner.vramSize)*100))
+				if freeMemoryNow > freeMemoryBefore {
+					logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100))
+				} else {
+					logutil.Trace("gpu VRAM convergence", "percent", 0)
+				}
 				// If we're within ~75% of the estimated memory usage recovered, bail out
 				if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 {
 					slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)
diff --git a/server/sched_test.go b/server/sched_test.go
index 66d43338..316a817f 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -13,7 +13,6 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
@@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := discover.GpuInfoList{}
-	s.load(req, f, gpus, false)
+	gpus := []ml.DeviceInfo{}
+	systemInfo := ml.SystemInfo{}
+	s.load(req, f, systemInfo, gpus, false)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
@@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
@@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) {
 
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -106,7 +106,7 @@ type reqBundle struct {
 	f       *ggml.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	scenario.srv.modelPath = model
 	return scenario.srv, nil
 }
@@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
 	return b
 }
 
-func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
+func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
 	slog.Info("test getGpuFn called", "runners", runners)
-	g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []discover.GpuInfo{g}
+	return []ml.DeviceInfo{g}
 }
 
-func getCpuFn() discover.GpuInfo {
-	slog.Info("test getCpuFn called")
-	g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
-	g.TotalMemory = 32 * format.GigaByte
-	g.FreeMemory = 26 * format.GigaByte
-	return g
+func getSystemInfoFn() ml.SystemInfo {
+	slog.Info("test getSystemInfoFn called")
+	return ml.SystemInfo{
+		TotalMemory: 32 * format.GigaByte,
+		FreeMemory:  26 * format.GigaByte,
+	}
 }
 
 func TestSchedRequestsSameModelSameRequest(t *testing.T) {
@@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
 	b.req.model = a.req.model
@@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	tmpModel := *a.req.model
@@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	a.ctxDone()
 	// Report recovered VRAM usage
 	time.Sleep(1 * time.Millisecond)
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		slog.Info("XXX altered getGpuFn called")
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
+		slog.Info("altered getGpuFn called")
+		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 24 * format.GigaByte
-		return []discover.GpuInfo{g}
+		return []ml.DeviceInfo{g}
 	}
 	select {
 	case resp := <-b.req.successCh:
@@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 }
 
 func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	slog.Info("TestSchedRequestsMultipleLoadedModels")
+	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	s.getGpuFn = getGpuFn // 1 metal GPU
-	s.getCpuFn = getCpuFn // 1 CPU
+	s.getGpuFn = getGpuFn // 1 Metal GPU
+	s.getSystemInfoFn = getSystemInfoFn
 
 	// Multiple loaded models
-	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
+	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
 	a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
-	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
+	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
 	b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
 	c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
 	c.req.opts.NumGPU = 0                                                                                                                         // CPU load, will be allowed
 	b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond}                                                                        // longer than b to cause the scheduler to favor unloading b over c
-	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
+	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded
 
-	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 	s.newServerFn = a.newServer
-	slog.Info("a")
+	slog.Info("Loading A")
 	s.pendingReqCh <- a.req
 	s.Run(ctx)
 	select {
@@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 
 	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
 	s.newServerFn = b.newServer
-	slog.Info("b")
+	slog.Info("Loading B")
 	s.pendingReqCh <- b.req
 	select {
 	case resp := <-b.req.successCh:
@@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 
 	// This is a CPU load with NumGPU = 0 so it should load
 	s.newServerFn = c.newServer
-	slog.Info("c")
+	slog.Info("Loading C")
 	s.pendingReqCh <- c.req
 	select {
 	case resp := <-c.req.successCh:
@@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	case err := <-c.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
+		slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
 		t.Fatal("timeout")
 	}
 	s.loadedMu.Lock()
@@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	b.ctxDone()
 	// Report recovered VRAM usage so scheduler will finish waiting and unload
 	time.Sleep(1 * time.Millisecond)
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
+		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 24 * format.GigaByte
-		return []discover.GpuInfo{g}
+		return []ml.DeviceInfo{g}
 	}
 	select {
 	case resp := <-d.req.successCh:
@@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	s.newServerFn = a.newServer
 	slog.Info("a")
 	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
@@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) {
 	}
 
 	var f *ggml.GGML
-	gpus := discover.GpuInfoList{}
+	gpus := []ml.DeviceInfo{}
+	systemInfo := ml.SystemInfo{}
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 
 	select {
 	case err := <-req.errCh:
@@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) {
 
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
 func TestSchedPrematureExpired(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
 	defer done()
 
 	// Same model, same request
-	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
+	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
-		g.TotalMemory = 24 * format.GigaByte
-		g.FreeMemory = 12 * format.GigaByte
-		return []discover.GpuInfo{g}
-	}
+	s.getGpuFn = getGpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)
@@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) {
 func TestSchedUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
-	gpus := discover.GpuInfoList{
+	gpus := []ml.DeviceInfo{
 		{
 			DeviceID: ml.DeviceID{
 				ID: "1",
@@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string {
 	return s.modelPath
 }
 
-func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
+func (s *mockLlm) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
 	if requireFull {
+		if len(gpus) == 0 {
+			slog.Info("mockLlm.Load CPU based load")
+			return nil, nil
+		}
 		for _, g := range gpus {
 			if g.FreeMemory >= s.vramSize {
 				return []ml.DeviceID{g.DeviceID}, nil
diff --git a/tools/tools.go b/tools/tools.go
index f9a2d3b9..7b8d726b 100644
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -125,7 +125,7 @@ func (p *Parser) parseToolCall() *api.ToolCall {
 	}
 
 	var args map[string]any
-	if found, i := findArguments(p.buffer); found == nil {
+	if found, i := findArguments(tool, p.buffer); found == nil {
 		return nil
 	} else {
 		args = found
@@ -219,7 +219,7 @@ func findTool(tools []api.Tool, buf []byte) (*api.Tool, int) {
 // objects for functions that have all-optional parameters
 // e.g. `{"name": "get_conditions", "arguments": {}}` will work but
 // `{"name": "get_conditions"}` will not currently work
-func findArguments(buffer []byte) (map[string]any, int) {
+func findArguments(tool *api.Tool, buffer []byte) (map[string]any, int) {
 	if len(buffer) == 0 {
 		return nil, 0
 	}
@@ -269,27 +269,30 @@ func findArguments(buffer []byte) (map[string]any, int) {
 
 				var findObject func(obj map[string]any) (map[string]any, bool)
 				findObject = func(obj map[string]any) (map[string]any, bool) {
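+					// findMap returns obj[name] as a map, decoding it from a JSON string if necessary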
+					findMap := func(name string, obj map[string]any) (map[string]any, bool) {
+						if args, ok := obj[name].(map[string]any); ok {
+							return args, true
+						}
+						if argsStr, ok := obj[name].(string); ok {
+							var argsData map[string]interface{}
+							if err := json.Unmarshal([]byte(argsStr), &argsData); err == nil {
+								return argsData, ok
+							}
+						}
+						return nil, false
+					}
 					if _, hasName := obj["name"]; hasName {
-						if args, ok := obj["arguments"].(map[string]any); ok {
+						if args, ok := findMap("arguments", obj); ok {
 							return args, true
 						}
-						if argsStr, ok := obj["arguments"].(string); ok {
-							var argsData map[string]interface{}
-							if err := json.Unmarshal([]byte(argsStr), &argsData); err == nil {
-								return argsData, ok
-							}
-						}
-						if args, ok := obj["parameters"].(map[string]any); ok {
+						if args, ok := findMap("parameters", obj); ok {
 							return args, true
 						}
-						if argsStr, ok := obj["parameters"].(string); ok {
-							var argsData map[string]interface{}
-							if err := json.Unmarshal([]byte(argsStr), &argsData); err == nil {
-								return argsData, ok
-							}
-						}
 						return nil, true
 					}
+					if args, ok := findMap(tool.Function.Name, obj); ok {
+						return args, true
+					}
 
 					for _, v := range obj {
 						switch child := v.(type) {
diff --git a/tools/tools_test.go b/tools/tools_test.go
index 288fa73c..b849e219 100644
--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@@ -1033,6 +1033,7 @@ func TestFindArguments(t *testing.T) {
 		name   string
 		buffer []byte
 		want   map[string]any
+		tool   string
 	}{
 		{
 			name:   "empty string",
@@ -1290,11 +1291,29 @@ func TestFindArguments(t *testing.T) {
 				"location": "San Francisco, CA",
 			},
 		},
+		{
+			name:   "simple tool call",
+			tool:   "get_temperature",
+			buffer: []byte(`{"get_temperature": {"format": "fahrenheit", "location": "San Francisco, CA"}}`),
+			want: map[string]any{
+				"format":   "fahrenheit",
+				"location": "San Francisco, CA",
+			},
+		},
+		{
+			name:   "stringified simple tool call",
+			tool:   "get_temperature",
+			buffer: []byte(`{"get_temperature": "{\"format\": \"fahrenheit\", \"location\": \"San Francisco, CA\"}"}`),
+			want: map[string]any{
+				"format":   "fahrenheit",
+				"location": "San Francisco, CA",
+			},
+		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, _ := findArguments(tt.buffer)
+			got, _ := findArguments(&api.Tool{Function: api.ToolFunction{Name: tt.tool}}, tt.buffer)
 
 			if diff := cmp.Diff(got, tt.want); diff != "" {
 				t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff)