diff --git a/README.md b/README.md index eb11483c..03cb725c 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ support lists. Explore its through self-build as guided on the wiki. curl -fsSL https://ollama.com/install.sh | sh ``` -[Manual install instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md) +[Manual install instructions](https://docs.ollama.com/linux#manual-install) [Configuring Environment Variables Tip For Unsupport GPUs](https://github.com/likelovewant/ollama-for-amd/wiki#troubleshooting-amd-gpu-support-in-linux) @@ -132,7 +132,7 @@ Ollama supports importing GGUF models in the Modelfile: ### Import from Safetensors -See the [guide](docs/import.md) on importing models for more information. +See the [guide](https://docs.ollama.com/import) on importing models for more information. ### Customize a prompt @@ -165,7 +165,7 @@ ollama run mario Hello! It's your friend Mario. ``` -For more information on working with a Modelfile, see the [Modelfile](docs/modelfile.md) documentation. +For more information on working with a Modelfile, see the [Modelfile](https://docs.ollama.com/modelfile) documentation. ## CLI Reference diff --git a/cmd/cmd.go b/cmd/cmd.go index 369a27a4..1050d7b0 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -280,6 +280,13 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error { return err } + if info, err := client.Show(cmd.Context(), &api.ShowRequest{Model: opts.Model}); err != nil { + return err + } else if info.RemoteHost != "" { + // Cloud model, no need to load/unload + return nil + } + req := &api.GenerateRequest{ Model: opts.Model, KeepAlive: opts.KeepAlive, @@ -720,23 +727,21 @@ func DeleteHandler(cmd *cobra.Command, args []string) error { return err } - // Unload the model if it's running before deletion - opts := &runOptions{ - Model: args[0], - KeepAlive: &api.Duration{Duration: 0}, - } - if err := loadOrUnloadModel(cmd, opts); err != nil { - if !strings.Contains(strings.ToLower(err.Error()), "not found") { - fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0]) + for _, arg := range args { + // Unload the model if it's running before deletion + if err := loadOrUnloadModel(cmd, &runOptions{ + Model: args[0], + KeepAlive: &api.Duration{Duration: 0}, + }); err != nil { + if !strings.Contains(strings.ToLower(err.Error()), "not found") { + fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0]) + } } - } - for _, name := range args { - req := api.DeleteRequest{Name: name} - if err := client.Delete(cmd.Context(), &req); err != nil { + if err := client.Delete(cmd.Context(), &api.DeleteRequest{Name: arg}); err != nil { return err } - fmt.Printf("deleted '%s'\n", name) + fmt.Printf("deleted '%s'\n", arg) } return nil } diff --git a/discover/runner.go b/discover/runner.go index e74050d0..caaef222 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -335,11 +335,14 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
ctx, cancel := context.WithTimeout(ctx, 3*time.Second) defer cancel() + // Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct + devFilter := ml.GetVisibleDevicesEnv(devices) + for dir := range libDirs { - updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil) + updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, devFilter) for _, u := range updatedDevices { for i := range devices { - if u.DeviceID == devices[i].DeviceID { + if u.DeviceID == devices[i].DeviceID && u.PCIID == devices[i].PCIID { updated[i] = true devices[i].FreeMemory = u.FreeMemory break diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index c0ca068a..909104c6 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -895,8 +895,8 @@ func (f GGML) FlashAttention() bool { return slices.Contains([]string{ "gemma3", "gptoss", "gpt-oss", - "qwen3", - "qwen3moe", + "qwen3", "qwen3moe", + "qwen3vl", "qwen3vlmoe", }, f.KV().String("general.architecture")) } diff --git a/integration/api_test.go b/integration/api_test.go index 48572085..39eea39c 100644 --- a/integration/api_test.go +++ b/integration/api_test.go @@ -381,126 +381,3 @@ func TestAPIShowModel(t *testing.T) { t.Errorf("%s missing modified_at: %#v", modelName, resp) } } - -func TestAPIEmbeddings(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() - req := api.EmbeddingRequest{ - Model: libraryEmbedModels[0], - Prompt: "why is the sky blue?", - Options: map[string]interface{}{ - "temperature": 0, - "seed": 123, - }, - } - - if err := PullIfMissing(ctx, client, req.Model); err != nil { - t.Fatalf("pull failed %s", err) - } - - resp, err := client.Embeddings(ctx, &req) - if err != nil { - t.Fatalf("embeddings call failed %s", err) - } - if len(resp.Embedding) == 0 { - t.Errorf("zero length embedding response") - } -} - -func TestAPIToolCalling(t *testing.T) { - initialTimeout := 60 * time.Second - streamTimeout := 30 * time.Second - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - defer cancel() - - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() - - modelName := "qwen3:0.6b" - if err := PullIfMissing(ctx, client, modelName); err != nil { - t.Fatalf("pull failed %s", err) - } - - tools := []api.Tool{ - { - Type: "function", - Function: api.ToolFunction{ - Name: "get_weather", - Description: "Get the current weather in a given location", - Parameters: api.ToolFunctionParameters{ - Type: "object", - Required: []string{"location"}, - Properties: map[string]api.ToolProperty{ - "location": { - Type: api.PropertyType{"string"}, - Description: "The city and state, e.g. 
San Francisco, CA", - }, - }, - }, - }, - }, - } - - req := api.ChatRequest{ - Model: modelName, - Messages: []api.Message{ - { - Role: "user", - Content: "Call get_weather with location set to San Francisco.", - }, - }, - Tools: tools, - Options: map[string]any{ - "temperature": 0, - }, - } - - stallTimer := time.NewTimer(initialTimeout) - var gotToolCall bool - var lastToolCall api.ToolCall - - fn := func(response api.ChatResponse) error { - if len(response.Message.ToolCalls) > 0 { - gotToolCall = true - lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] - } - if !stallTimer.Reset(streamTimeout) { - return fmt.Errorf("stall was detected while streaming response, aborting") - } - return nil - } - - stream := true - req.Stream = &stream - done := make(chan int) - var genErr error - go func() { - genErr = client.Chat(ctx, &req, fn) - done <- 0 - }() - - select { - case <-stallTimer.C: - t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) - case <-done: - if genErr != nil { - t.Fatalf("chat failed: %v", genErr) - } - - if !gotToolCall { - t.Fatalf("expected at least one tool call, got none") - } - - if lastToolCall.Function.Name != "get_weather" { - t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") - } - - if _, ok := lastToolCall.Function.Arguments["location"]; !ok { - t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) - } - case <-ctx.Done(): - t.Error("outer test context done while waiting for tool-calling chat") - } -} diff --git a/integration/tools_test.go b/integration/tools_test.go new file mode 100644 index 00000000..d6b8dfa5 --- /dev/null +++ b/integration/tools_test.go @@ -0,0 +1,132 @@ +//go:build integration + +package integration + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestAPIToolCalling(t *testing.T) { + initialTimeout := 60 * time.Second + streamTimeout := 60 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + minVRAM := map[string]uint64{ + "qwen3-vl": 16, + "gpt-oss:20b": 16, + "gpt-oss:120b": 70, + "qwen3": 6, + "llama3.1": 8, + "llama3.2": 4, + "mistral": 6, + "qwen2.5": 6, + "qwen2": 6, + "mistral-nemo": 9, + "mistral-small": 16, + "mixtral:8x22b": 80, + "qwq": 20, + "granite3.3": 7, + } + + for _, model := range libraryToolsModels { + t.Run(model, func(t *testing.T) { + if v, ok := minVRAM[model]; ok { + skipUnderMinVRAM(t, v) + } + + if err := PullIfMissing(ctx, client, model); err != nil { + t.Fatalf("pull failed %s", err) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather in a given location", + Parameters: api.ToolFunctionParameters{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]api.ToolProperty{ + "location": { + Type: api.PropertyType{"string"}, + Description: "The city and state, e.g. 
San Francisco, CA", + }, + }, + }, + }, + }, + } + + req := api.ChatRequest{ + Model: model, + Messages: []api.Message{ + { + Role: "user", + Content: "Call get_weather with location set to San Francisco.", + }, + }, + Tools: tools, + Options: map[string]any{ + "temperature": 0, + }, + } + + stallTimer := time.NewTimer(initialTimeout) + var gotToolCall bool + var lastToolCall api.ToolCall + + fn := func(response api.ChatResponse) error { + if len(response.Message.ToolCalls) > 0 { + gotToolCall = true + lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] + } + if !stallTimer.Reset(streamTimeout) { + return fmt.Errorf("stall was detected while streaming response, aborting") + } + return nil + } + + stream := true + req.Stream = &stream + done := make(chan int) + var genErr error + go func() { + genErr = client.Chat(ctx, &req, fn) + done <- 0 + }() + + select { + case <-stallTimer.C: + t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) + case <-done: + if genErr != nil { + t.Fatalf("chat failed: %v", genErr) + } + + if !gotToolCall { + t.Fatalf("expected at least one tool call, got none") + } + + if lastToolCall.Function.Name != "get_weather" { + t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") + } + + if _, ok := lastToolCall.Function.Arguments["location"]; !ok { + t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for tool-calling chat") + } + }) + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index c0bac5e1..8a362408 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -260,6 +260,22 @@ var ( "snowflake-arctic-embed", "snowflake-arctic-embed2", } + libraryToolsModels = []string{ + "qwen3-vl", + "gpt-oss:20b", + "gpt-oss:120b", + "qwen3", + "llama3.1", + "llama3.2", + "mistral", + "qwen2.5", + "qwen2", + "mistral-nemo", + "mistral-small", + "mixtral:8x22b", + "qwq", + "granite3.3", + } blueSkyPrompt = "why is the sky blue? Be brief but factual in your reply" blueSkyExpected = []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength", "interact"} diff --git a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch deleted file mode 100644 index 85cba5b3..00000000 --- a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Oliver Simons -Date: Tue, 22 Jul 2025 11:02:28 +0200 -Subject: [PATCH] Enable CUDA Graphs for gemma3n. - -Similar to -https://github.com/ggml-org/llama.cpp/pull/14741, -though ollama has a slightly different model graph -than llama.cpp which requires different workaround -checks. ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 5b852f69..827e3205 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2689,14 +2689,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - -+ // This fix was added in llama.cpp and Ollama in parallel, but with -+ // different tensor names. 
-+ // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 -+ // ollama: https://github.com/ollama/ollama/pull/11525 -+ -+ const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; -+ const std::string gemma3n_node_name_ollama = "node_"; -+ - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; - const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; -+ -+ const std::string ffn_moe_bias_suffix = "_exps.bias"; -+ - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; - const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; - const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; - const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; - const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - -+ - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - -@@ -2720,6 +2732,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && -+ // ollama -+ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n -+ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here -+ !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && -+ node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && -+ // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && diff --git a/llama/patches/0019-ggml-Add-batch-size-hint.patch b/llama/patches/0019-ggml-Add-batch-size-hint.patch new file mode 100644 index 00000000..76d61e2d --- /dev/null +++ b/llama/patches/0019-ggml-Add-batch-size-hint.patch @@ -0,0 +1,300 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Tue, 28 Oct 2025 17:36:54 -0700 +Subject: [PATCH] ggml: Add batch size hint + +Some operations use heuristics to determine the batch size, which +affects offloading decisions. However, these are not always +accurate when looking at single operations. This provides an +explicit signal on the batch size from higher layers to ensure +consistent performance. 
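As a rough illustration of how this hint is meant to be used (not part of the patch itself): a caller that already knows the token count for the current batch can pass it down before computing the graph. The sketch below assumes the `ml.Context.SetBatchSize` method added later in this diff; `forwardBatch` is a hypothetical helper.

```go
package example

import "github.com/ollama/ollama/ml"

// forwardBatch is a hypothetical helper: the batch size is known exactly at
// this layer, so pass it down instead of letting the scheduler guess it from
// tensor shapes. If the hint is left unset, the scheduler falls back to its
// existing heuristics.
func forwardBatch(ctx ml.Context, inputs []int32, logits ml.Tensor) {
	ctx.SetBatchSize(len(inputs)) // forwarded to ggml_backend_sched_set_batch_size

	ctx.Forward(logits).Compute(logits)
}
```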
+--- + ggml/include/ggml-backend.h | 5 ++- + ggml/src/ggml-backend-impl.h | 4 +-- + ggml/src/ggml-backend.cpp | 19 +++++++---- + ggml/src/ggml-blas/ggml-blas.cpp | 3 +- + ggml/src/ggml-cpu/ggml-cpu.cpp | 4 ++- + ggml/src/ggml-cuda/ggml-cuda.cu | 48 +++++++++++++++++----------- + ggml/src/ggml-metal/ggml-metal.cpp | 4 ++- + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +- + 8 files changed, 58 insertions(+), 32 deletions(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 229bf387b..2763f2bd6 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -98,7 +98,7 @@ extern "C" { + + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); +- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // NOTE: will be removed, use device version instead + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); +@@ -307,6 +307,9 @@ extern "C" { + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + ++ // Provide a hint on the batch size to optimize processing (uses heuristics if unset) ++ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); ++ + // Initialize backend buffers from a measure graph + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success + +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index 6792ba986..0f5b03cef 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -106,8 +106,8 @@ extern "C" { + // compute the graph with the plan + enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + +- // compute graph (always async if supported by the backend) +- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ // compute graph (always async if supported by the backend). 
batch_size may be -1 if unknown ++ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // (optional) event synchronization + // record an event on this stream +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index cb2b99562..41eef3b5f 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba + } + + enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ++ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); + ggml_backend_synchronize(backend); + return err; + } + +-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + GGML_ASSERT(backend); +- return backend->iface.graph_compute(backend, cgraph); ++ return backend->iface.graph_compute(backend, cgraph, batch_size); + } + + bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +@@ -722,6 +722,8 @@ struct ggml_backend_sched { + + bool op_offload; + ++ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics ++ + int debug; + }; + +@@ -814,7 +816,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st + if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); + // check if a backend with higher prio wants to offload the op +- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { ++ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); +@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + } + + if (!sched->callback_eval) { +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); + +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new( + + sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->op_offload = op_offload; ++ sched->batch_size = -1; + + ggml_backend_sched_reset(sched); + +@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { + free(sched); + } + ++void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { ++ 
sched->batch_size = batch_size; ++} ++ + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + // reset state for the next run +diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp +index 5b888cdd8..88d088952 100644 +--- a/ggml/src/ggml-blas/ggml-blas.cpp ++++ b/ggml/src/ggml-blas/ggml-blas.cpp +@@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { + delete backend; + } + +-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { +@@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); ++ GGML_UNUSED(batch_size); + } + + static struct ggml_backend_i blas_backend_i = { +diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp +index 3191faaa4..32f14c811 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu.cpp ++++ b/ggml/src/ggml-cpu/ggml-cpu.cpp +@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe + GGML_UNUSED(backend); + } + +-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); +@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + + return ggml_graph_compute(cgraph, &cplan); ++ ++ GGML_UNUSED(batch_size); + } + + static const struct ggml_backend_i ggml_backend_cpu_i = { +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 5b852f690..c555cd30f 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2684,7 +2684,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + + #ifdef USE_CUDA_GRAPH + static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +- bool use_cuda_graph) { ++ int batch_size, bool use_cuda_graph) { + + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); +@@ -2718,24 +2718,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && +- node->src[1] && node->src[1]->ne[1] > 1 && +- (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && +- (node->src[1] ? 
node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && +- strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && +- strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && +- strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { +- // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation +- // by means of matching node names. See +- // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and +- // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, +- // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. +- use_cuda_graph = false; ++ // If we have an explicit batch size hint then we don't need to use the tensor name heuristics ++ if (batch_size >= 0) { ++ if (batch_size > 1) { ++ use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); + #endif ++ } ++ } else { ++ if (node->op == GGML_OP_ADD && ++ node->src[1] && node->src[1]->ne[1] > 1 && ++ (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && ++ (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && ++ strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && ++ strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && ++ strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { ++ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation ++ // by means of matching node names. See ++ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and ++ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
++ use_cuda_graph = false; ++#ifndef NDEBUG ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++#endif ++ } + } + + if (node->op == GGML_OP_CPY) { +@@ -3132,7 +3142,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx + } + } + +-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); +@@ -3170,7 +3180,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, + if (use_cuda_graph) { + cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); + +- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); ++ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); + + // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. + if (use_cuda_graph && cuda_graph_update_required) { +diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp +index f2ff9f322..05ff6a5a6 100644 +--- a/ggml/src/ggml-metal/ggml-metal.cpp ++++ b/ggml/src/ggml-metal/ggml-metal.cpp +@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml + GGML_UNUSED(dst); + } + +-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_metal_t ctx = (ggml_metal_t)backend->context; + + return ggml_metal_graph_compute(ctx, cgraph); ++ ++ GGML_UNUSED(batch_size); + } + + static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index ed83236f4..bd3ece516 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -12015,7 +12015,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru + return num_adds; + } + +-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + +@@ -12211,6 +12211,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg + return GGML_STATUS_SUCCESS; + + UNUSED(backend); ++ UNUSED(batch_size); + } + + // Sort the graph for improved parallelism. diff --git a/llama/patches/0022-ggml-No-alloc-mode.patch b/llama/patches/0022-ggml-No-alloc-mode.patch index 019cb886..d03c6c84 100644 --- a/llama/patches/0022-ggml-No-alloc-mode.patch +++ b/llama/patches/0022-ggml-No-alloc-mode.patch @@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data. 
5 files changed, 310 insertions(+), 44 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 229bf387..1ff53ed0 100644 +index 2763f2bd6..b3b5b356a 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -305,6 +305,7 @@ extern "C" { @@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644 + GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - // Initialize backend buffers from a measure graph + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h -index 6792ba98..3c3f22fc 100644 +index 0f5b03cef..7bdf9d81f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -26,12 +26,17 @@ extern "C" { @@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644 struct ggml_backend { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index cb2b9956..6ef5eeaf 100644 +index 41eef3b5f..c81a2e48a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644 void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); -@@ -723,6 +743,12 @@ struct ggml_backend_sched { - bool op_offload; +@@ -725,6 +745,12 @@ struct ggml_backend_sched { + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics int debug; + @@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644 }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) -@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new( size_t graph_size, bool parallel, bool op_offload) { @@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644 GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); -@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } @@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644 sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; + sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); -@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { +@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); } @@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644 } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); -@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * +@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * return false; } @@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644 ggml_backend_sched_reset(sched); return true; -@@ -1813,7 +1875,13 @@ size_t 
ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, +@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); @@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh -index e0abde54..28d6bcd7 100644 +index e0abde542..28d6bcd71 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -35,6 +35,31 @@ @@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644 + } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 827e3205..811462c7 100644 +index f4d4a4267..ac70dcac8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -540,7 +541,7 @@ index 827e3205..811462c7 100644 }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { -@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, +@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { @@ -548,7 +549,7 @@ index 827e3205..811462c7 100644 // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; -@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } @@ -560,15 +561,15 @@ index 827e3205..811462c7 100644 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); if (!disable_fusion) { -@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx - static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + cuda_ctx->pool_set_alloc(true); ggml_cuda_set_device(cuda_ctx->device); -@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, +@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; } @@ -640,7 +641,7 @@ index 827e3205..811462c7 100644 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { +@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, /* .graph_optimize = */ NULL, diff --git 
a/llama/patches/0032-interleave-multi-rope.patch b/llama/patches/0032-interleave-multi-rope.patch new file mode 100644 index 00000000..eb41639e --- /dev/null +++ b/llama/patches/0032-interleave-multi-rope.patch @@ -0,0 +1,113 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Web, 16 Oct 2025 20:37:19 -0700 +Subject: [PATCH] interleave multi rope + +since ollama doesn't use mrope for anything else, change it to mean the +interleaved version used for qwen3vl +--- + ggml/src/ggml-cpu/ops.cpp | 7 ++----- + ggml/src/ggml-cuda/rope.cu | 12 +++--------- + ggml/src/ggml-metal/ggml-metal.metal | 10 +++------- + ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp | 12 +++--------- + 4 files changed, 11 insertions(+), 30 deletions(-) + +diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp +index 31478dd8e..4d1ed207e 100644 +--- a/ggml/src/ggml-cpu/ops.cpp ++++ b/ggml/src/ggml-cpu/ops.cpp +@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init( + } + + float theta = theta_t; +- if (sector >= sections[0] && sector < sec_w) { ++ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) { + theta = theta_h; + } +- else if (sector >= sec_w && sector < sec_w + sections[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) { + theta = theta_w; + } +- else if (sector >= sec_w + sections[2]) { +- theta = theta_e; +- } + + rope_yarn( + theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] +diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu +index d058504cd..287fe9d2c 100644 +--- a/ggml/src/ggml-cuda/rope.cu ++++ b/ggml/src/ggml-cuda/rope.cu +@@ -151,19 +151,13 @@ static __global__ void rope_multi( + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; + +- float theta_base = 0.0; +- if (sector < sections.v[0]) { +- theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); +- } +- else if (sector >= sections.v[0] && sector < sec_w) { ++ float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); ++ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { + theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); + } +- else if (sector >= sec_w && sector < sec_w + sections.v[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { + theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); + } +- else if (sector >= sec_w + sections.v[2]) { +- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); +- } + + const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; + +diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal +index 375a0c7fd..9866c96b4 100644 +--- a/ggml/src/ggml-metal/ggml-metal.metal ++++ b/ggml/src/ggml-metal/ggml-metal.metal +@@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi( + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + +- float theta_base; +- if (sector < args.sect_0) { +- theta_base = (float) pos[i2]; +- } else if (sector < sec_w01) { ++ float theta_base = (float) pos[i2]; ++ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { + theta_base = (float) pos[i2 + args.ne02]; +- } else if (sector < sec_w012) { ++ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { + theta_base = (float) pos[i2 + args.ne02 * 2]; +- } else { +- theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + +diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +index 111286b49..6fc2b42f8 100644 +--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp ++++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +@@ -31,19 +31,13 @@ void main() { + const int sec_w = p.sections[1] + p.sections[0]; + const uint sector = (i0 / 2) % sect_dims; + +- float theta_base = 0.0; +- if (sector < p.sections[0]) { +- theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); +- } +- else if (sector >= p.sections[0] && sector < sec_w) { ++ float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); ++ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) { + theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); + } +- else if (sector >= sec_w && sector < sec_w + p.sections[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) { + theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + } +- else if (sector >= sec_w + p.sections[2]) { +- theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); +- } + + const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f; + diff --git a/llm/server.go b/llm/server.go index f8b232df..4f7c3760 100644 --- a/llm/server.go +++ b/llm/server.go @@ -339,8 +339,23 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W cmd = exec.Command(exe, params...) 
cmd.Env = os.Environ() - cmd.Stdout = out - cmd.Stderr = out + + if out != nil { + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, 0, fmt.Errorf("failed to spawn server stdout pipe: %w", err) + } + stderr, err := cmd.StderrPipe() + if err != nil { + return nil, 0, fmt.Errorf("failed to spawn server stderr pipe: %w", err) + } + go func() { + io.Copy(out, stdout) //nolint:errcheck + }() + go func() { + io.Copy(out, stderr) //nolint:errcheck + }() + } cmd.SysProcAttr = LlamaServerSysProcAttr // Always filter down the set of GPUs in case there are any unsupported devices that might crash diff --git a/ml/backend.go b/ml/backend.go index bf390c01..b07039e2 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -106,6 +106,11 @@ type Context interface { Arange(start, stop, step float32, dtype DType) Tensor Forward(...Tensor) Context + + // SetBatchSize provides a hint on the batch size to optimize processing + // Uses heuristics if not set + SetBatchSize(int) + Compute(...Tensor) ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 33401c30..eb02c3b1 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -11,6 +11,7 @@ package ggml import "C" import ( + "cmp" "context" "encoding/binary" "errors" @@ -385,7 +386,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { C.int(len(schedBackends)), C.size_t(maxGraphNodes), C._Bool(false), - C._Bool(false), + C._Bool(true), C._Bool(params.AllocMemory), ) @@ -748,6 +749,9 @@ type Context struct { ctx *C.struct_ggml_context graph *C.struct_ggml_cgraph + // batchSize is a hint to optimize processing + batchSize int + // buft is the buffer type used for new tensors buft C.ggml_backend_buffer_type_t @@ -804,6 +808,10 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context { return c } +func (c *Context) SetBatchSize(batchSize int) { + c.batchSize = batchSize +} + func (c *Context) Compute(tensors ...ml.Tensor) { c.ComputeWithNotify(nil, tensors...) } @@ -814,6 +822,11 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { if cb != nil { go cb() } + + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS { panic(fmt.Errorf("error computing ggml graph: %v", status)) } @@ -835,6 +848,10 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { } func (c *Context) Reserve() { + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph) slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched)) @@ -1231,6 +1248,11 @@ func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor { } } +// Mulmat performs matrix multiplication between two tensors. +// If t has shape [m, p, ...] and t2 has shape [m, n, ...], +// Mulmat returns a new Tensor with shape [p, n, ...]. +// +// Note: this is similar to matmul(t2, t.tranpose(-1, -2)) in other libraries. 
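+//
+// For example: if t has shape [4096, 11008] and t2 has shape [4096, 8], then
+// t.Mulmat(ctx, t2) has shape [11008, 8].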
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ b: t.b, @@ -1303,14 +1325,21 @@ func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor { } } -func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor { - if len(shape) != 4 { - panic("expected 4 dimensions") +// Permute permutes t according to order. Permute panics if the number of dimensions +// in order does not match the number of dimensions in t. +func (t *Tensor) Permute(ctx ml.Context, order ...int) ml.Tensor { + if len(order) != len(t.Shape()) && len(order) != 4 { + panic("invalid number of dimensions for permute") + } + + // ggml_permute requires 4 dimensions so fill in the rest + for i := len(order); i < 4; i++ { + order = append(order, i) } return &Tensor{ b: t.b, - t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), + t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(order[0]), C.int(order[1]), C.int(order[2]), C.int(order[3])), } } @@ -1478,14 +1507,7 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor { // Default options - opts := rope.Options{ - Factors: &Tensor{}, - OriginalContextLength: 131072, - ExtrapolationFactor: 0., - AttentionFactor: 1., - BetaFast: 32., - BetaSlow: 1., - } + opts := rope.Options{Factors: &Tensor{}} // Apply any provided options for _, option := range options { @@ -1497,24 +1519,44 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32) } - return &Tensor{ - b: t.b, - t: C.ggml_rope_ext( + var tt *C.struct_ggml_tensor + if len(opts.MRoPE.Sections) > 0 { + mropeSections := make([]C.int32_t, 4) + for i, section := range opts.MRoPE.Sections { + mropeSections[i] = C.int32_t(section) + } + + tt = C.ggml_rope_multi( ctx.(*Context).ctx, dequant, positions.(*Tensor).t, opts.Factors.(*Tensor).t, C.int(ropeDim), + unsafe.SliceData(mropeSections), C.int(opts.Type), - C.int(opts.OriginalContextLength), - C.float(ropeBase), - C.float(ropeScale), - C.float(opts.ExtrapolationFactor), - C.float(opts.AttentionFactor), - C.float(opts.BetaFast), - C.float(opts.BetaSlow), - ), + cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10), + C.float(ropeBase), C.float(ropeScale), + C.float(opts.YaRN.ExtrapolationFactor), + cmp.Or(C.float(opts.YaRN.AttentionFactor), 1), + cmp.Or(C.float(opts.YaRN.BetaFast), 32), + cmp.Or(C.float(opts.YaRN.BetaSlow), 1), + ) + } else { + tt = C.ggml_rope_ext( + ctx.(*Context).ctx, + dequant, + positions.(*Tensor).t, + opts.Factors.(*Tensor).t, + C.int(ropeDim), C.int(opts.Type), + cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10), + C.float(ropeBase), C.float(ropeScale), + C.float(opts.YaRN.ExtrapolationFactor), + cmp.Or(C.float(opts.YaRN.AttentionFactor), 1), + cmp.Or(C.float(opts.YaRN.BetaFast), 32), + cmp.Or(C.float(opts.YaRN.BetaSlow), 1), + ) } + return &Tensor{b: t.b, t: tt} } func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor { diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 80983524..1cab4bb3 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -98,7 +98,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, 
ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // NOTE: will be removed, use device version instead GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -317,6 +317,9 @@ extern "C" { GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) + GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); + // Initialize backend buffers from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index 43c91d9f..21b35ac5 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -112,8 +112,8 @@ extern "C" { // compute the graph with the plan enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph (always async if supported by the backend) - enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // compute graph (always async if supported by the backend). 
batch_size may be -1 if unknown + enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // (optional) event synchronization // record an event on this stream diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 0b757af5..9b0a9b91 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -368,14 +368,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba } enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); ggml_backend_synchronize(backend); return err; } -enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { GGML_ASSERT(backend); - return backend->iface.graph_compute(backend, cgraph); + return backend->iface.graph_compute(backend, cgraph, batch_size); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -750,6 +750,8 @@ struct ggml_backend_sched { bool op_offload; + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics + int debug; // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation @@ -848,7 +850,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1584,7 +1586,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1606,7 +1608,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1698,6 +1700,7 @@ ggml_backend_sched_t ggml_backend_sched_new_ext( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); @@ -1734,6 +1737,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched); } +void 
ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { + sched->batch_size = batch_size; +} + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { GGML_ASSERT(sched); // reset state for the next run diff --git a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp index 2a9ff7f6..6a38a51a 100644 --- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp @@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { delete backend; } -static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); + GGML_UNUSED(batch_size); } static struct ggml_backend_i blas_backend_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 3191faaa..32f14c81 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } -static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); @@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; return ggml_graph_compute(cgraph, &cplan); + + GGML_UNUSED(batch_size); } static const struct ggml_backend_i ggml_backend_cpu_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 31478dd8..4d1ed207 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init( } float theta = theta_t; - if (sector >= sections[0] && sector < sec_w) { + if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) { theta = theta_h; } - else if (sector >= sec_w && sector < sec_w + sections[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) { theta = theta_w; } - else if (sector >= sec_w + sections[2]) { - theta = theta_e; - } rope_yarn( theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index d62f412d..e9b73147 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2775,31 +2775,19 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { #ifdef USE_CUDA_GRAPH static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - bool use_cuda_graph) { + int batch_size, bool 
use_cuda_graph) { // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - // This fix was added in llama.cpp and Ollama in parallel, but with - // different tensor names. - // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 - // ollama: https://github.com/ollama/ollama/pull/11525 - - const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; - const std::string gemma3n_node_name_ollama = "node_"; - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; - - const std::string ffn_moe_bias_suffix = "_exps.bias"; - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2821,30 +2809,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && - // ollama - // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n - // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here - !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && - node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && - // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && - strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && - strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { - // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation - // by means of matching node names. See - // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and - // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, - // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
- use_cuda_graph = false; + // If we have an explicit batch size hint then we don't need to use the tensor name heuristics + if (batch_size >= 0) { + if (batch_size > 1) { + use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); #endif + } + } else { + if (node->op == GGML_OP_ADD && + node->src[1] && node->src[1]->ne[1] > 1 && + (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && + (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && + strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && + strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && + strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { + // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation + // by means of matching node names. See + // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and + // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); +#endif + } } if (node->op == GGML_OP_CPY) { @@ -3247,7 +3239,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; cuda_ctx->pool_set_alloc(true); @@ -3286,7 +3278,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (use_cuda_graph) { cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
if (use_cuda_graph && cuda_graph_update_required) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu index d058504c..287fe9d2 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu @@ -151,19 +151,13 @@ static __global__ void rope_multi( const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; - float theta_base = 0.0; - if (sector < sections.v[0]) { - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sections.v[0] && sector < sec_w) { + float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); } - else if (sector >= sec_w && sector < sec_w + sections.v[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); } - else if (sector >= sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); - } const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 9c0e0c56..f342872d 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -6523,15 +6523,11 @@ kernel void kernel_rope_multi( const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 const int sector = ic % sect_dims; - float theta_base; - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { + float theta_base = (float) pos[i2]; + if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { theta_base = (float) pos[i2 + args.ne02]; - } else if (sector < sec_w012) { + } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; } // end of mrope diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp index f356e4a0..032dee76 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp @@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml GGML_UNUSED(dst); } -static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_metal_t ctx = (ggml_metal_t)backend->context; return ggml_metal_graph_compute(ctx, cgraph); + + GGML_UNUSED(batch_size); } static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 375a0c7f..9866c96b 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi( const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 const int sector = ic % sect_dims; - float theta_base; - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { + float theta_base = (float) pos[i2]; + if 
(sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { theta_base = (float) pos[i2 + args.ne02]; - } else if (sector < sec_w012) { + } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; } // end of mrope diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0bbcecd0..cc68e796 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12039,7 +12039,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru return num_adds; } -static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -12235,6 +12235,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg return GGML_STATUS_SUCCESS; UNUSED(backend); + UNUSED(batch_size); } // Sort the graph for improved parallelism. diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp index 111286b4..633dc20f 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -31,19 +31,13 @@ void main() { const int sec_w = p.sections[1] + p.sections[0]; const uint sector = (i0 / 2) % sect_dims; - float theta_base = 0.0; - if (sector < p.sections[0]) { - theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); - } - else if (sector >= p.sections[0] && sector < sec_w) { + float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) { theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); } - else if (sector >= sec_w && sector < sec_w + p.sections[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) { theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); } - else if (sector >= sec_w + p.sections[2]) { - theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); - } const float freq_factor = p.has_ff != 0 ? 
data_ff[i0/2] : 1.0f; diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go index 4717ea90..31dfdb7b 100644 --- a/ml/backend/ggml/ggml_test.go +++ b/ml/backend/ggml/ggml_test.go @@ -23,7 +23,7 @@ func setup(tb testing.TB) ml.Context { tb.Fatal(err) } - b, err := ml.NewBackend(f.Name(), ml.BackendParams{}) + b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true}) if err != nil { tb.Fatal(err) } @@ -124,3 +124,254 @@ func TestInferShape(t *testing.T) { }) } } + +func EquateTensors(ctx ml.Context) cmp.Option { + return cmp.Comparer(func(x, y ml.Tensor) bool { + ctx.Forward(x, y).Compute(x, y) + return cmp.Equal(x.Shape(), y.Shape()) && + cmp.Equal(x.DType(), y.DType()) && + cmp.Equal(x.Bytes(), y.Bytes()) + }) +} + +func TestMulmat(t *testing.T) { + cases := []struct { + name string + a, b, c func(ml.Context) ml.Tensor + }{ + { + name: "vector x vector", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3, 1, ml.DTypeF32) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{5}, 1) + }, + }, + { + name: "vector x matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 4, 3) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, + }, 1, 3) + }, + }, + { + name: "broadcast vector x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 10*3*4, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, + 86, 110, 134, + 158, 182, 206, + 230, 254, 278, + 302, 326, 350, + 374, 398, 422, + 446, 470, 494, + 518, 542, 566, + 590, 614, 638, + 662, 686, 710, + }, 1, 3, 10) + }, + }, + { + name: "batched matrix x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*5*10, 1, ml.DTypeF32).Reshape(ctx, 4, 5, 10) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*3*10, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, 86, 110, + 38, 126, 214, 302, 390, + 62, 214, 366, 518, 670, + + 1166, 1382, 1598, 1814, 2030, + 1510, 1790, 2070, 2350, 2630, + 1854, 2198, 2542, 2886, 3230, + + 4238, 4646, 5054, 5462, 5870, + 4902, 5374, 5846, 6318, 6790, + 5566, 6102, 6638, 7174, 7710, + + 9230, 9830, 10430, 11030, 11630, + 10214, 10878, 11542, 12206, 12870, + 11198, 11926, 12654, 13382, 14110, + + 16142, 16934, 17726, 18518, 19310, + 17446, 18302, 19158, 20014, 20870, + 18750, 19670, 20590, 21510, 22430, + + 24974, 25958, 26942, 27926, 28910, + 26598, 27646, 28694, 29742, 30790, + 28222, 29334, 30446, 31558, 32670, + + 35726, 36902, 38078, 39254, 40430, + 37670, 38910, 40150, 41390, 42630, + 39614, 40918, 42222, 43526, 44830, + + 48398, 49766, 51134, 52502, 53870, + 50662, 52094, 53526, 54958, 56390, + 52926, 54422, 55918, 57414, 58910, + + 62990, 64550, 66110, 67670, 69230, + 65574, 67198, 68822, 70446, 72070, + 68158, 69846, 71534, 73222, 74910, + + 79502, 81254, 83006, 84758, 86510, + 82406, 84222, 86038, 87854, 89670, + 85310, 87190, 89070, 90950, 92830, + }, 5, 3, 10) + }, + }, + { + name: "broadcast matrix x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*5, 1, 
ml.DTypeF32).Reshape(ctx, 4, 5) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*3*10, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, 86, 110, + 38, 126, 214, 302, 390, + 62, 214, 366, 518, 670, + + 86, 302, 518, 734, 950, + 110, 390, 670, 950, 1230, + 134, 478, 822, 1166, 1510, + + 158, 566, 974, 1382, 1790, + 182, 654, 1126, 1598, 2070, + 206, 742, 1278, 1814, 2350, + + 230, 830, 1430, 2030, 2630, + 254, 918, 1582, 2246, 2910, + 278, 1006, 1734, 2462, 3190, + + 302, 1094, 1886, 2678, 3470, + 326, 1182, 2038, 2894, 3750, + 350, 1270, 2190, 3110, 4030, + + 374, 1358, 2342, 3326, 4310, + 398, 1446, 2494, 3542, 4590, + 422, 1534, 2646, 3758, 4870, + + 446, 1622, 2798, 3974, 5150, + 470, 1710, 2950, 4190, 5430, + 494, 1798, 3102, 4406, 5710, + + 518, 1886, 3254, 4622, 5990, + 542, 1974, 3406, 4838, 6270, + 566, 2062, 3558, 5054, 6550, + + 590, 2150, 3710, 5270, 6830, + 614, 2238, 3862, 5486, 7110, + 638, 2326, 4014, 5702, 7390, + + 662, 2414, 4166, 5918, 7670, + 686, 2502, 4318, 6134, 7950, + 710, 2590, 4470, 6350, 8230, + }, 5, 3, 10) + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + ctx := setup(t) + a, b := tt.a(ctx), tt.b(ctx) + c := a.Mulmat(ctx, b) + if diff := cmp.Diff(tt.c(ctx), c, EquateTensors(ctx)); diff != "" { + t.Errorf("MulMat() result mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestPermute(t *testing.T) { + cases := []struct { + name string + input func(ml.Context) ml.Tensor + shape []int + want func(ml.Context) ml.Tensor + }{ + { + name: "transpose", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3*2, 1, ml.DTypeF32).Reshape(ctx, 3, 2) + }, + shape: []int{1, 0, 2, 3}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 3, + 1, 4, + 2, 5, + }, 2, 3) + }, + }, + { + name: "transpose fill dims", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3*2, 1, ml.DTypeF32).Reshape(ctx, 3, 2) + }, + shape: []int{1, 0}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 3, + 1, 4, + 2, 5, + }, 2, 3) + }, + }, + { + name: "permute 3d", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 5*3*2, 1, ml.DTypeF32).Reshape(ctx, 2, 3, 5) + }, + shape: []int{2, 0, 1, 3}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 2, 4, + 6, 8, 10, + 12, 14, 16, + 18, 20, 22, + 24, 26, 28, + + 1, 3, 5, + 7, 9, 11, + 13, 15, 17, + 19, 21, 23, + 25, 27, 29, + }, 3, 5, 2) + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + ctx := setup(t) + got := tt.input(ctx).Permute(ctx, tt.shape...).Contiguous(ctx) + if diff := cmp.Diff(tt.want(ctx), got, EquateTensors(ctx)); diff != "" { + t.Errorf("Permute() result mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/ml/nn/rope/rope.go b/ml/nn/rope/rope.go index 57dd2252..bca8058d 100644 --- a/ml/nn/rope/rope.go +++ b/ml/nn/rope/rope.go @@ -4,21 +4,21 @@ import "github.com/ollama/ollama/ml" // Options contains optional parameters for RoPE function type Options struct { - Type int - Factors ml.Tensor - OriginalContextLength int + Type int + Factors ml.Tensor // YaRN options - ExtrapolationFactor, - AttentionFactor, - BetaFast, - BetaSlow float32 -} + YaRN struct { + OriginalContextLength int + ExtrapolationFactor, + AttentionFactor, + BetaFast, + BetaSlow float32 + } -// WithOriginalContextLength sets a custom context length -func 
WithOriginalContextLength(n int) func(*Options) { - return func(opts *Options) { - opts.OriginalContextLength = n + // MRoPE options + MRoPE struct { + Sections []int } } @@ -38,14 +38,28 @@ func WithFactors(factors ml.Tensor) func(*Options) { } } +// WithOriginalContextLength sets a custom context length +func WithOriginalContextLength(n int) func(*Options) { + return func(opts *Options) { + opts.YaRN.OriginalContextLength = n + } +} + func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) { return func(opts *Options) { - opts.ExtrapolationFactor = extrapolationFactor + opts.YaRN.ExtrapolationFactor = extrapolationFactor } } func WithAttentionFactor(attentionFactor float32) func(*Options) { return func(opts *Options) { - opts.AttentionFactor = attentionFactor + opts.YaRN.AttentionFactor = attentionFactor + } +} + +func WithMRoPESections(sections []int) func(*Options) { + return func(opts *Options) { + opts.Type |= 1 << 3 + opts.MRoPE.Sections = sections } } diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go index dc91bdea..ce5ded29 100644 --- a/model/models/qwen25vl/process_image.go +++ b/model/models/qwen25vl/process_image.go @@ -79,6 +79,8 @@ type Grid struct { } func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) { + img = imageproc.Composite(img) + origWidth := img.Bounds().Dx() origHeight := img.Bounds().Dy() diff --git a/model/models/qwen3vl/imageprocessor.go b/model/models/qwen3vl/imageprocessor.go index 621167f5..2453a87d 100644 --- a/model/models/qwen3vl/imageprocessor.go +++ b/model/models/qwen3vl/imageprocessor.go @@ -83,6 +83,8 @@ type Grid struct { } func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) { + img = imageproc.Composite(img) + origWidth := img.Bounds().Dx() origHeight := img.Bounds().Dy() diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go index 08beb37c..579863ae 100644 --- a/model/models/qwen3vl/model.go +++ b/model/models/qwen3vl/model.go @@ -112,7 +112,8 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions))) + // ggml mrope requires 4 positions per token: [time, height, width, extra] + positionSlice := slices.Collect(makeSlice2D[int32](4, len(batch.Positions))) for i, id := range batch.Positions { if id < int32(len(m.positionCache)) { id = m.positionCache[id] @@ -123,6 +124,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { positionSlice[0][i] = id positionSlice[1][i] = id positionSlice[2][i] = id + // positionSlice[3] is intentionally left as zeros } hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx) @@ -147,8 +149,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } } - positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice)) - cos, sin := m.rotaryEmbedding(ctx, positions) + positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice)) for i, layer := range m.TextModel.Layers { if m.Cache != nil { m.Cache.SetLayer(i) @@ -159,7 +160,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { outputs = batch.Outputs } - hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, 
m.Options) + hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options) if i < len(deepstackVisualEmbeds) { hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i]) } @@ -191,9 +192,10 @@ func New(c fs.Config) (model.Model, error) { ImageProcessor: newImageProcessor(c), } - m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) { + m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) { m.positionCache = nil - return nil, kvcache.ErrNotSupported + positions = positions.Repeat(ctx, 1, 4).Reshape(ctx, -1) + return m.Options.applyRotaryPositionalEmbedding(ctx, key, positions), nil }) return &m, nil } diff --git a/model/models/qwen3vl/model_text.go b/model/models/qwen3vl/model_text.go index 14e7d7dc..f5767f65 100644 --- a/model/models/qwen3vl/model_text.go +++ b/model/models/qwen3vl/model_text.go @@ -10,6 +10,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" ) @@ -27,14 +29,18 @@ type TextOptions struct { numExperts, numExpertsUsed int normTopKProb bool - - inverseFrequenciesCache []float32 } func (o TextOptions) headDim() int { return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) } +func (o TextOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor { + return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/float32(math.Sqrt(float64(o.ropeScale))), + rope.WithMRoPESections(o.mropeSections), + ) +} + type TextAttention struct { Query *nn.Linear `gguf:"attn_q"` QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` @@ -44,7 +50,7 @@ type TextAttention struct { Output *nn.Linear `gguf:"attn_output"` } -func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { +func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { batchSize := hiddenStates.Dim(1) query := sa.Query.Forward(ctx, hiddenStates) @@ -58,8 +64,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tenso query = sa.QueryNorm.Forward(ctx, query, opts.eps) key = sa.KeyNorm.Forward(ctx, key, opts.eps) - query = applyRotaryPositionalEmbedding(ctx, query, cos, sin) - key = applyRotaryPositionalEmbedding(ctx, key, cos, sin) + query = opts.applyRotaryPositionalEmbedding(ctx, query, positions) + key = opts.applyRotaryPositionalEmbedding(ctx, key, positions) attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache) attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize) @@ -125,10 +131,10 @@ type TextLayer struct { TextMLP } -func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { +func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { residual := hiddenStates hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) - hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts) + hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, positions, cache, opts) if outputs != nil { hiddenStates = hiddenStates.Rows(ctx, outputs) @@ -153,42 +159,6 @@ type TextModel struct 
{ Options *TextOptions } -func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) { - positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1)) - if len(m.Options.inverseFrequenciesCache) == 0 { - m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2) - for i := range m.Options.inverseFrequenciesCache { - frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim()))) - m.Options.inverseFrequenciesCache[i] = 1 / frequency - } - } - - inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache)) - - positions = positions.Cast(ctx, ml.DTypeF32) - frequencies := inverseFrequencies.Mulmat(ctx, positions) - - interleaved := frequencies.View(ctx, - 0, frequencies.Dim(0), - frequencies.Stride(1), frequencies.Dim(1), - ) - - for _, i := range []int{1, 2} { - args := []int{ - i * frequencies.Stride(0), 1, - 3 * frequencies.Stride(0), m.Options.mropeSections[i], - frequencies.Stride(1), frequencies.Dim(1), - } - - ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...). - Copy(ctx, interleaved.View(ctx, args[0], args[1:]...))) - } - - interleaved = interleaved.Concat(ctx, interleaved, 0) - interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2)) - return interleaved.Cos(ctx), interleaved.Sin(ctx) -} - var _ model.Model = (*Model)(nil) func newTextModel(c fs.Config) *TextModel { diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index 78ceb771..6af89021 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -86,6 +86,9 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten computeCtx.Forward(tensors...) entry.data = make([][]float32, len(entry.mm)) + // Multimodal processing is computationally intensive, so treat it similarly to a large batch + computeCtx.SetBatchSize(512) + if !reserve { computeCtx.Compute(tensors...) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 153a3e57..3e8c1e22 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -598,6 +598,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er // Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs)) batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs)) + nextBatch.ctx.SetBatchSize(len(batchInputs)) nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch) if err != nil { err = fmt.Errorf("failed to build graph: %w", err) @@ -1009,12 +1010,17 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { } } -func (s *Server) reserveWorstCaseGraph() error { +func (s *Server) reserveWorstCaseGraph(prompt bool) error { ctx := s.model.Backend().NewContext() defer ctx.Close() var err error - inputs := make([]*input.Input, s.batchSize) + batchSize := 1 + if prompt { + batchSize = s.batchSize + } + + inputs := make([]*input.Input, batchSize) for i := range inputs { inputs[i] = &input.Input{} } @@ -1031,7 +1037,7 @@ func (s *Server) reserveWorstCaseGraph() error { // - The result may now be larger than a batch (images may not fit in a // single batch), so trim based on what will fit and must be grouped together. // - Fill out the rest of the space with text tokens. 
-	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
+	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); prompt && ok {
		mmCtx := s.model.Backend().NewContext()
		defer mmCtx.Close()
@@ -1058,10 +1064,10 @@ func (s *Server) reserveWorstCaseGraph() error {
		}
	}
-	if len(inputs) < s.batchSize {
-		newInputs := make([]*input.Input, s.batchSize)
+	if len(inputs) < batchSize {
+		newInputs := make([]*input.Input, batchSize)
		copy(newInputs, inputs)
-	for i := len(inputs); i < s.batchSize; i++ {
+	for i := len(inputs); i < batchSize; i++ {
			newInputs[i] = &input.Input{}
		}
		inputs = newInputs
@@ -1103,6 +1109,7 @@ func (s *Server) reserveWorstCaseGraph() error {
		return err
	}
+	ctx.SetBatchSize(batchSize)
	ctx.Forward(t).Reserve()

	return nil
@@ -1160,7 +1167,12 @@ func (s *Server) allocModel(
	s.seqs = make([]*Sequence, s.parallel)
	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-	return s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph(true)
+	if err != nil {
+		return err
+	}
+
+	return s.reserveWorstCaseGraph(false)
 }

// closeModel frees all memory associated with a model
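
The common thread in the hunks above is the new batch-size hint: the runner already knows how many tokens it is about to evaluate, so it passes that number down (ctx.SetBatchSize on the Go side, ggml_backend_sched_set_batch_size and the extra graph_compute argument on the ggml side) instead of letting each backend guess from tensor names and shapes. The scheduler only considers op offload when the hint is -1 (unknown) or at least 32 tokens, and the CUDA backend disables CUDA graph capture whenever an explicit hint is greater than 1, falling back to the old name-matching heuristics only when no hint was given. A minimal caller-side sketch in C, assuming ggml_backend_sched_set_batch_size() is also declared in ggml-backend.h (that hunk is not part of this excerpt):

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper, not part of the patch: evaluate a graph on an existing
// scheduler while telling the backends how many tokens the batch contains.
static enum ggml_status compute_with_batch_hint(ggml_backend_sched_t sched,
                                                struct ggml_cgraph * gf,
                                                int n_tokens) {
    // An explicit hint > 1 disables CUDA graph capture for this evaluation;
    // a hint of -1 (the default set in ggml_backend_sched_new_ext) falls back
    // to the per-backend heuristics, and the scheduler keeps op offload
    // enabled only for hints >= 32 or for unknown batch sizes.
    ggml_backend_sched_set_batch_size(sched, n_tokens);
    return ggml_backend_sched_graph_compute(sched, gf);
}

Because the default hint is -1, callers that never set it keep the pre-patch behaviour; only the runner paths shown above (forwardBatch, multimodal encoding, and worst-case graph reservation) opt in.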