diff --git a/README.md b/README.md index eb11483c..03cb725c 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ support lists. Explore its through self-build as guided on the wiki. curl -fsSL https://ollama.com/install.sh | sh ``` -[Manual install instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md) +[Manual install instructions](https://docs.ollama.com/linux#manual-install) [Configuring Environment Variables Tip For Unsupport GPUs](https://github.com/likelovewant/ollama-for-amd/wiki#troubleshooting-amd-gpu-support-in-linux) @@ -132,7 +132,7 @@ Ollama supports importing GGUF models in the Modelfile: ### Import from Safetensors -See the [guide](docs/import.md) on importing models for more information. +See the [guide](https://docs.ollama.com/import) on importing models for more information. ### Customize a prompt @@ -165,7 +165,7 @@ ollama run mario Hello! It's your friend Mario. ``` -For more information on working with a Modelfile, see the [Modelfile](docs/modelfile.md) documentation. +For more information on working with a Modelfile, see the [Modelfile](https://docs.ollama.com/modelfile) documentation. ## CLI Reference diff --git a/cmd/cmd.go b/cmd/cmd.go index 369a27a4..1050d7b0 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -280,6 +280,13 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error { return err } + if info, err := client.Show(cmd.Context(), &api.ShowRequest{Model: opts.Model}); err != nil { + return err + } else if info.RemoteHost != "" { + // Cloud model, no need to load/unload + return nil + } + req := &api.GenerateRequest{ Model: opts.Model, KeepAlive: opts.KeepAlive, @@ -720,23 +727,21 @@ func DeleteHandler(cmd *cobra.Command, args []string) error { return err } - // Unload the model if it's running before deletion - opts := &runOptions{ - Model: args[0], - KeepAlive: &api.Duration{Duration: 0}, - } - if err := loadOrUnloadModel(cmd, opts); err != nil { - if !strings.Contains(strings.ToLower(err.Error()), "not found") { - fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0]) + for _, arg := range args { + // Unload the model if it's running before deletion + if err := loadOrUnloadModel(cmd, &runOptions{ + Model: args[0], + KeepAlive: &api.Duration{Duration: 0}, + }); err != nil { + if !strings.Contains(strings.ToLower(err.Error()), "not found") { + fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0]) + } } - } - for _, name := range args { - req := api.DeleteRequest{Name: name} - if err := client.Delete(cmd.Context(), &req); err != nil { + if err := client.Delete(cmd.Context(), &api.DeleteRequest{Name: arg}); err != nil { return err } - fmt.Printf("deleted '%s'\n", name) + fmt.Printf("deleted '%s'\n", arg) } return nil } diff --git a/discover/runner.go b/discover/runner.go index e74050d0..caaef222 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -335,11 +335,14 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
ctx, cancel := context.WithTimeout(ctx, 3*time.Second) defer cancel() + // Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct + devFilter := ml.GetVisibleDevicesEnv(devices) + for dir := range libDirs { - updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil) + updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, devFilter) for _, u := range updatedDevices { for i := range devices { - if u.DeviceID == devices[i].DeviceID { + if u.DeviceID == devices[i].DeviceID && u.PCIID == devices[i].PCIID { updated[i] = true devices[i].FreeMemory = u.FreeMemory break diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index c0ca068a..909104c6 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -895,8 +895,8 @@ func (f GGML) FlashAttention() bool { return slices.Contains([]string{ "gemma3", "gptoss", "gpt-oss", - "qwen3", - "qwen3moe", + "qwen3", "qwen3moe", + "qwen3vl", "qwen3vlmoe", }, f.KV().String("general.architecture")) } diff --git a/integration/api_test.go b/integration/api_test.go index 48572085..39eea39c 100644 --- a/integration/api_test.go +++ b/integration/api_test.go @@ -381,126 +381,3 @@ func TestAPIShowModel(t *testing.T) { t.Errorf("%s missing modified_at: %#v", modelName, resp) } } - -func TestAPIEmbeddings(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() - req := api.EmbeddingRequest{ - Model: libraryEmbedModels[0], - Prompt: "why is the sky blue?", - Options: map[string]interface{}{ - "temperature": 0, - "seed": 123, - }, - } - - if err := PullIfMissing(ctx, client, req.Model); err != nil { - t.Fatalf("pull failed %s", err) - } - - resp, err := client.Embeddings(ctx, &req) - if err != nil { - t.Fatalf("embeddings call failed %s", err) - } - if len(resp.Embedding) == 0 { - t.Errorf("zero length embedding response") - } -} - -func TestAPIToolCalling(t *testing.T) { - initialTimeout := 60 * time.Second - streamTimeout := 30 * time.Second - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - defer cancel() - - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() - - modelName := "qwen3:0.6b" - if err := PullIfMissing(ctx, client, modelName); err != nil { - t.Fatalf("pull failed %s", err) - } - - tools := []api.Tool{ - { - Type: "function", - Function: api.ToolFunction{ - Name: "get_weather", - Description: "Get the current weather in a given location", - Parameters: api.ToolFunctionParameters{ - Type: "object", - Required: []string{"location"}, - Properties: map[string]api.ToolProperty{ - "location": { - Type: api.PropertyType{"string"}, - Description: "The city and state, e.g. 
San Francisco, CA", - }, - }, - }, - }, - }, - } - - req := api.ChatRequest{ - Model: modelName, - Messages: []api.Message{ - { - Role: "user", - Content: "Call get_weather with location set to San Francisco.", - }, - }, - Tools: tools, - Options: map[string]any{ - "temperature": 0, - }, - } - - stallTimer := time.NewTimer(initialTimeout) - var gotToolCall bool - var lastToolCall api.ToolCall - - fn := func(response api.ChatResponse) error { - if len(response.Message.ToolCalls) > 0 { - gotToolCall = true - lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] - } - if !stallTimer.Reset(streamTimeout) { - return fmt.Errorf("stall was detected while streaming response, aborting") - } - return nil - } - - stream := true - req.Stream = &stream - done := make(chan int) - var genErr error - go func() { - genErr = client.Chat(ctx, &req, fn) - done <- 0 - }() - - select { - case <-stallTimer.C: - t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) - case <-done: - if genErr != nil { - t.Fatalf("chat failed: %v", genErr) - } - - if !gotToolCall { - t.Fatalf("expected at least one tool call, got none") - } - - if lastToolCall.Function.Name != "get_weather" { - t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") - } - - if _, ok := lastToolCall.Function.Arguments["location"]; !ok { - t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) - } - case <-ctx.Done(): - t.Error("outer test context done while waiting for tool-calling chat") - } -} diff --git a/integration/tools_test.go b/integration/tools_test.go new file mode 100644 index 00000000..d6b8dfa5 --- /dev/null +++ b/integration/tools_test.go @@ -0,0 +1,132 @@ +//go:build integration + +package integration + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestAPIToolCalling(t *testing.T) { + initialTimeout := 60 * time.Second + streamTimeout := 60 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + minVRAM := map[string]uint64{ + "qwen3-vl": 16, + "gpt-oss:20b": 16, + "gpt-oss:120b": 70, + "qwen3": 6, + "llama3.1": 8, + "llama3.2": 4, + "mistral": 6, + "qwen2.5": 6, + "qwen2": 6, + "mistral-nemo": 9, + "mistral-small": 16, + "mixtral:8x22b": 80, + "qwq": 20, + "granite3.3": 7, + } + + for _, model := range libraryToolsModels { + t.Run(model, func(t *testing.T) { + if v, ok := minVRAM[model]; ok { + skipUnderMinVRAM(t, v) + } + + if err := PullIfMissing(ctx, client, model); err != nil { + t.Fatalf("pull failed %s", err) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather in a given location", + Parameters: api.ToolFunctionParameters{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]api.ToolProperty{ + "location": { + Type: api.PropertyType{"string"}, + Description: "The city and state, e.g. 
San Francisco, CA", + }, + }, + }, + }, + }, + } + + req := api.ChatRequest{ + Model: model, + Messages: []api.Message{ + { + Role: "user", + Content: "Call get_weather with location set to San Francisco.", + }, + }, + Tools: tools, + Options: map[string]any{ + "temperature": 0, + }, + } + + stallTimer := time.NewTimer(initialTimeout) + var gotToolCall bool + var lastToolCall api.ToolCall + + fn := func(response api.ChatResponse) error { + if len(response.Message.ToolCalls) > 0 { + gotToolCall = true + lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] + } + if !stallTimer.Reset(streamTimeout) { + return fmt.Errorf("stall was detected while streaming response, aborting") + } + return nil + } + + stream := true + req.Stream = &stream + done := make(chan int) + var genErr error + go func() { + genErr = client.Chat(ctx, &req, fn) + done <- 0 + }() + + select { + case <-stallTimer.C: + t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) + case <-done: + if genErr != nil { + t.Fatalf("chat failed: %v", genErr) + } + + if !gotToolCall { + t.Fatalf("expected at least one tool call, got none") + } + + if lastToolCall.Function.Name != "get_weather" { + t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") + } + + if _, ok := lastToolCall.Function.Arguments["location"]; !ok { + t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for tool-calling chat") + } + }) + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index c0bac5e1..8a362408 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -260,6 +260,22 @@ var ( "snowflake-arctic-embed", "snowflake-arctic-embed2", } + libraryToolsModels = []string{ + "qwen3-vl", + "gpt-oss:20b", + "gpt-oss:120b", + "qwen3", + "llama3.1", + "llama3.2", + "mistral", + "qwen2.5", + "qwen2", + "mistral-nemo", + "mistral-small", + "mixtral:8x22b", + "qwq", + "granite3.3", + } blueSkyPrompt = "why is the sky blue? Be brief but factual in your reply" blueSkyExpected = []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength", "interact"} diff --git a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch deleted file mode 100644 index 85cba5b3..00000000 --- a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Oliver Simons -Date: Tue, 22 Jul 2025 11:02:28 +0200 -Subject: [PATCH] Enable CUDA Graphs for gemma3n. - -Similar to -https://github.com/ggml-org/llama.cpp/pull/14741, -though ollama has a slightly different model graph -than llama.cpp which requires different workaround -checks. ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 5b852f69..827e3205 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2689,14 +2689,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - -+ // This fix was added in llama.cpp and Ollama in parallel, but with -+ // different tensor names. 
-+ // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 -+ // ollama: https://github.com/ollama/ollama/pull/11525 -+ -+ const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; -+ const std::string gemma3n_node_name_ollama = "node_"; -+ - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; - const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; -+ -+ const std::string ffn_moe_bias_suffix = "_exps.bias"; -+ - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; - const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; - const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; - const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; - const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - -+ - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - -@@ -2720,6 +2732,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && -+ // ollama -+ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n -+ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here -+ !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && -+ node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && -+ // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && diff --git a/llama/patches/0019-ggml-Add-batch-size-hint.patch b/llama/patches/0019-ggml-Add-batch-size-hint.patch new file mode 100644 index 00000000..76d61e2d --- /dev/null +++ b/llama/patches/0019-ggml-Add-batch-size-hint.patch @@ -0,0 +1,300 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Tue, 28 Oct 2025 17:36:54 -0700 +Subject: [PATCH] ggml: Add batch size hint + +Some operations use heuristics to determine the batch size, which +affects offloading decisions. However, these are not always +accurate when looking at single operations. This provides an +explicit signal on the batch size from higher layers to ensure +consistent performance. 
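As a rough illustration of how this hint is meant to be used (not part of the patch itself): a caller that already knows the token count for the current batch can pass it down before computing the graph. The sketch below assumes the `ml.Context.SetBatchSize` method added later in this diff; `forwardBatch` is a hypothetical helper.

```go
package example

import "github.com/ollama/ollama/ml"

// forwardBatch is a hypothetical helper: the batch size is known exactly at
// this layer, so pass it down instead of letting the scheduler guess it from
// tensor shapes. If the hint is left unset, the scheduler falls back to its
// existing heuristics.
func forwardBatch(ctx ml.Context, inputs []int32, logits ml.Tensor) {
	ctx.SetBatchSize(len(inputs)) // forwarded to ggml_backend_sched_set_batch_size

	ctx.Forward(logits).Compute(logits)
}
```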
+--- + ggml/include/ggml-backend.h | 5 ++- + ggml/src/ggml-backend-impl.h | 4 +-- + ggml/src/ggml-backend.cpp | 19 +++++++---- + ggml/src/ggml-blas/ggml-blas.cpp | 3 +- + ggml/src/ggml-cpu/ggml-cpu.cpp | 4 ++- + ggml/src/ggml-cuda/ggml-cuda.cu | 48 +++++++++++++++++----------- + ggml/src/ggml-metal/ggml-metal.cpp | 4 ++- + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +- + 8 files changed, 58 insertions(+), 32 deletions(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 229bf387b..2763f2bd6 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -98,7 +98,7 @@ extern "C" { + + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); +- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // NOTE: will be removed, use device version instead + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); +@@ -307,6 +307,9 @@ extern "C" { + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + ++ // Provide a hint on the batch size to optimize processing (uses heuristics if unset) ++ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); ++ + // Initialize backend buffers from a measure graph + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success + +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index 6792ba986..0f5b03cef 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -106,8 +106,8 @@ extern "C" { + // compute the graph with the plan + enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + +- // compute graph (always async if supported by the backend) +- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ // compute graph (always async if supported by the backend). 
batch_size may be -1 if unknown ++ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // (optional) event synchronization + // record an event on this stream +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index cb2b99562..41eef3b5f 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba + } + + enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ++ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); + ggml_backend_synchronize(backend); + return err; + } + +-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + GGML_ASSERT(backend); +- return backend->iface.graph_compute(backend, cgraph); ++ return backend->iface.graph_compute(backend, cgraph, batch_size); + } + + bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +@@ -722,6 +722,8 @@ struct ggml_backend_sched { + + bool op_offload; + ++ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics ++ + int debug; + }; + +@@ -814,7 +816,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st + if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); + // check if a backend with higher prio wants to offload the op +- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { ++ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); +@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + } + + if (!sched->callback_eval) { +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); + +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new( + + sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->op_offload = op_offload; ++ sched->batch_size = -1; + + ggml_backend_sched_reset(sched); + +@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { + free(sched); + } + ++void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { ++ 
sched->batch_size = batch_size; ++} ++ + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + // reset state for the next run +diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp +index 5b888cdd8..88d088952 100644 +--- a/ggml/src/ggml-blas/ggml-blas.cpp ++++ b/ggml/src/ggml-blas/ggml-blas.cpp +@@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { + delete backend; + } + +-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { +@@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); ++ GGML_UNUSED(batch_size); + } + + static struct ggml_backend_i blas_backend_i = { +diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp +index 3191faaa4..32f14c811 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu.cpp ++++ b/ggml/src/ggml-cpu/ggml-cpu.cpp +@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe + GGML_UNUSED(backend); + } + +-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); +@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + + return ggml_graph_compute(cgraph, &cplan); ++ ++ GGML_UNUSED(batch_size); + } + + static const struct ggml_backend_i ggml_backend_cpu_i = { +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 5b852f690..c555cd30f 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2684,7 +2684,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + + #ifdef USE_CUDA_GRAPH + static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +- bool use_cuda_graph) { ++ int batch_size, bool use_cuda_graph) { + + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); +@@ -2718,24 +2718,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && +- node->src[1] && node->src[1]->ne[1] > 1 && +- (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && +- (node->src[1] ? 
node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && +- strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && +- strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && +- strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { +- // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation +- // by means of matching node names. See +- // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and +- // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, +- // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. +- use_cuda_graph = false; ++ // If we have an explicit batch size hint then we don't need to use the tensor name heuristics ++ if (batch_size >= 0) { ++ if (batch_size > 1) { ++ use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); + #endif ++ } ++ } else { ++ if (node->op == GGML_OP_ADD && ++ node->src[1] && node->src[1]->ne[1] > 1 && ++ (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && ++ (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && ++ strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && ++ strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && ++ strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { ++ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation ++ // by means of matching node names. See ++ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and ++ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
++ use_cuda_graph = false; ++#ifndef NDEBUG ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++#endif ++ } + } + + if (node->op == GGML_OP_CPY) { +@@ -3132,7 +3142,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx + } + } + +-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); +@@ -3170,7 +3180,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, + if (use_cuda_graph) { + cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); + +- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); ++ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); + + // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. + if (use_cuda_graph && cuda_graph_update_required) { +diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp +index f2ff9f322..05ff6a5a6 100644 +--- a/ggml/src/ggml-metal/ggml-metal.cpp ++++ b/ggml/src/ggml-metal/ggml-metal.cpp +@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml + GGML_UNUSED(dst); + } + +-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_metal_t ctx = (ggml_metal_t)backend->context; + + return ggml_metal_graph_compute(ctx, cgraph); ++ ++ GGML_UNUSED(batch_size); + } + + static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index ed83236f4..bd3ece516 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -12015,7 +12015,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru + return num_adds; + } + +-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + +@@ -12211,6 +12211,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg + return GGML_STATUS_SUCCESS; + + UNUSED(backend); ++ UNUSED(batch_size); + } + + // Sort the graph for improved parallelism. diff --git a/llama/patches/0022-ggml-No-alloc-mode.patch b/llama/patches/0022-ggml-No-alloc-mode.patch index 019cb886..d03c6c84 100644 --- a/llama/patches/0022-ggml-No-alloc-mode.patch +++ b/llama/patches/0022-ggml-No-alloc-mode.patch @@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data. 
5 files changed, 310 insertions(+), 44 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 229bf387..1ff53ed0 100644 +index 2763f2bd6..b3b5b356a 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -305,6 +305,7 @@ extern "C" { @@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644 + GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - // Initialize backend buffers from a measure graph + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h -index 6792ba98..3c3f22fc 100644 +index 0f5b03cef..7bdf9d81f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -26,12 +26,17 @@ extern "C" { @@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644 struct ggml_backend { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index cb2b9956..6ef5eeaf 100644 +index 41eef3b5f..c81a2e48a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644 void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); -@@ -723,6 +743,12 @@ struct ggml_backend_sched { - bool op_offload; +@@ -725,6 +745,12 @@ struct ggml_backend_sched { + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics int debug; + @@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644 }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) -@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new( size_t graph_size, bool parallel, bool op_offload) { @@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644 GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); -@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } @@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644 sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; + sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); -@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { +@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); } @@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644 } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); -@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * +@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * return false; } @@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644 ggml_backend_sched_reset(sched); return true; -@@ -1813,7 +1875,13 @@ size_t 
ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, +@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); @@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh -index e0abde54..28d6bcd7 100644 +index e0abde542..28d6bcd71 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -35,6 +35,31 @@ @@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644 + } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 827e3205..811462c7 100644 +index f4d4a4267..ac70dcac8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -540,7 +541,7 @@ index 827e3205..811462c7 100644 }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { -@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, +@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { @@ -548,7 +549,7 @@ index 827e3205..811462c7 100644 // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; -@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } @@ -560,15 +561,15 @@ index 827e3205..811462c7 100644 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); if (!disable_fusion) { -@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx - static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + cuda_ctx->pool_set_alloc(true); ggml_cuda_set_device(cuda_ctx->device); -@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, +@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; } @@ -640,7 +641,7 @@ index 827e3205..811462c7 100644 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { +@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, /* .graph_optimize = */ NULL, diff --git 
a/llama/patches/0032-interleave-multi-rope.patch b/llama/patches/0032-interleave-multi-rope.patch new file mode 100644 index 00000000..eb41639e --- /dev/null +++ b/llama/patches/0032-interleave-multi-rope.patch @@ -0,0 +1,113 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Web, 16 Oct 2025 20:37:19 -0700 +Subject: [PATCH] interleave multi rope + +since ollama doesn't use mrope for anything else, change it to mean the +interleaved version used for qwen3vl +--- + ggml/src/ggml-cpu/ops.cpp | 7 ++----- + ggml/src/ggml-cuda/rope.cu | 12 +++--------- + ggml/src/ggml-metal/ggml-metal.metal | 10 +++------- + ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp | 12 +++--------- + 4 files changed, 11 insertions(+), 30 deletions(-) + +diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp +index 31478dd8e..4d1ed207e 100644 +--- a/ggml/src/ggml-cpu/ops.cpp ++++ b/ggml/src/ggml-cpu/ops.cpp +@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init( + } + + float theta = theta_t; +- if (sector >= sections[0] && sector < sec_w) { ++ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) { + theta = theta_h; + } +- else if (sector >= sec_w && sector < sec_w + sections[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) { + theta = theta_w; + } +- else if (sector >= sec_w + sections[2]) { +- theta = theta_e; +- } + + rope_yarn( + theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] +diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu +index d058504cd..287fe9d2c 100644 +--- a/ggml/src/ggml-cuda/rope.cu ++++ b/ggml/src/ggml-cuda/rope.cu +@@ -151,19 +151,13 @@ static __global__ void rope_multi( + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; + +- float theta_base = 0.0; +- if (sector < sections.v[0]) { +- theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); +- } +- else if (sector >= sections.v[0] && sector < sec_w) { ++ float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); ++ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { + theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); + } +- else if (sector >= sec_w && sector < sec_w + sections.v[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { + theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); + } +- else if (sector >= sec_w + sections.v[2]) { +- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); +- } + + const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; + +diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal +index 375a0c7fd..9866c96b4 100644 +--- a/ggml/src/ggml-metal/ggml-metal.metal ++++ b/ggml/src/ggml-metal/ggml-metal.metal +@@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi( + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + +- float theta_base; +- if (sector < args.sect_0) { +- theta_base = (float) pos[i2]; +- } else if (sector < sec_w01) { ++ float theta_base = (float) pos[i2]; ++ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { + theta_base = (float) pos[i2 + args.ne02]; +- } else if (sector < sec_w012) { ++ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { + theta_base = (float) pos[i2 + args.ne02 * 2]; +- } else { +- theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + +diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +index 111286b49..6fc2b42f8 100644 +--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp ++++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +@@ -31,19 +31,13 @@ void main() { + const int sec_w = p.sections[1] + p.sections[0]; + const uint sector = (i0 / 2) % sect_dims; + +- float theta_base = 0.0; +- if (sector < p.sections[0]) { +- theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); +- } +- else if (sector >= p.sections[0] && sector < sec_w) { ++ float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); ++ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) { + theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); + } +- else if (sector >= sec_w && sector < sec_w + p.sections[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) { + theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + } +- else if (sector >= sec_w + p.sections[2]) { +- theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); +- } + + const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f; + diff --git a/llm/server.go b/llm/server.go index f8b232df..4f7c3760 100644 --- a/llm/server.go +++ b/llm/server.go @@ -339,8 +339,23 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W cmd = exec.Command(exe, params...) 
cmd.Env = os.Environ() - cmd.Stdout = out - cmd.Stderr = out + + if out != nil { + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, 0, fmt.Errorf("failed to spawn server stdout pipe: %w", err) + } + stderr, err := cmd.StderrPipe() + if err != nil { + return nil, 0, fmt.Errorf("failed to spawn server stderr pipe: %w", err) + } + go func() { + io.Copy(out, stdout) //nolint:errcheck + }() + go func() { + io.Copy(out, stderr) //nolint:errcheck + }() + } cmd.SysProcAttr = LlamaServerSysProcAttr // Always filter down the set of GPUs in case there are any unsupported devices that might crash diff --git a/ml/backend.go b/ml/backend.go index bf390c01..b07039e2 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -106,6 +106,11 @@ type Context interface { Arange(start, stop, step float32, dtype DType) Tensor Forward(...Tensor) Context + + // SetBatchSize provides a hint on the batch size to optimize processing + // Uses heuristics if not set + SetBatchSize(int) + Compute(...Tensor) ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 33401c30..eb02c3b1 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -11,6 +11,7 @@ package ggml import "C" import ( + "cmp" "context" "encoding/binary" "errors" @@ -385,7 +386,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { C.int(len(schedBackends)), C.size_t(maxGraphNodes), C._Bool(false), - C._Bool(false), + C._Bool(true), C._Bool(params.AllocMemory), ) @@ -748,6 +749,9 @@ type Context struct { ctx *C.struct_ggml_context graph *C.struct_ggml_cgraph + // batchSize is a hint to optimize processing + batchSize int + // buft is the buffer type used for new tensors buft C.ggml_backend_buffer_type_t @@ -804,6 +808,10 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context { return c } +func (c *Context) SetBatchSize(batchSize int) { + c.batchSize = batchSize +} + func (c *Context) Compute(tensors ...ml.Tensor) { c.ComputeWithNotify(nil, tensors...) } @@ -814,6 +822,11 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { if cb != nil { go cb() } + + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS { panic(fmt.Errorf("error computing ggml graph: %v", status)) } @@ -835,6 +848,10 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { } func (c *Context) Reserve() { + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph) slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched)) @@ -1231,6 +1248,11 @@ func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor { } } +// Mulmat performs matrix multiplication between two tensors. +// If t has shape [m, p, ...] and t2 has shape [m, n, ...], +// Mulmat returns a new Tensor with shape [p, n, ...]. +// +// Note: this is similar to matmul(t2, t.tranpose(-1, -2)) in other libraries. 
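+//
+// For example: if t has shape [4096, 11008] and t2 has shape [4096, 8], then
+// t.Mulmat(ctx, t2) has shape [11008, 8].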
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ b: t.b, @@ -1303,14 +1325,21 @@ func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor { } } -func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor { - if len(shape) != 4 { - panic("expected 4 dimensions") +// Permute permutes t according to order. Permute panics if the number of dimensions +// in order does not match the number of dimensions in t. +func (t *Tensor) Permute(ctx ml.Context, order ...int) ml.Tensor { + if len(order) != len(t.Shape()) && len(order) != 4 { + panic("invalid number of dimensions for permute") + } + + // ggml_permute requires 4 dimensions so fill in the rest + for i := len(order); i < 4; i++ { + order = append(order, i) } return &Tensor{ b: t.b, - t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), + t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(order[0]), C.int(order[1]), C.int(order[2]), C.int(order[3])), } } @@ -1478,14 +1507,7 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor { // Default options - opts := rope.Options{ - Factors: &Tensor{}, - OriginalContextLength: 131072, - ExtrapolationFactor: 0., - AttentionFactor: 1., - BetaFast: 32., - BetaSlow: 1., - } + opts := rope.Options{Factors: &Tensor{}} // Apply any provided options for _, option := range options { @@ -1497,24 +1519,44 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32) } - return &Tensor{ - b: t.b, - t: C.ggml_rope_ext( + var tt *C.struct_ggml_tensor + if len(opts.MRoPE.Sections) > 0 { + mropeSections := make([]C.int32_t, 4) + for i, section := range opts.MRoPE.Sections { + mropeSections[i] = C.int32_t(section) + } + + tt = C.ggml_rope_multi( ctx.(*Context).ctx, dequant, positions.(*Tensor).t, opts.Factors.(*Tensor).t, C.int(ropeDim), + unsafe.SliceData(mropeSections), C.int(opts.Type), - C.int(opts.OriginalContextLength), - C.float(ropeBase), - C.float(ropeScale), - C.float(opts.ExtrapolationFactor), - C.float(opts.AttentionFactor), - C.float(opts.BetaFast), - C.float(opts.BetaSlow), - ), + cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10), + C.float(ropeBase), C.float(ropeScale), + C.float(opts.YaRN.ExtrapolationFactor), + cmp.Or(C.float(opts.YaRN.AttentionFactor), 1), + cmp.Or(C.float(opts.YaRN.BetaFast), 32), + cmp.Or(C.float(opts.YaRN.BetaSlow), 1), + ) + } else { + tt = C.ggml_rope_ext( + ctx.(*Context).ctx, + dequant, + positions.(*Tensor).t, + opts.Factors.(*Tensor).t, + C.int(ropeDim), C.int(opts.Type), + cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10), + C.float(ropeBase), C.float(ropeScale), + C.float(opts.YaRN.ExtrapolationFactor), + cmp.Or(C.float(opts.YaRN.AttentionFactor), 1), + cmp.Or(C.float(opts.YaRN.BetaFast), 32), + cmp.Or(C.float(opts.YaRN.BetaSlow), 1), + ) } + return &Tensor{b: t.b, t: tt} } func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor { diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 80983524..1cab4bb3 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -98,7 +98,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, 
ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // NOTE: will be removed, use device version instead GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -317,6 +317,9 @@ extern "C" { GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) + GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); + // Initialize backend buffers from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index 43c91d9f..21b35ac5 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -112,8 +112,8 @@ extern "C" { // compute the graph with the plan enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph (always async if supported by the backend) - enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // compute graph (always async if supported by the backend). 
batch_size may be -1 if unknown + enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // (optional) event synchronization // record an event on this stream diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 0b757af5..9b0a9b91 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -368,14 +368,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba } enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); ggml_backend_synchronize(backend); return err; } -enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { GGML_ASSERT(backend); - return backend->iface.graph_compute(backend, cgraph); + return backend->iface.graph_compute(backend, cgraph, batch_size); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -750,6 +750,8 @@ struct ggml_backend_sched { bool op_offload; + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics + int debug; // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation @@ -848,7 +850,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1584,7 +1586,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1606,7 +1608,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1698,6 +1700,7 @@ ggml_backend_sched_t ggml_backend_sched_new_ext( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); @@ -1734,6 +1737,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched); } +void 
ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { + sched->batch_size = batch_size; +} + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { GGML_ASSERT(sched); // reset state for the next run diff --git a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp index 2a9ff7f6..6a38a51a 100644 --- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp @@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { delete backend; } -static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); + GGML_UNUSED(batch_size); } static struct ggml_backend_i blas_backend_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 3191faaa..32f14c81 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } -static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); @@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; return ggml_graph_compute(cgraph, &cplan); + + GGML_UNUSED(batch_size); } static const struct ggml_backend_i ggml_backend_cpu_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 31478dd8..4d1ed207 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init( } float theta = theta_t; - if (sector >= sections[0] && sector < sec_w) { + if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) { theta = theta_h; } - else if (sector >= sec_w && sector < sec_w + sections[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) { theta = theta_w; } - else if (sector >= sec_w + sections[2]) { - theta = theta_e; - } rope_yarn( theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index d62f412d..e9b73147 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2775,31 +2775,19 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { #ifdef USE_CUDA_GRAPH static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - bool use_cuda_graph) { + int batch_size, bool 
use_cuda_graph) { // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - // This fix was added in llama.cpp and Ollama in parallel, but with - // different tensor names. - // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 - // ollama: https://github.com/ollama/ollama/pull/11525 - - const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; - const std::string gemma3n_node_name_ollama = "node_"; - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; - - const std::string ffn_moe_bias_suffix = "_exps.bias"; - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2821,30 +2809,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && - // ollama - // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n - // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here - !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && - node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && - // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && - strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && - strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { - // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation - // by means of matching node names. See - // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and - // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, - // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
- use_cuda_graph = false; + // If we have an explicit batch size hint then we don't need to use the tensor name heuristics + if (batch_size >= 0) { + if (batch_size > 1) { + use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); #endif + } + } else { + if (node->op == GGML_OP_ADD && + node->src[1] && node->src[1]->ne[1] > 1 && + (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && + (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && + strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && + strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && + strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { + // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation + // by means of matching node names. See + // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and + // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); +#endif + } } if (node->op == GGML_OP_CPY) { @@ -3247,7 +3239,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; cuda_ctx->pool_set_alloc(true); @@ -3286,7 +3278,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (use_cuda_graph) { cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
if (use_cuda_graph && cuda_graph_update_required) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu index d058504c..287fe9d2 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu @@ -151,19 +151,13 @@ static __global__ void rope_multi( const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; - float theta_base = 0.0; - if (sector < sections.v[0]) { - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sections.v[0] && sector < sec_w) { + float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); } - else if (sector >= sec_w && sector < sec_w + sections.v[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); } - else if (sector >= sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); - } const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 9c0e0c56..f342872d 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -6523,15 +6523,11 @@ kernel void kernel_rope_multi( const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 const int sector = ic % sect_dims; - float theta_base; - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { + float theta_base = (float) pos[i2]; + if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { theta_base = (float) pos[i2 + args.ne02]; - } else if (sector < sec_w012) { + } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; } // end of mrope diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp index f356e4a0..032dee76 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp @@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml GGML_UNUSED(dst); } -static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_metal_t ctx = (ggml_metal_t)backend->context; return ggml_metal_graph_compute(ctx, cgraph); + + GGML_UNUSED(batch_size); } static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 375a0c7f..9866c96b 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi( const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 const int sector = ic % sect_dims; - float theta_base; - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { + float theta_base = (float) pos[i2]; + if 
(sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { theta_base = (float) pos[i2 + args.ne02]; - } else if (sector < sec_w012) { + } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; } // end of mrope diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0bbcecd0..cc68e796 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12039,7 +12039,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru return num_adds; } -static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -12235,6 +12235,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg return GGML_STATUS_SUCCESS; UNUSED(backend); + UNUSED(batch_size); } // Sort the graph for improved parallelism. diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp index 111286b4..633dc20f 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -31,19 +31,13 @@ void main() { const int sec_w = p.sections[1] + p.sections[0]; const uint sector = (i0 / 2) % sect_dims; - float theta_base = 0.0; - if (sector < p.sections[0]) { - theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); - } - else if (sector >= p.sections[0] && sector < sec_w) { + float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) { theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); } - else if (sector >= sec_w && sector < sec_w + p.sections[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) { theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); } - else if (sector >= sec_w + p.sections[2]) { - theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); - } const float freq_factor = p.has_ff != 0 ? 
data_ff[i0/2] : 1.0f; diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go index 4717ea90..31dfdb7b 100644 --- a/ml/backend/ggml/ggml_test.go +++ b/ml/backend/ggml/ggml_test.go @@ -23,7 +23,7 @@ func setup(tb testing.TB) ml.Context { tb.Fatal(err) } - b, err := ml.NewBackend(f.Name(), ml.BackendParams{}) + b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true}) if err != nil { tb.Fatal(err) } @@ -124,3 +124,254 @@ func TestInferShape(t *testing.T) { }) } } + +func EquateTensors(ctx ml.Context) cmp.Option { + return cmp.Comparer(func(x, y ml.Tensor) bool { + ctx.Forward(x, y).Compute(x, y) + return cmp.Equal(x.Shape(), y.Shape()) && + cmp.Equal(x.DType(), y.DType()) && + cmp.Equal(x.Bytes(), y.Bytes()) + }) +} + +func TestMulmat(t *testing.T) { + cases := []struct { + name string + a, b, c func(ml.Context) ml.Tensor + }{ + { + name: "vector x vector", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3, 1, ml.DTypeF32) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{5}, 1) + }, + }, + { + name: "vector x matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 4, 3) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, + }, 1, 3) + }, + }, + { + name: "broadcast vector x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 10*3*4, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, + 86, 110, 134, + 158, 182, 206, + 230, 254, 278, + 302, 326, 350, + 374, 398, 422, + 446, 470, 494, + 518, 542, 566, + 590, 614, 638, + 662, 686, 710, + }, 1, 3, 10) + }, + }, + { + name: "batched matrix x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*5*10, 1, ml.DTypeF32).Reshape(ctx, 4, 5, 10) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*3*10, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, 86, 110, + 38, 126, 214, 302, 390, + 62, 214, 366, 518, 670, + + 1166, 1382, 1598, 1814, 2030, + 1510, 1790, 2070, 2350, 2630, + 1854, 2198, 2542, 2886, 3230, + + 4238, 4646, 5054, 5462, 5870, + 4902, 5374, 5846, 6318, 6790, + 5566, 6102, 6638, 7174, 7710, + + 9230, 9830, 10430, 11030, 11630, + 10214, 10878, 11542, 12206, 12870, + 11198, 11926, 12654, 13382, 14110, + + 16142, 16934, 17726, 18518, 19310, + 17446, 18302, 19158, 20014, 20870, + 18750, 19670, 20590, 21510, 22430, + + 24974, 25958, 26942, 27926, 28910, + 26598, 27646, 28694, 29742, 30790, + 28222, 29334, 30446, 31558, 32670, + + 35726, 36902, 38078, 39254, 40430, + 37670, 38910, 40150, 41390, 42630, + 39614, 40918, 42222, 43526, 44830, + + 48398, 49766, 51134, 52502, 53870, + 50662, 52094, 53526, 54958, 56390, + 52926, 54422, 55918, 57414, 58910, + + 62990, 64550, 66110, 67670, 69230, + 65574, 67198, 68822, 70446, 72070, + 68158, 69846, 71534, 73222, 74910, + + 79502, 81254, 83006, 84758, 86510, + 82406, 84222, 86038, 87854, 89670, + 85310, 87190, 89070, 90950, 92830, + }, 5, 3, 10) + }, + }, + { + name: "broadcast matrix x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*5, 1, 
ml.DTypeF32).Reshape(ctx, 4, 5) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*3*10, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, 86, 110, + 38, 126, 214, 302, 390, + 62, 214, 366, 518, 670, + + 86, 302, 518, 734, 950, + 110, 390, 670, 950, 1230, + 134, 478, 822, 1166, 1510, + + 158, 566, 974, 1382, 1790, + 182, 654, 1126, 1598, 2070, + 206, 742, 1278, 1814, 2350, + + 230, 830, 1430, 2030, 2630, + 254, 918, 1582, 2246, 2910, + 278, 1006, 1734, 2462, 3190, + + 302, 1094, 1886, 2678, 3470, + 326, 1182, 2038, 2894, 3750, + 350, 1270, 2190, 3110, 4030, + + 374, 1358, 2342, 3326, 4310, + 398, 1446, 2494, 3542, 4590, + 422, 1534, 2646, 3758, 4870, + + 446, 1622, 2798, 3974, 5150, + 470, 1710, 2950, 4190, 5430, + 494, 1798, 3102, 4406, 5710, + + 518, 1886, 3254, 4622, 5990, + 542, 1974, 3406, 4838, 6270, + 566, 2062, 3558, 5054, 6550, + + 590, 2150, 3710, 5270, 6830, + 614, 2238, 3862, 5486, 7110, + 638, 2326, 4014, 5702, 7390, + + 662, 2414, 4166, 5918, 7670, + 686, 2502, 4318, 6134, 7950, + 710, 2590, 4470, 6350, 8230, + }, 5, 3, 10) + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + ctx := setup(t) + a, b := tt.a(ctx), tt.b(ctx) + c := a.Mulmat(ctx, b) + if diff := cmp.Diff(tt.c(ctx), c, EquateTensors(ctx)); diff != "" { + t.Errorf("MulMat() result mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestPermute(t *testing.T) { + cases := []struct { + name string + input func(ml.Context) ml.Tensor + shape []int + want func(ml.Context) ml.Tensor + }{ + { + name: "transpose", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3*2, 1, ml.DTypeF32).Reshape(ctx, 3, 2) + }, + shape: []int{1, 0, 2, 3}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 3, + 1, 4, + 2, 5, + }, 2, 3) + }, + }, + { + name: "transpose fill dims", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3*2, 1, ml.DTypeF32).Reshape(ctx, 3, 2) + }, + shape: []int{1, 0}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 3, + 1, 4, + 2, 5, + }, 2, 3) + }, + }, + { + name: "permute 3d", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 5*3*2, 1, ml.DTypeF32).Reshape(ctx, 2, 3, 5) + }, + shape: []int{2, 0, 1, 3}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 2, 4, + 6, 8, 10, + 12, 14, 16, + 18, 20, 22, + 24, 26, 28, + + 1, 3, 5, + 7, 9, 11, + 13, 15, 17, + 19, 21, 23, + 25, 27, 29, + }, 3, 5, 2) + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + ctx := setup(t) + got := tt.input(ctx).Permute(ctx, tt.shape...).Contiguous(ctx) + if diff := cmp.Diff(tt.want(ctx), got, EquateTensors(ctx)); diff != "" { + t.Errorf("Permute() result mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/ml/nn/rope/rope.go b/ml/nn/rope/rope.go index 57dd2252..bca8058d 100644 --- a/ml/nn/rope/rope.go +++ b/ml/nn/rope/rope.go @@ -4,21 +4,21 @@ import "github.com/ollama/ollama/ml" // Options contains optional parameters for RoPE function type Options struct { - Type int - Factors ml.Tensor - OriginalContextLength int + Type int + Factors ml.Tensor // YaRN options - ExtrapolationFactor, - AttentionFactor, - BetaFast, - BetaSlow float32 -} + YaRN struct { + OriginalContextLength int + ExtrapolationFactor, + AttentionFactor, + BetaFast, + BetaSlow float32 + } -// WithOriginalContextLength sets a custom context length -func 
WithOriginalContextLength(n int) func(*Options) { - return func(opts *Options) { - opts.OriginalContextLength = n + // MRoPE options + MRoPE struct { + Sections []int } } @@ -38,14 +38,28 @@ func WithFactors(factors ml.Tensor) func(*Options) { } } +// WithOriginalContextLength sets a custom context length +func WithOriginalContextLength(n int) func(*Options) { + return func(opts *Options) { + opts.YaRN.OriginalContextLength = n + } +} + func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) { return func(opts *Options) { - opts.ExtrapolationFactor = extrapolationFactor + opts.YaRN.ExtrapolationFactor = extrapolationFactor } } func WithAttentionFactor(attentionFactor float32) func(*Options) { return func(opts *Options) { - opts.AttentionFactor = attentionFactor + opts.YaRN.AttentionFactor = attentionFactor + } +} + +func WithMRoPESections(sections []int) func(*Options) { + return func(opts *Options) { + opts.Type |= 1 << 3 + opts.MRoPE.Sections = sections } } diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go index dc91bdea..ce5ded29 100644 --- a/model/models/qwen25vl/process_image.go +++ b/model/models/qwen25vl/process_image.go @@ -79,6 +79,8 @@ type Grid struct { } func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) { + img = imageproc.Composite(img) + origWidth := img.Bounds().Dx() origHeight := img.Bounds().Dy() diff --git a/model/models/qwen3vl/imageprocessor.go b/model/models/qwen3vl/imageprocessor.go index 621167f5..2453a87d 100644 --- a/model/models/qwen3vl/imageprocessor.go +++ b/model/models/qwen3vl/imageprocessor.go @@ -83,6 +83,8 @@ type Grid struct { } func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) { + img = imageproc.Composite(img) + origWidth := img.Bounds().Dx() origHeight := img.Bounds().Dy() diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go index 08beb37c..579863ae 100644 --- a/model/models/qwen3vl/model.go +++ b/model/models/qwen3vl/model.go @@ -112,7 +112,8 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions))) + // ggml mrope requires 4 positions per token: [time, height, width, extra] + positionSlice := slices.Collect(makeSlice2D[int32](4, len(batch.Positions))) for i, id := range batch.Positions { if id < int32(len(m.positionCache)) { id = m.positionCache[id] @@ -123,6 +124,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { positionSlice[0][i] = id positionSlice[1][i] = id positionSlice[2][i] = id + // positionSlice[3] is intentionally left as zeros } hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx) @@ -147,8 +149,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } } - positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice)) - cos, sin := m.rotaryEmbedding(ctx, positions) + positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice)) for i, layer := range m.TextModel.Layers { if m.Cache != nil { m.Cache.SetLayer(i) @@ -159,7 +160,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { outputs = batch.Outputs } - hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, 
m.Options) + hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options) if i < len(deepstackVisualEmbeds) { hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i]) } @@ -191,9 +192,10 @@ func New(c fs.Config) (model.Model, error) { ImageProcessor: newImageProcessor(c), } - m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) { + m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) { m.positionCache = nil - return nil, kvcache.ErrNotSupported + positions = positions.Repeat(ctx, 1, 4).Reshape(ctx, -1) + return m.Options.applyRotaryPositionalEmbedding(ctx, key, positions), nil }) return &m, nil } diff --git a/model/models/qwen3vl/model_text.go b/model/models/qwen3vl/model_text.go index 14e7d7dc..f5767f65 100644 --- a/model/models/qwen3vl/model_text.go +++ b/model/models/qwen3vl/model_text.go @@ -10,6 +10,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" ) @@ -27,14 +29,18 @@ type TextOptions struct { numExperts, numExpertsUsed int normTopKProb bool - - inverseFrequenciesCache []float32 } func (o TextOptions) headDim() int { return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) } +func (o TextOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor { + return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/float32(math.Sqrt(float64(o.ropeScale))), + rope.WithMRoPESections(o.mropeSections), + ) +} + type TextAttention struct { Query *nn.Linear `gguf:"attn_q"` QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` @@ -44,7 +50,7 @@ type TextAttention struct { Output *nn.Linear `gguf:"attn_output"` } -func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { +func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { batchSize := hiddenStates.Dim(1) query := sa.Query.Forward(ctx, hiddenStates) @@ -58,8 +64,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tenso query = sa.QueryNorm.Forward(ctx, query, opts.eps) key = sa.KeyNorm.Forward(ctx, key, opts.eps) - query = applyRotaryPositionalEmbedding(ctx, query, cos, sin) - key = applyRotaryPositionalEmbedding(ctx, key, cos, sin) + query = opts.applyRotaryPositionalEmbedding(ctx, query, positions) + key = opts.applyRotaryPositionalEmbedding(ctx, key, positions) attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache) attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize) @@ -125,10 +131,10 @@ type TextLayer struct { TextMLP } -func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { +func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { residual := hiddenStates hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) - hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts) + hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, positions, cache, opts) if outputs != nil { hiddenStates = hiddenStates.Rows(ctx, outputs) @@ -153,42 +159,6 @@ type TextModel struct 
{ Options *TextOptions } -func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) { - positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1)) - if len(m.Options.inverseFrequenciesCache) == 0 { - m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2) - for i := range m.Options.inverseFrequenciesCache { - frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim()))) - m.Options.inverseFrequenciesCache[i] = 1 / frequency - } - } - - inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache)) - - positions = positions.Cast(ctx, ml.DTypeF32) - frequencies := inverseFrequencies.Mulmat(ctx, positions) - - interleaved := frequencies.View(ctx, - 0, frequencies.Dim(0), - frequencies.Stride(1), frequencies.Dim(1), - ) - - for _, i := range []int{1, 2} { - args := []int{ - i * frequencies.Stride(0), 1, - 3 * frequencies.Stride(0), m.Options.mropeSections[i], - frequencies.Stride(1), frequencies.Dim(1), - } - - ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...). - Copy(ctx, interleaved.View(ctx, args[0], args[1:]...))) - } - - interleaved = interleaved.Concat(ctx, interleaved, 0) - interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2)) - return interleaved.Cos(ctx), interleaved.Sin(ctx) -} - var _ model.Model = (*Model)(nil) func newTextModel(c fs.Config) *TextModel { diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index 78ceb771..6af89021 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -86,6 +86,9 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten computeCtx.Forward(tensors...) entry.data = make([][]float32, len(entry.mm)) + // Multimodal processing is computationally intensive, so treat it similarly to a large batch + computeCtx.SetBatchSize(512) + if !reserve { computeCtx.Compute(tensors...) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 153a3e57..3e8c1e22 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -598,6 +598,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er // Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs)) batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs)) + nextBatch.ctx.SetBatchSize(len(batchInputs)) nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch) if err != nil { err = fmt.Errorf("failed to build graph: %w", err) @@ -1009,12 +1010,17 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { } } -func (s *Server) reserveWorstCaseGraph() error { +func (s *Server) reserveWorstCaseGraph(prompt bool) error { ctx := s.model.Backend().NewContext() defer ctx.Close() var err error - inputs := make([]*input.Input, s.batchSize) + batchSize := 1 + if prompt { + batchSize = s.batchSize + } + + inputs := make([]*input.Input, batchSize) for i := range inputs { inputs[i] = &input.Input{} } @@ -1031,7 +1037,7 @@ func (s *Server) reserveWorstCaseGraph() error { // - The result may now be larger than a batch (images may not fit in a // single batch), so trim based on what will fit and must be grouped together. // - Fill out the rest of the space with text tokens. 
-	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
+	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); prompt && ok {
		mmCtx := s.model.Backend().NewContext()
		defer mmCtx.Close()
@@ -1058,10 +1064,10 @@ func (s *Server) reserveWorstCaseGraph() error {
		}
	}
-	if len(inputs) < s.batchSize {
-		newInputs := make([]*input.Input, s.batchSize)
+	if len(inputs) < batchSize {
+		newInputs := make([]*input.Input, batchSize)
		copy(newInputs, inputs)
-	for i := len(inputs); i < s.batchSize; i++ {
+	for i := len(inputs); i < batchSize; i++ {
			newInputs[i] = &input.Input{}
		}
		inputs = newInputs
@@ -1103,6 +1109,7 @@ func (s *Server) reserveWorstCaseGraph() error {
		return err
	}
+	ctx.SetBatchSize(batchSize)
	ctx.Forward(t).Reserve()

	return nil
@@ -1160,7 +1167,12 @@ func (s *Server) allocModel(
	s.seqs = make([]*Sequence, s.parallel)
	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-	return s.reserveWorstCaseGraph()
+	err = s.reserveWorstCaseGraph(true)
+	if err != nil {
+		return err
+	}
+
+	return s.reserveWorstCaseGraph(false)
 }

// closeModel frees all memory associated with a model
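
The common thread in the hunks above is the new batch-size hint: the runner already knows how many tokens it is about to evaluate, so it passes that number down (ctx.SetBatchSize on the Go side, ggml_backend_sched_set_batch_size and the extra graph_compute argument on the ggml side) instead of letting each backend guess from tensor names and shapes. The scheduler only considers op offload when the hint is -1 (unknown) or at least 32 tokens, and the CUDA backend disables CUDA graph capture whenever an explicit hint is greater than 1, falling back to the old name-matching heuristics only when no hint was given. A minimal caller-side sketch in C, assuming ggml_backend_sched_set_batch_size() is also declared in ggml-backend.h (that hunk is not part of this excerpt):

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper, not part of the patch: evaluate a graph on an existing
// scheduler while telling the backends how many tokens the batch contains.
static enum ggml_status compute_with_batch_hint(ggml_backend_sched_t sched,
                                                struct ggml_cgraph * gf,
                                                int n_tokens) {
    // An explicit hint > 1 disables CUDA graph capture for this evaluation;
    // a hint of -1 (the default set in ggml_backend_sched_new_ext) falls back
    // to the per-backend heuristics, and the scheduler keeps op offload
    // enabled only for hints >= 32 or for unknown batch sizes.
    ggml_backend_sched_set_batch_size(sched, n_tokens);
    return ggml_backend_sched_graph_compute(sched, gf);
}

Because the default hint is -1, callers that never set it keep the pre-patch behaviour; only the runner paths shown above (forwardBatch, multimodal encoding, and worst-case graph reservation) opt in.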