From cbe1cf06c44c6ff0901851c04b6bd8650ece7c5f Mon Sep 17 00:00:00 2001 From: Athiban Sharon <135643455+athshh@users.noreply.github.com> Date: Thu, 30 Oct 2025 17:14:39 +0000 Subject: [PATCH 01/12] Update README.md (#12822) Fixed broken docs links --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6abda757..e8846128 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Get up and running with large language models. curl -fsSL https://ollama.com/install.sh | sh ``` -[Manual install instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md) +[Manual install instructions](https://docs.ollama.com/linux#manual-install) ### Docker @@ -110,7 +110,7 @@ Ollama supports importing GGUF models in the Modelfile: ### Import from Safetensors -See the [guide](docs/import.md) on importing models for more information. +See the [guide](https://docs.ollama.com/import) on importing models for more information. ### Customize a prompt @@ -143,7 +143,7 @@ ollama run mario Hello! It's your friend Mario. ``` -For more information on working with a Modelfile, see the [Modelfile](docs/modelfile.md) documentation. +For more information on working with a Modelfile, see the [Modelfile](https://docs.ollama.com/modelfile) documentation. ## CLI Reference From 06b3422d5f973ba9026333dc49f312d5d649629d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 30 Oct 2025 10:32:45 -0700 Subject: [PATCH 02/12] tests: add tests and docs for commonly used ops (#12844) * mulmat * permute --- ml/backend/ggml/ggml.go | 20 ++- ml/backend/ggml/ggml_test.go | 253 ++++++++++++++++++++++++++++++++++- 2 files changed, 268 insertions(+), 5 deletions(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 33401c30..38b18b3e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -1231,6 +1231,11 @@ func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor { } } +// Mulmat performs matrix multiplication between two tensors. +// If t has shape [m, p, ...] and t2 has shape [m, n, ...], +// Mulmat returns a new Tensor with shape [p, n, ...]. +// +// Note: this is similar to matmul(t2, t.tranpose(-1, -2)) in other libraries. func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ b: t.b, @@ -1303,14 +1308,21 @@ func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor { } } -func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor { - if len(shape) != 4 { - panic("expected 4 dimensions") +// Permute permutes t according to order. Permute panics if the number of dimensions +// in order does not match the number of dimensions in t. 
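+// For example, for a tensor t with shape [3, 2], t.Permute(ctx, 1, 0) returns
+// its transpose with shape [2, 3]; trailing dimensions omitted from order are
+// filled in unchanged (see the Permute tests added below).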
+func (t *Tensor) Permute(ctx ml.Context, order ...int) ml.Tensor { + if len(order) != len(t.Shape()) && len(order) != 4 { + panic("invalid number of dimensions for permute") + } + + // ggml_permute requires 4 dimensions so fill in the rest + for i := len(order); i < 4; i++ { + order = append(order, i) } return &Tensor{ b: t.b, - t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), + t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(order[0]), C.int(order[1]), C.int(order[2]), C.int(order[3])), } } diff --git a/ml/backend/ggml/ggml_test.go b/ml/backend/ggml/ggml_test.go index 4717ea90..31dfdb7b 100644 --- a/ml/backend/ggml/ggml_test.go +++ b/ml/backend/ggml/ggml_test.go @@ -23,7 +23,7 @@ func setup(tb testing.TB) ml.Context { tb.Fatal(err) } - b, err := ml.NewBackend(f.Name(), ml.BackendParams{}) + b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true}) if err != nil { tb.Fatal(err) } @@ -124,3 +124,254 @@ func TestInferShape(t *testing.T) { }) } } + +func EquateTensors(ctx ml.Context) cmp.Option { + return cmp.Comparer(func(x, y ml.Tensor) bool { + ctx.Forward(x, y).Compute(x, y) + return cmp.Equal(x.Shape(), y.Shape()) && + cmp.Equal(x.DType(), y.DType()) && + cmp.Equal(x.Bytes(), y.Bytes()) + }) +} + +func TestMulmat(t *testing.T) { + cases := []struct { + name string + a, b, c func(ml.Context) ml.Tensor + }{ + { + name: "vector x vector", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3, 1, ml.DTypeF32) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{5}, 1) + }, + }, + { + name: "vector x matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 12, 1, ml.DTypeF32).Reshape(ctx, 4, 3) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, + }, 1, 3) + }, + }, + { + name: "broadcast vector x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4, 1, ml.DTypeF32) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 10*3*4, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, + 86, 110, 134, + 158, 182, 206, + 230, 254, 278, + 302, 326, 350, + 374, 398, 422, + 446, 470, 494, + 518, 542, 566, + 590, 614, 638, + 662, 686, 710, + }, 1, 3, 10) + }, + }, + { + name: "batched matrix x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*5*10, 1, ml.DTypeF32).Reshape(ctx, 4, 5, 10) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*3*10, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, 86, 110, + 38, 126, 214, 302, 390, + 62, 214, 366, 518, 670, + + 1166, 1382, 1598, 1814, 2030, + 1510, 1790, 2070, 2350, 2630, + 1854, 2198, 2542, 2886, 3230, + + 4238, 4646, 5054, 5462, 5870, + 4902, 5374, 5846, 6318, 6790, + 5566, 6102, 6638, 7174, 7710, + + 9230, 9830, 10430, 11030, 11630, + 10214, 10878, 11542, 12206, 12870, + 11198, 11926, 12654, 13382, 14110, + + 16142, 16934, 17726, 18518, 19310, + 17446, 18302, 19158, 20014, 20870, + 18750, 19670, 20590, 21510, 22430, + + 24974, 25958, 26942, 27926, 28910, + 26598, 27646, 28694, 29742, 30790, + 28222, 29334, 30446, 31558, 32670, + + 35726, 36902, 38078, 39254, 40430, + 37670, 38910, 40150, 
41390, 42630, + 39614, 40918, 42222, 43526, 44830, + + 48398, 49766, 51134, 52502, 53870, + 50662, 52094, 53526, 54958, 56390, + 52926, 54422, 55918, 57414, 58910, + + 62990, 64550, 66110, 67670, 69230, + 65574, 67198, 68822, 70446, 72070, + 68158, 69846, 71534, 73222, 74910, + + 79502, 81254, 83006, 84758, 86510, + 82406, 84222, 86038, 87854, 89670, + 85310, 87190, 89070, 90950, 92830, + }, 5, 3, 10) + }, + }, + { + name: "broadcast matrix x batched matrix", + a: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*5, 1, ml.DTypeF32).Reshape(ctx, 4, 5) + }, + b: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 4*3*10, 1, ml.DTypeF32).Reshape(ctx, 4, 3, 10) + }, + c: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 14, 38, 62, 86, 110, + 38, 126, 214, 302, 390, + 62, 214, 366, 518, 670, + + 86, 302, 518, 734, 950, + 110, 390, 670, 950, 1230, + 134, 478, 822, 1166, 1510, + + 158, 566, 974, 1382, 1790, + 182, 654, 1126, 1598, 2070, + 206, 742, 1278, 1814, 2350, + + 230, 830, 1430, 2030, 2630, + 254, 918, 1582, 2246, 2910, + 278, 1006, 1734, 2462, 3190, + + 302, 1094, 1886, 2678, 3470, + 326, 1182, 2038, 2894, 3750, + 350, 1270, 2190, 3110, 4030, + + 374, 1358, 2342, 3326, 4310, + 398, 1446, 2494, 3542, 4590, + 422, 1534, 2646, 3758, 4870, + + 446, 1622, 2798, 3974, 5150, + 470, 1710, 2950, 4190, 5430, + 494, 1798, 3102, 4406, 5710, + + 518, 1886, 3254, 4622, 5990, + 542, 1974, 3406, 4838, 6270, + 566, 2062, 3558, 5054, 6550, + + 590, 2150, 3710, 5270, 6830, + 614, 2238, 3862, 5486, 7110, + 638, 2326, 4014, 5702, 7390, + + 662, 2414, 4166, 5918, 7670, + 686, 2502, 4318, 6134, 7950, + 710, 2590, 4470, 6350, 8230, + }, 5, 3, 10) + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + ctx := setup(t) + a, b := tt.a(ctx), tt.b(ctx) + c := a.Mulmat(ctx, b) + if diff := cmp.Diff(tt.c(ctx), c, EquateTensors(ctx)); diff != "" { + t.Errorf("MulMat() result mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestPermute(t *testing.T) { + cases := []struct { + name string + input func(ml.Context) ml.Tensor + shape []int + want func(ml.Context) ml.Tensor + }{ + { + name: "transpose", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3*2, 1, ml.DTypeF32).Reshape(ctx, 3, 2) + }, + shape: []int{1, 0, 2, 3}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 3, + 1, 4, + 2, 5, + }, 2, 3) + }, + }, + { + name: "transpose fill dims", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 3*2, 1, ml.DTypeF32).Reshape(ctx, 3, 2) + }, + shape: []int{1, 0}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 3, + 1, 4, + 2, 5, + }, 2, 3) + }, + }, + { + name: "permute 3d", + input: func(ctx ml.Context) ml.Tensor { + return ctx.Arange(0, 5*3*2, 1, ml.DTypeF32).Reshape(ctx, 2, 3, 5) + }, + shape: []int{2, 0, 1, 3}, + want: func(ctx ml.Context) ml.Tensor { + return ctx.FromFloats([]float32{ + 0, 2, 4, + 6, 8, 10, + 12, 14, 16, + 18, 20, 22, + 24, 26, 28, + + 1, 3, 5, + 7, 9, 11, + 13, 15, 17, + 19, 21, 23, + 25, 27, 29, + }, 3, 5, 2) + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + ctx := setup(t) + got := tt.input(ctx).Permute(ctx, tt.shape...).Contiguous(ctx) + if diff := cmp.Diff(tt.want(ctx), got, EquateTensors(ctx)); diff != "" { + t.Errorf("Permute() result mismatch (-want +got):\n%s", diff) + } + }) + } +} From d432ade714156098eac42f633a3911a3cf09dd41 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 30 Oct 2025 
10:33:19 -0700 Subject: [PATCH 03/12] fix: qwen2.5vl, qwen3vl composite image (#12841) this change fixes images with an alpha channel by overlaying the image onto a white background --- model/models/qwen25vl/process_image.go | 2 ++ model/models/qwen3vl/imageprocessor.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go index dc91bdea..ce5ded29 100644 --- a/model/models/qwen25vl/process_image.go +++ b/model/models/qwen25vl/process_image.go @@ -79,6 +79,8 @@ type Grid struct { } func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) { + img = imageproc.Composite(img) + origWidth := img.Bounds().Dx() origHeight := img.Bounds().Dy() diff --git a/model/models/qwen3vl/imageprocessor.go b/model/models/qwen3vl/imageprocessor.go index 621167f5..2453a87d 100644 --- a/model/models/qwen3vl/imageprocessor.go +++ b/model/models/qwen3vl/imageprocessor.go @@ -83,6 +83,8 @@ type Grid struct { } func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) { + img = imageproc.Composite(img) + origWidth := img.Bounds().Dx() origHeight := img.Bounds().Dy() From ed78e127d0578ece5787bfbc6beb5191aa01edc6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 30 Oct 2025 10:41:49 -0700 Subject: [PATCH 04/12] fix(cmd): unload model before removal (#12832) this change fixes two bugs with `ollama rm`: 1. before a model is removed, it will first be stopped. this only happens for the first argument and skipped for all other models 2. models are unloaded indiscriminately. this errors for cloud models and should be omitted --- cmd/cmd.go | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 369a27a4..1050d7b0 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -280,6 +280,13 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error { return err } + if info, err := client.Show(cmd.Context(), &api.ShowRequest{Model: opts.Model}); err != nil { + return err + } else if info.RemoteHost != "" { + // Cloud model, no need to load/unload + return nil + } + req := &api.GenerateRequest{ Model: opts.Model, KeepAlive: opts.KeepAlive, @@ -720,23 +727,21 @@ func DeleteHandler(cmd *cobra.Command, args []string) error { return err } - // Unload the model if it's running before deletion - opts := &runOptions{ - Model: args[0], - KeepAlive: &api.Duration{Duration: 0}, - } - if err := loadOrUnloadModel(cmd, opts); err != nil { - if !strings.Contains(strings.ToLower(err.Error()), "not found") { - fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0]) + for _, arg := range args { + // Unload the model if it's running before deletion + if err := loadOrUnloadModel(cmd, &runOptions{ + Model: args[0], + KeepAlive: &api.Duration{Duration: 0}, + }); err != nil { + if !strings.Contains(strings.ToLower(err.Error()), "not found") { + fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0]) + } } - } - for _, name := range args { - req := api.DeleteRequest{Name: name} - if err := client.Delete(cmd.Context(), &req); err != nil { + if err := client.Delete(cmd.Context(), &api.DeleteRequest{Name: arg}); err != nil { return err } - fmt.Printf("deleted '%s'\n", name) + fmt.Printf("deleted '%s'\n", arg) } return nil } From 75e75d9afea9175f8bd1f0b5fe8ad6e9efbf86ee Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 30 Oct 2025 10:51:37 -0700 Subject: [PATCH 05/12] qwen3vl: enable flash attention by 
default (#12862) --- fs/ggml/ggml.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index c0ca068a..909104c6 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -895,8 +895,8 @@ func (f GGML) FlashAttention() bool { return slices.Contains([]string{ "gemma3", "gptoss", "gpt-oss", - "qwen3", - "qwen3moe", + "qwen3", "qwen3moe", + "qwen3vl", "qwen3vlmoe", }, f.KV().String("general.architecture")) } From f67a6df110cfa37157bb2f393f56b60c3eff3180 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 30 Oct 2025 11:29:00 -0700 Subject: [PATCH 06/12] interleaved mrope (#12807) * ml(ggml): mrope * interleave mrope --- .../patches/0032-interleave-multi-rope.patch | 113 ++++++++++++++++++ ml/backend/ggml/ggml.go | 52 +++++--- ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp | 7 +- ml/backend/ggml/ggml/src/ggml-cuda/rope.cu | 12 +- .../src/ggml-metal/ggml-metal-embed.metal | 10 +- .../ggml/ggml/src/ggml-metal/ggml-metal.metal | 10 +- .../vulkan-shaders/rope_multi.comp | 12 +- ml/nn/rope/rope.go | 42 ++++--- model/models/qwen3vl/model.go | 14 ++- model/models/qwen3vl/model_text.go | 56 ++------- 10 files changed, 209 insertions(+), 119 deletions(-) create mode 100644 llama/patches/0032-interleave-multi-rope.patch diff --git a/llama/patches/0032-interleave-multi-rope.patch b/llama/patches/0032-interleave-multi-rope.patch new file mode 100644 index 00000000..eb41639e --- /dev/null +++ b/llama/patches/0032-interleave-multi-rope.patch @@ -0,0 +1,113 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Web, 16 Oct 2025 20:37:19 -0700 +Subject: [PATCH] interleave multi rope + +since ollama doesn't use mrope for anything else, change it to mean the +interleaved version used for qwen3vl +--- + ggml/src/ggml-cpu/ops.cpp | 7 ++----- + ggml/src/ggml-cuda/rope.cu | 12 +++--------- + ggml/src/ggml-metal/ggml-metal.metal | 10 +++------- + ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp | 12 +++--------- + 4 files changed, 11 insertions(+), 30 deletions(-) + +diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp +index 31478dd8e..4d1ed207e 100644 +--- a/ggml/src/ggml-cpu/ops.cpp ++++ b/ggml/src/ggml-cpu/ops.cpp +@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init( + } + + float theta = theta_t; +- if (sector >= sections[0] && sector < sec_w) { ++ if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) { + theta = theta_h; + } +- else if (sector >= sec_w && sector < sec_w + sections[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) { + theta = theta_w; + } +- else if (sector >= sec_w + sections[2]) { +- theta = theta_e; +- } + + rope_yarn( + theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] +diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu +index d058504cd..287fe9d2c 100644 +--- a/ggml/src/ggml-cuda/rope.cu ++++ b/ggml/src/ggml-cuda/rope.cu +@@ -151,19 +151,13 @@ static __global__ void rope_multi( + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; + +- float theta_base = 0.0; +- if (sector < sections.v[0]) { +- theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); +- } +- else if (sector >= sections.v[0] && sector < sec_w) { ++ float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); ++ if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { + theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); + } +- else if (sector >= sec_w && sector < sec_w + sections.v[2]) 
{ ++ else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { + theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); + } +- else if (sector >= sec_w + sections.v[2]) { +- theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); +- } + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + +diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal +index 375a0c7fd..9866c96b4 100644 +--- a/ggml/src/ggml-metal/ggml-metal.metal ++++ b/ggml/src/ggml-metal/ggml-metal.metal +@@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi( + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + +- float theta_base; +- if (sector < args.sect_0) { +- theta_base = (float) pos[i2]; +- } else if (sector < sec_w01) { ++ float theta_base = (float) pos[i2]; ++ if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { + theta_base = (float) pos[i2 + args.ne02]; +- } else if (sector < sec_w012) { ++ } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { + theta_base = (float) pos[i2 + args.ne02 * 2]; +- } else { +- theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + +diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +index 111286b49..6fc2b42f8 100644 +--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp ++++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +@@ -31,19 +31,13 @@ void main() { + const int sec_w = p.sections[1] + p.sections[0]; + const uint sector = (i0 / 2) % sect_dims; + +- float theta_base = 0.0; +- if (sector < p.sections[0]) { +- theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); +- } +- else if (sector >= p.sections[0] && sector < sec_w) { ++ float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); ++ if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) { + theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); + } +- else if (sector >= sec_w && sector < sec_w + p.sections[2]) { ++ else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) { + theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + } +- else if (sector >= sec_w + p.sections[2]) { +- theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); +- } + + const float freq_factor = p.has_ff != 0 ? 
data_ff[i0/2] : 1.0f; + diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 38b18b3e..8c782d73 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -11,6 +11,7 @@ package ggml import "C" import ( + "cmp" "context" "encoding/binary" "errors" @@ -1490,14 +1491,7 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor { // Default options - opts := rope.Options{ - Factors: &Tensor{}, - OriginalContextLength: 131072, - ExtrapolationFactor: 0., - AttentionFactor: 1., - BetaFast: 32., - BetaSlow: 1., - } + opts := rope.Options{Factors: &Tensor{}} // Apply any provided options for _, option := range options { @@ -1509,24 +1503,44 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32) } - return &Tensor{ - b: t.b, - t: C.ggml_rope_ext( + var tt *C.struct_ggml_tensor + if len(opts.MRoPE.Sections) > 0 { + mropeSections := make([]C.int32_t, 4) + for i, section := range opts.MRoPE.Sections { + mropeSections[i] = C.int32_t(section) + } + + tt = C.ggml_rope_multi( ctx.(*Context).ctx, dequant, positions.(*Tensor).t, opts.Factors.(*Tensor).t, C.int(ropeDim), + unsafe.SliceData(mropeSections), C.int(opts.Type), - C.int(opts.OriginalContextLength), - C.float(ropeBase), - C.float(ropeScale), - C.float(opts.ExtrapolationFactor), - C.float(opts.AttentionFactor), - C.float(opts.BetaFast), - C.float(opts.BetaSlow), - ), + cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10), + C.float(ropeBase), C.float(ropeScale), + C.float(opts.YaRN.ExtrapolationFactor), + cmp.Or(C.float(opts.YaRN.AttentionFactor), 1), + cmp.Or(C.float(opts.YaRN.BetaFast), 32), + cmp.Or(C.float(opts.YaRN.BetaSlow), 1), + ) + } else { + tt = C.ggml_rope_ext( + ctx.(*Context).ctx, + dequant, + positions.(*Tensor).t, + opts.Factors.(*Tensor).t, + C.int(ropeDim), C.int(opts.Type), + cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10), + C.float(ropeBase), C.float(ropeScale), + C.float(opts.YaRN.ExtrapolationFactor), + cmp.Or(C.float(opts.YaRN.AttentionFactor), 1), + cmp.Or(C.float(opts.YaRN.BetaFast), 32), + cmp.Or(C.float(opts.YaRN.BetaSlow), 1), + ) } + return &Tensor{b: t.b, t: tt} } func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 31478dd8..4d1ed207 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init( } float theta = theta_t; - if (sector >= sections[0] && sector < sec_w) { + if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) { theta = theta_h; } - else if (sector >= sec_w && sector < sec_w + sections[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) { theta = theta_w; } - else if (sector >= sec_w + sections[2]) { - theta = theta_e; - } rope_yarn( theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu index d058504c..287fe9d2 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu @@ -151,19 +151,13 @@ static __global__ void rope_multi( const int sec_w = sections.v[1] + sections.v[0]; const int 
sector = (i0 / 2) % sect_dims; - float theta_base = 0.0; - if (sector < sections.v[0]) { - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sections.v[0] && sector < sec_w) { + float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) { theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); } - else if (sector >= sec_w && sector < sec_w + sections.v[2]) { + else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) { theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); } - else if (sector >= sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); - } const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 9c0e0c56..f342872d 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -6523,15 +6523,11 @@ kernel void kernel_rope_multi( const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 const int sector = ic % sect_dims; - float theta_base; - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { + float theta_base = (float) pos[i2]; + if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { theta_base = (float) pos[i2 + args.ne02]; - } else if (sector < sec_w012) { + } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; } // end of mrope diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 375a0c7f..9866c96b 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi( const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 const int sector = ic % sect_dims; - float theta_base; - if (sector < args.sect_0) { - theta_base = (float) pos[i2]; - } else if (sector < sec_w01) { + float theta_base = (float) pos[i2]; + if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) { theta_base = (float) pos[i2 + args.ne02]; - } else if (sector < sec_w012) { + } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) { theta_base = (float) pos[i2 + args.ne02 * 2]; - } else { - theta_base = (float) pos[i2 + args.ne02 * 3]; } // end of mrope diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp index 111286b4..633dc20f 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -31,19 +31,13 @@ void main() { const int sec_w = p.sections[1] + p.sections[0]; const uint sector = (i0 / 2) % sect_dims; - float theta_base = 0.0; - if (sector < p.sections[0]) { - theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); - } - else if (sector >= p.sections[0] && sector < sec_w) { + float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) { theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); } - else if (sector >= sec_w && sector < sec_w + p.sections[2]) { + else if (sector % 
3 == 2 && sector < 2 + 3 * p.sections[2]) { theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); } - else if (sector >= sec_w + p.sections[2]) { - theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); - } const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f; diff --git a/ml/nn/rope/rope.go b/ml/nn/rope/rope.go index 57dd2252..bca8058d 100644 --- a/ml/nn/rope/rope.go +++ b/ml/nn/rope/rope.go @@ -4,21 +4,21 @@ import "github.com/ollama/ollama/ml" // Options contains optional parameters for RoPE function type Options struct { - Type int - Factors ml.Tensor - OriginalContextLength int + Type int + Factors ml.Tensor // YaRN options - ExtrapolationFactor, - AttentionFactor, - BetaFast, - BetaSlow float32 -} + YaRN struct { + OriginalContextLength int + ExtrapolationFactor, + AttentionFactor, + BetaFast, + BetaSlow float32 + } -// WithOriginalContextLength sets a custom context length -func WithOriginalContextLength(n int) func(*Options) { - return func(opts *Options) { - opts.OriginalContextLength = n + // MRoPE options + MRoPE struct { + Sections []int } } @@ -38,14 +38,28 @@ func WithFactors(factors ml.Tensor) func(*Options) { } } +// WithOriginalContextLength sets a custom context length +func WithOriginalContextLength(n int) func(*Options) { + return func(opts *Options) { + opts.YaRN.OriginalContextLength = n + } +} + func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) { return func(opts *Options) { - opts.ExtrapolationFactor = extrapolationFactor + opts.YaRN.ExtrapolationFactor = extrapolationFactor } } func WithAttentionFactor(attentionFactor float32) func(*Options) { return func(opts *Options) { - opts.AttentionFactor = attentionFactor + opts.YaRN.AttentionFactor = attentionFactor + } +} + +func WithMRoPESections(sections []int) func(*Options) { + return func(opts *Options) { + opts.Type |= 1 << 3 + opts.MRoPE.Sections = sections } } diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go index 08beb37c..579863ae 100644 --- a/model/models/qwen3vl/model.go +++ b/model/models/qwen3vl/model.go @@ -112,7 +112,8 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions))) + // ggml mrope requires 4 positions per token: [time, height, width, extra] + positionSlice := slices.Collect(makeSlice2D[int32](4, len(batch.Positions))) for i, id := range batch.Positions { if id < int32(len(m.positionCache)) { id = m.positionCache[id] @@ -123,6 +124,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { positionSlice[0][i] = id positionSlice[1][i] = id positionSlice[2][i] = id + // positionSlice[3] is intentionally left as zeros } hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx) @@ -147,8 +149,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } } - positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice)) - cos, sin := m.rotaryEmbedding(ctx, positions) + positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice)) for i, layer := range m.TextModel.Layers { if m.Cache != nil { m.Cache.SetLayer(i) @@ -159,7 +160,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { outputs = batch.Outputs } - hiddenStates = 
layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options) + hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options) if i < len(deepstackVisualEmbeds) { hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i]) } @@ -191,9 +192,10 @@ func New(c fs.Config) (model.Model, error) { ImageProcessor: newImageProcessor(c), } - m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) { + m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) { m.positionCache = nil - return nil, kvcache.ErrNotSupported + positions = positions.Repeat(ctx, 1, 4).Reshape(ctx, -1) + return m.Options.applyRotaryPositionalEmbedding(ctx, key, positions), nil }) return &m, nil } diff --git a/model/models/qwen3vl/model_text.go b/model/models/qwen3vl/model_text.go index 14e7d7dc..f5767f65 100644 --- a/model/models/qwen3vl/model_text.go +++ b/model/models/qwen3vl/model_text.go @@ -10,6 +10,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" ) @@ -27,14 +29,18 @@ type TextOptions struct { numExperts, numExpertsUsed int normTopKProb bool - - inverseFrequenciesCache []float32 } func (o TextOptions) headDim() int { return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) } +func (o TextOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor { + return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/float32(math.Sqrt(float64(o.ropeScale))), + rope.WithMRoPESections(o.mropeSections), + ) +} + type TextAttention struct { Query *nn.Linear `gguf:"attn_q"` QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` @@ -44,7 +50,7 @@ type TextAttention struct { Output *nn.Linear `gguf:"attn_output"` } -func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { +func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { batchSize := hiddenStates.Dim(1) query := sa.Query.Forward(ctx, hiddenStates) @@ -58,8 +64,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tenso query = sa.QueryNorm.Forward(ctx, query, opts.eps) key = sa.KeyNorm.Forward(ctx, key, opts.eps) - query = applyRotaryPositionalEmbedding(ctx, query, cos, sin) - key = applyRotaryPositionalEmbedding(ctx, key, cos, sin) + query = opts.applyRotaryPositionalEmbedding(ctx, query, positions) + key = opts.applyRotaryPositionalEmbedding(ctx, key, positions) attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache) attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize) @@ -125,10 +131,10 @@ type TextLayer struct { TextMLP } -func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { +func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { residual := hiddenStates hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) - hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts) + hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, positions, cache, opts) if outputs != nil { hiddenStates = 
hiddenStates.Rows(ctx, outputs) @@ -153,42 +159,6 @@ type TextModel struct { Options *TextOptions } -func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) { - positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1)) - if len(m.Options.inverseFrequenciesCache) == 0 { - m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2) - for i := range m.Options.inverseFrequenciesCache { - frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim()))) - m.Options.inverseFrequenciesCache[i] = 1 / frequency - } - } - - inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache)) - - positions = positions.Cast(ctx, ml.DTypeF32) - frequencies := inverseFrequencies.Mulmat(ctx, positions) - - interleaved := frequencies.View(ctx, - 0, frequencies.Dim(0), - frequencies.Stride(1), frequencies.Dim(1), - ) - - for _, i := range []int{1, 2} { - args := []int{ - i * frequencies.Stride(0), 1, - 3 * frequencies.Stride(0), m.Options.mropeSections[i], - frequencies.Stride(1), frequencies.Dim(1), - } - - ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...). - Copy(ctx, interleaved.View(ctx, args[0], args[1:]...))) - } - - interleaved = interleaved.Concat(ctx, interleaved, 0) - interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2)) - return interleaved.Cos(ctx), interleaved.Sin(ctx) -} - var _ model.Model = (*Model)(nil) func newTextModel(c fs.Config) *TextModel { From 76eb7d0fff04563ee89e253fc71a4cbf5d0f05f7 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Thu, 30 Oct 2025 13:19:21 -0700 Subject: [PATCH 07/12] testing: test more models with tool calling (#12867) --- integration/api_test.go | 96 --------------------------- integration/tools_test.go | 132 ++++++++++++++++++++++++++++++++++++++ integration/utils_test.go | 16 +++++ 3 files changed, 148 insertions(+), 96 deletions(-) create mode 100644 integration/tools_test.go diff --git a/integration/api_test.go b/integration/api_test.go index 48572085..5d7acd94 100644 --- a/integration/api_test.go +++ b/integration/api_test.go @@ -408,99 +408,3 @@ func TestAPIEmbeddings(t *testing.T) { t.Errorf("zero length embedding response") } } - -func TestAPIToolCalling(t *testing.T) { - initialTimeout := 60 * time.Second - streamTimeout := 30 * time.Second - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - defer cancel() - - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() - - modelName := "qwen3:0.6b" - if err := PullIfMissing(ctx, client, modelName); err != nil { - t.Fatalf("pull failed %s", err) - } - - tools := []api.Tool{ - { - Type: "function", - Function: api.ToolFunction{ - Name: "get_weather", - Description: "Get the current weather in a given location", - Parameters: api.ToolFunctionParameters{ - Type: "object", - Required: []string{"location"}, - Properties: map[string]api.ToolProperty{ - "location": { - Type: api.PropertyType{"string"}, - Description: "The city and state, e.g. 
San Francisco, CA", - }, - }, - }, - }, - }, - } - - req := api.ChatRequest{ - Model: modelName, - Messages: []api.Message{ - { - Role: "user", - Content: "Call get_weather with location set to San Francisco.", - }, - }, - Tools: tools, - Options: map[string]any{ - "temperature": 0, - }, - } - - stallTimer := time.NewTimer(initialTimeout) - var gotToolCall bool - var lastToolCall api.ToolCall - - fn := func(response api.ChatResponse) error { - if len(response.Message.ToolCalls) > 0 { - gotToolCall = true - lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] - } - if !stallTimer.Reset(streamTimeout) { - return fmt.Errorf("stall was detected while streaming response, aborting") - } - return nil - } - - stream := true - req.Stream = &stream - done := make(chan int) - var genErr error - go func() { - genErr = client.Chat(ctx, &req, fn) - done <- 0 - }() - - select { - case <-stallTimer.C: - t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) - case <-done: - if genErr != nil { - t.Fatalf("chat failed: %v", genErr) - } - - if !gotToolCall { - t.Fatalf("expected at least one tool call, got none") - } - - if lastToolCall.Function.Name != "get_weather" { - t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") - } - - if _, ok := lastToolCall.Function.Arguments["location"]; !ok { - t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) - } - case <-ctx.Done(): - t.Error("outer test context done while waiting for tool-calling chat") - } -} diff --git a/integration/tools_test.go b/integration/tools_test.go new file mode 100644 index 00000000..d6b8dfa5 --- /dev/null +++ b/integration/tools_test.go @@ -0,0 +1,132 @@ +//go:build integration + +package integration + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestAPIToolCalling(t *testing.T) { + initialTimeout := 60 * time.Second + streamTimeout := 60 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + minVRAM := map[string]uint64{ + "qwen3-vl": 16, + "gpt-oss:20b": 16, + "gpt-oss:120b": 70, + "qwen3": 6, + "llama3.1": 8, + "llama3.2": 4, + "mistral": 6, + "qwen2.5": 6, + "qwen2": 6, + "mistral-nemo": 9, + "mistral-small": 16, + "mixtral:8x22b": 80, + "qwq": 20, + "granite3.3": 7, + } + + for _, model := range libraryToolsModels { + t.Run(model, func(t *testing.T) { + if v, ok := minVRAM[model]; ok { + skipUnderMinVRAM(t, v) + } + + if err := PullIfMissing(ctx, client, model); err != nil { + t.Fatalf("pull failed %s", err) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather in a given location", + Parameters: api.ToolFunctionParameters{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]api.ToolProperty{ + "location": { + Type: api.PropertyType{"string"}, + Description: "The city and state, e.g. 
San Francisco, CA", + }, + }, + }, + }, + }, + } + + req := api.ChatRequest{ + Model: model, + Messages: []api.Message{ + { + Role: "user", + Content: "Call get_weather with location set to San Francisco.", + }, + }, + Tools: tools, + Options: map[string]any{ + "temperature": 0, + }, + } + + stallTimer := time.NewTimer(initialTimeout) + var gotToolCall bool + var lastToolCall api.ToolCall + + fn := func(response api.ChatResponse) error { + if len(response.Message.ToolCalls) > 0 { + gotToolCall = true + lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] + } + if !stallTimer.Reset(streamTimeout) { + return fmt.Errorf("stall was detected while streaming response, aborting") + } + return nil + } + + stream := true + req.Stream = &stream + done := make(chan int) + var genErr error + go func() { + genErr = client.Chat(ctx, &req, fn) + done <- 0 + }() + + select { + case <-stallTimer.C: + t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) + case <-done: + if genErr != nil { + t.Fatalf("chat failed: %v", genErr) + } + + if !gotToolCall { + t.Fatalf("expected at least one tool call, got none") + } + + if lastToolCall.Function.Name != "get_weather" { + t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") + } + + if _, ok := lastToolCall.Function.Arguments["location"]; !ok { + t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for tool-calling chat") + } + }) + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index c0bac5e1..8a362408 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -260,6 +260,22 @@ var ( "snowflake-arctic-embed", "snowflake-arctic-embed2", } + libraryToolsModels = []string{ + "qwen3-vl", + "gpt-oss:20b", + "gpt-oss:120b", + "qwen3", + "llama3.1", + "llama3.2", + "mistral", + "qwen2.5", + "qwen2", + "mistral-nemo", + "mistral-small", + "mixtral:8x22b", + "qwq", + "granite3.3", + } blueSkyPrompt = "why is the sky blue? Be brief but factual in your reply" blueSkyExpected = []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength", "interact"} From 88236bc05fc1dbc251abfaf47105ab8d77c4e7e0 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 30 Oct 2025 13:22:00 -0700 Subject: [PATCH 08/12] win: use copy for subprocess logs (#12864) windows gets confused when we try to hand the stderr file descriptor to the subprocess children. This ensures the log output always shows up. --- llm/server.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/llm/server.go b/llm/server.go index f8b232df..4f7c3760 100644 --- a/llm/server.go +++ b/llm/server.go @@ -339,8 +339,23 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W cmd = exec.Command(exe, params...) 
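	// On Windows, handing the parent's own stderr handle to the child can drop
	// runner log output, so the change below copies stdout/stderr through pipes
	// instead (see the commit message above).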
cmd.Env = os.Environ() - cmd.Stdout = out - cmd.Stderr = out + + if out != nil { + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, 0, fmt.Errorf("failed to spawn server stdout pipe: %w", err) + } + stderr, err := cmd.StderrPipe() + if err != nil { + return nil, 0, fmt.Errorf("failed to spawn server stderr pipe: %w", err) + } + go func() { + io.Copy(out, stdout) //nolint:errcheck + }() + go func() { + io.Copy(out, stderr) //nolint:errcheck + }() + } cmd.SysProcAttr = LlamaServerSysProcAttr // Always filter down the set of GPUs in case there are any unsupported devices that might crash From 26465fb85fea8d820506f4a7843193c1f259a979 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 27 Oct 2025 16:31:58 -0700 Subject: [PATCH 09/12] ollamarunner: Worst case batch for token generation We currently allocate the worst case batch for max sized batches, which corresponds to prompt processing. However, there are some cases where the generated graph is different for small and large batches. To ensure that we don't need to allocate memory later after layout has taken place, we should run the worst case batch both ways and take the larger amount of memory. This does not noticeably affect loading speed as the most expensive part of this logic is from image processing and that does not occur during token generation. --- runner/ollamarunner/runner.go | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 153a3e57..962931fe 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -1009,12 +1009,17 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { } } -func (s *Server) reserveWorstCaseGraph() error { +func (s *Server) reserveWorstCaseGraph(prompt bool) error { ctx := s.model.Backend().NewContext() defer ctx.Close() var err error - inputs := make([]*input.Input, s.batchSize) + batchSize := 1 + if prompt { + batchSize = s.batchSize + } + + inputs := make([]*input.Input, batchSize) for i := range inputs { inputs[i] = &input.Input{} } @@ -1031,7 +1036,7 @@ func (s *Server) reserveWorstCaseGraph() error { // - The result may now be larger than a batch (images may not fit in a // single batch), so trim based on what will fit and must be grouped together. // - Fill out the rest of the space with text tokens. 
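	// Note: the multimodal worst-case expansion below only runs for the
	// prompt-sized pass; the single-token pass that allocModel now also
	// reserves (for token generation) skips it.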
- if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok { + if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); prompt && ok { mmCtx := s.model.Backend().NewContext() defer mmCtx.Close() @@ -1058,10 +1063,10 @@ func (s *Server) reserveWorstCaseGraph() error { } } - if len(inputs) < s.batchSize { - newInputs := make([]*input.Input, s.batchSize) + if len(inputs) < batchSize { + newInputs := make([]*input.Input, batchSize) copy(newInputs, inputs) - for i := len(inputs); i < s.batchSize; i++ { + for i := len(inputs); i < batchSize; i++ { newInputs[i] = &input.Input{} } inputs = newInputs @@ -1160,7 +1165,12 @@ func (s *Server) allocModel( s.seqs = make([]*Sequence, s.parallel) s.seqsSem = semaphore.NewWeighted(int64(s.parallel)) - return s.reserveWorstCaseGraph() + err = s.reserveWorstCaseGraph(true) + if err != nil { + return nil + } + + return s.reserveWorstCaseGraph(false) } // closeModel frees all memory associated with a model From afaf7ce8c34bab8de45ca00dbd12da8cd3cc033a Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 27 Oct 2025 16:32:05 -0700 Subject: [PATCH 10/12] ggml: Enable op_offload to improve partial offload performance When a model is partially offloaded to system RAM, we can either do the calculations on the CPU or we can temporarily transfer the data to the GPU to do the calculations there. Small batches tend to be better on the CPU, large batches on the GPU. The llamarunner used the GPU in most cases and the ollamarunner used the CPU. Although the ollamarunner saw an improvement in token generation performance, there was a large performance hit in prompt processing (3-10x). There is an existing heuristic to dynamically switch between these two modes but in practice it doesn't have enough information to accurately make that decision. This adds authoritative data to make the check work to get the best of both worlds. Fixes #12037 --- .../0019-Enable-CUDA-Graphs-for-gemma3n.patch | 58 ---- .../0019-ggml-Add-batch-size-hint.patch | 300 ++++++++++++++++++ llama/patches/0022-ggml-No-alloc-mode.patch | 39 +-- ml/backend.go | 5 + ml/backend/ggml/ggml.go | 18 +- ml/backend/ggml/ggml/include/ggml-backend.h | 5 +- ml/backend/ggml/ggml/src/ggml-backend-impl.h | 4 +- ml/backend/ggml/ggml/src/ggml-backend.cpp | 19 +- .../ggml/ggml/src/ggml-blas/ggml-blas.cpp | 3 +- .../ggml/ggml/src/ggml-cpu/ggml-cpu.cpp | 4 +- .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 66 ++-- .../ggml/ggml/src/ggml-metal/ggml-metal.cpp | 4 +- .../ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +- runner/ollamarunner/multimodal.go | 3 + runner/ollamarunner/runner.go | 2 + 15 files changed, 405 insertions(+), 128 deletions(-) delete mode 100644 llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch create mode 100644 llama/patches/0019-ggml-Add-batch-size-hint.patch diff --git a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch deleted file mode 100644 index 85cba5b3..00000000 --- a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Oliver Simons -Date: Tue, 22 Jul 2025 11:02:28 +0200 -Subject: [PATCH] Enable CUDA Graphs for gemma3n. - -Similar to -https://github.com/ggml-org/llama.cpp/pull/14741, -though ollama has a slightly different model graph -than llama.cpp which requires different workaround -checks. 
---- - ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 5b852f69..827e3205 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2689,14 +2689,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - -+ // This fix was added in llama.cpp and Ollama in parallel, but with -+ // different tensor names. -+ // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 -+ // ollama: https://github.com/ollama/ollama/pull/11525 -+ -+ const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; -+ const std::string gemma3n_node_name_ollama = "node_"; -+ - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; - const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; -+ -+ const std::string ffn_moe_bias_suffix = "_exps.bias"; -+ - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; - const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; - const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; - const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; - const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - -+ - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - -@@ -2720,6 +2732,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && -+ // ollama -+ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n -+ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here -+ !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && -+ node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && -+ // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && diff --git a/llama/patches/0019-ggml-Add-batch-size-hint.patch b/llama/patches/0019-ggml-Add-batch-size-hint.patch new file mode 100644 index 00000000..76d61e2d --- /dev/null +++ b/llama/patches/0019-ggml-Add-batch-size-hint.patch @@ -0,0 +1,300 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Tue, 28 Oct 2025 17:36:54 -0700 +Subject: [PATCH] ggml: Add batch size hint + +Some operations use heuristics to determine the batch size, which +affects offloading decisions. However, these are not always +accurate when looking at single operations. This provides an +explicit signal on the batch size from higher layers to ensure +consistent performance. 
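As an illustrative aside, the intended call pattern for the new hint looks roughly like the sketch below; `sched`, `graph`, and `n_tokens` are placeholder names, and `ggml_backend_sched_graph_compute` is the existing scheduler entry point. With the hint set, the scheduler only considers offloading ops whose weights live in host memory when the batch is large (at least 32 in this patch), and it falls back to the old per-op heuristics when the hint is -1.

```c
// Minimal sketch: tell the scheduler how many tokens this graph processes
// before computing it, so offload and CUDA-graph decisions use real data.
ggml_backend_sched_set_batch_size(sched, n_tokens);  // -1 keeps the old heuristics
ggml_backend_sched_graph_compute(sched, graph);
```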
+--- + ggml/include/ggml-backend.h | 5 ++- + ggml/src/ggml-backend-impl.h | 4 +-- + ggml/src/ggml-backend.cpp | 19 +++++++---- + ggml/src/ggml-blas/ggml-blas.cpp | 3 +- + ggml/src/ggml-cpu/ggml-cpu.cpp | 4 ++- + ggml/src/ggml-cuda/ggml-cuda.cu | 48 +++++++++++++++++----------- + ggml/src/ggml-metal/ggml-metal.cpp | 4 ++- + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +- + 8 files changed, 58 insertions(+), 32 deletions(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 229bf387b..2763f2bd6 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -98,7 +98,7 @@ extern "C" { + + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); +- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // NOTE: will be removed, use device version instead + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); +@@ -307,6 +307,9 @@ extern "C" { + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + ++ // Provide a hint on the batch size to optimize processing (uses heuristics if unset) ++ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); ++ + // Initialize backend buffers from a measure graph + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success + +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index 6792ba986..0f5b03cef 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -106,8 +106,8 @@ extern "C" { + // compute the graph with the plan + enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + +- // compute graph (always async if supported by the backend) +- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ // compute graph (always async if supported by the backend). 
batch_size may be -1 if unknown ++ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // (optional) event synchronization + // record an event on this stream +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index cb2b99562..41eef3b5f 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba + } + + enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ++ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); + ggml_backend_synchronize(backend); + return err; + } + +-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + GGML_ASSERT(backend); +- return backend->iface.graph_compute(backend, cgraph); ++ return backend->iface.graph_compute(backend, cgraph, batch_size); + } + + bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +@@ -722,6 +722,8 @@ struct ggml_backend_sched { + + bool op_offload; + ++ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics ++ + int debug; + }; + +@@ -814,7 +816,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st + if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); + // check if a backend with higher prio wants to offload the op +- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { ++ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); +@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + } + + if (!sched->callback_eval) { +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); + +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new( + + sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->op_offload = op_offload; ++ sched->batch_size = -1; + + ggml_backend_sched_reset(sched); + +@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { + free(sched); + } + ++void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { ++ 
sched->batch_size = batch_size; ++} ++ + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + // reset state for the next run +diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp +index 5b888cdd8..88d088952 100644 +--- a/ggml/src/ggml-blas/ggml-blas.cpp ++++ b/ggml/src/ggml-blas/ggml-blas.cpp +@@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { + delete backend; + } + +-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { +@@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); ++ GGML_UNUSED(batch_size); + } + + static struct ggml_backend_i blas_backend_i = { +diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp +index 3191faaa4..32f14c811 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu.cpp ++++ b/ggml/src/ggml-cpu/ggml-cpu.cpp +@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe + GGML_UNUSED(backend); + } + +-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); +@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + + return ggml_graph_compute(cgraph, &cplan); ++ ++ GGML_UNUSED(batch_size); + } + + static const struct ggml_backend_i ggml_backend_cpu_i = { +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 5b852f690..c555cd30f 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2684,7 +2684,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + + #ifdef USE_CUDA_GRAPH + static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +- bool use_cuda_graph) { ++ int batch_size, bool use_cuda_graph) { + + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); +@@ -2718,24 +2718,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && +- node->src[1] && node->src[1]->ne[1] > 1 && +- (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && +- (node->src[1] ? 
node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && +- strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && +- strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && +- strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { +- // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation +- // by means of matching node names. See +- // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and +- // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, +- // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. +- use_cuda_graph = false; ++ // If we have an explicit batch size hint then we don't need to use the tensor name heuristics ++ if (batch_size >= 0) { ++ if (batch_size > 1) { ++ use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); + #endif ++ } ++ } else { ++ if (node->op == GGML_OP_ADD && ++ node->src[1] && node->src[1]->ne[1] > 1 && ++ (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && ++ (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && ++ strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && ++ strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && ++ strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { ++ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation ++ // by means of matching node names. See ++ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and ++ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
++ use_cuda_graph = false; ++#ifndef NDEBUG ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++#endif ++ } + } + + if (node->op == GGML_OP_CPY) { +@@ -3132,7 +3142,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx + } + } + +-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); +@@ -3170,7 +3180,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, + if (use_cuda_graph) { + cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); + +- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); ++ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); + + // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. + if (use_cuda_graph && cuda_graph_update_required) { +diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp +index f2ff9f322..05ff6a5a6 100644 +--- a/ggml/src/ggml-metal/ggml-metal.cpp ++++ b/ggml/src/ggml-metal/ggml-metal.cpp +@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml + GGML_UNUSED(dst); + } + +-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_metal_t ctx = (ggml_metal_t)backend->context; + + return ggml_metal_graph_compute(ctx, cgraph); ++ ++ GGML_UNUSED(batch_size); + } + + static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index ed83236f4..bd3ece516 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -12015,7 +12015,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru + return num_adds; + } + +-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + +@@ -12211,6 +12211,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg + return GGML_STATUS_SUCCESS; + + UNUSED(backend); ++ UNUSED(batch_size); + } + + // Sort the graph for improved parallelism. diff --git a/llama/patches/0022-ggml-No-alloc-mode.patch b/llama/patches/0022-ggml-No-alloc-mode.patch index 019cb886..d03c6c84 100644 --- a/llama/patches/0022-ggml-No-alloc-mode.patch +++ b/llama/patches/0022-ggml-No-alloc-mode.patch @@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data. 
5 files changed, 310 insertions(+), 44 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 229bf387..1ff53ed0 100644 +index 2763f2bd6..b3b5b356a 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -305,6 +305,7 @@ extern "C" { @@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644 + GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - // Initialize backend buffers from a measure graph + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h -index 6792ba98..3c3f22fc 100644 +index 0f5b03cef..7bdf9d81f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -26,12 +26,17 @@ extern "C" { @@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644 struct ggml_backend { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index cb2b9956..6ef5eeaf 100644 +index 41eef3b5f..c81a2e48a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644 void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); -@@ -723,6 +743,12 @@ struct ggml_backend_sched { - bool op_offload; +@@ -725,6 +745,12 @@ struct ggml_backend_sched { + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics int debug; + @@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644 }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) -@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new( size_t graph_size, bool parallel, bool op_offload) { @@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644 GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); -@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } @@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644 sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; + sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); -@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { +@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); } @@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644 } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); -@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * +@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * return false; } @@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644 ggml_backend_sched_reset(sched); return true; -@@ -1813,7 +1875,13 @@ size_t 
ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, +@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); @@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh -index e0abde54..28d6bcd7 100644 +index e0abde542..28d6bcd71 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -35,6 +35,31 @@ @@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644 + } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 827e3205..811462c7 100644 +index f4d4a4267..ac70dcac8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -540,7 +541,7 @@ index 827e3205..811462c7 100644 }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { -@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, +@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { @@ -548,7 +549,7 @@ index 827e3205..811462c7 100644 // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; -@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } @@ -560,15 +561,15 @@ index 827e3205..811462c7 100644 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); if (!disable_fusion) { -@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx - static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + cuda_ctx->pool_set_alloc(true); ggml_cuda_set_device(cuda_ctx->device); -@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, +@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; } @@ -640,7 +641,7 @@ index 827e3205..811462c7 100644 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { +@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, /* .graph_optimize = */ NULL, diff --git 
a/ml/backend.go b/ml/backend.go index bf390c01..b07039e2 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -106,6 +106,11 @@ type Context interface { Arange(start, stop, step float32, dtype DType) Tensor Forward(...Tensor) Context + + // SetBatchSize provides a hint on the batch size to optimize processing + // Uses heuristics if not set + SetBatchSize(int) + Compute(...Tensor) ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 8c782d73..eb02c3b1 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -386,7 +386,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { C.int(len(schedBackends)), C.size_t(maxGraphNodes), C._Bool(false), - C._Bool(false), + C._Bool(true), C._Bool(params.AllocMemory), ) @@ -749,6 +749,9 @@ type Context struct { ctx *C.struct_ggml_context graph *C.struct_ggml_cgraph + // batchSize is a hint to optimize processing + batchSize int + // buft is the buffer type used for new tensors buft C.ggml_backend_buffer_type_t @@ -805,6 +808,10 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context { return c } +func (c *Context) SetBatchSize(batchSize int) { + c.batchSize = batchSize +} + func (c *Context) Compute(tensors ...ml.Tensor) { c.ComputeWithNotify(nil, tensors...) } @@ -815,6 +822,11 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { if cb != nil { go cb() } + + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS { panic(fmt.Errorf("error computing ggml graph: %v", status)) } @@ -836,6 +848,10 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { } func (c *Context) Reserve() { + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph) slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched)) diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 80983524..1cab4bb3 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -98,7 +98,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // NOTE: will be removed, use device version instead GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -317,6 +317,9 @@ extern "C" { GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) + GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); + // Initialize backend buffers 
from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index 43c91d9f..21b35ac5 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -112,8 +112,8 @@ extern "C" { // compute the graph with the plan enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph (always async if supported by the backend) - enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // compute graph (always async if supported by the backend). batch_size may be -1 if unknown + enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // (optional) event synchronization // record an event on this stream diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 0b757af5..9b0a9b91 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -368,14 +368,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba } enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); ggml_backend_synchronize(backend); return err; } -enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { GGML_ASSERT(backend); - return backend->iface.graph_compute(backend, cgraph); + return backend->iface.graph_compute(backend, cgraph, batch_size); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -750,6 +750,8 @@ struct ggml_backend_sched { bool op_offload; + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics + int debug; // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation @@ -848,7 +850,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1584,7 +1586,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1606,7 +1608,7 @@ 
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1698,6 +1700,7 @@ ggml_backend_sched_t ggml_backend_sched_new_ext( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); @@ -1734,6 +1737,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched); } +void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { + sched->batch_size = batch_size; +} + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { GGML_ASSERT(sched); // reset state for the next run diff --git a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp index 2a9ff7f6..6a38a51a 100644 --- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp @@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { delete backend; } -static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); + GGML_UNUSED(batch_size); } static struct ggml_backend_i blas_backend_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 3191faaa..32f14c81 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } -static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); @@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; return ggml_graph_compute(cgraph, &cplan); + + GGML_UNUSED(batch_size); } static const struct ggml_backend_i ggml_backend_cpu_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index d62f412d..e9b73147 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2775,31 +2775,19 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { #ifdef USE_CUDA_GRAPH static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - bool use_cuda_graph) { + int batch_size, bool use_cuda_graph) 
{ // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - // This fix was added in llama.cpp and Ollama in parallel, but with - // different tensor names. - // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 - // ollama: https://github.com/ollama/ollama/pull/11525 - - const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; - const std::string gemma3n_node_name_ollama = "node_"; - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; - - const std::string ffn_moe_bias_suffix = "_exps.bias"; - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2821,30 +2809,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && - // ollama - // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n - // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here - !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && - node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && - // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && - strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && - strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { - // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation - // by means of matching node names. See - // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and - // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, - // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
- use_cuda_graph = false; + // If we have an explicit batch size hint then we don't need to use the tensor name heuristics + if (batch_size >= 0) { + if (batch_size > 1) { + use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); #endif + } + } else { + if (node->op == GGML_OP_ADD && + node->src[1] && node->src[1]->ne[1] > 1 && + (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && + (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && + strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && + strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && + strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { + // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation + // by means of matching node names. See + // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and + // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); +#endif + } } if (node->op == GGML_OP_CPY) { @@ -3247,7 +3239,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; cuda_ctx->pool_set_alloc(true); @@ -3286,7 +3278,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (use_cuda_graph) { cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
if (use_cuda_graph && cuda_graph_update_required) { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp index f356e4a0..032dee76 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp @@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml GGML_UNUSED(dst); } -static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_metal_t ctx = (ggml_metal_t)backend->context; return ggml_metal_graph_compute(ctx, cgraph); + + GGML_UNUSED(batch_size); } static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0bbcecd0..cc68e796 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12039,7 +12039,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru return num_adds; } -static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -12235,6 +12235,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg return GGML_STATUS_SUCCESS; UNUSED(backend); + UNUSED(batch_size); } // Sort the graph for improved parallelism. diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index 78ceb771..6af89021 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -86,6 +86,9 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten computeCtx.Forward(tensors...) entry.data = make([][]float32, len(entry.mm)) + // Multimodal processing is computationally intensive, so treat it similarly to a large batch + computeCtx.SetBatchSize(512) + if !reserve { computeCtx.Compute(tensors...) 
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 962931fe..3e8c1e22 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -598,6 +598,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er // Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs)) batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs)) + nextBatch.ctx.SetBatchSize(len(batchInputs)) nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch) if err != nil { err = fmt.Errorf("failed to build graph: %w", err) @@ -1108,6 +1109,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error { return err } + ctx.SetBatchSize(batchSize) ctx.Forward(t).Reserve() return nil From db973c8fc2579e97fa4b8adea5cb88835138b3ee Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 30 Oct 2025 15:12:14 -0700 Subject: [PATCH 11/12] win: avoid ID mixups on refresh (#12869) On Windows AMD IDs are numeric, and can reorder based on the filter environment. By passing in the filter env on a full discovery refresh, we'll only look at the actual devices and ignore unsupported iGPUs. Without this, on some systems iGPU VRAM was incorrectly being used to populate the dGPU. --- discover/runner.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/discover/runner.go b/discover/runner.go index e74050d0..caaef222 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -335,11 +335,14 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. ctx, cancel := context.WithTimeout(ctx, 3*time.Second) defer cancel() + // Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct + devFilter := ml.GetVisibleDevicesEnv(devices) + for dir := range libDirs { - updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil) + updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, devFilter) for _, u := range updatedDevices { for i := range devices { - if u.DeviceID == devices[i].DeviceID { + if u.DeviceID == devices[i].DeviceID && u.PCIID == devices[i].PCIID { updated[i] = true devices[i].FreeMemory = u.FreeMemory break From 7dd4862a8997ce6f01018a76ad41bd4d8dbe217b Mon Sep 17 00:00:00 2001 From: nicole pardal <109545900+npardal@users.noreply.github.com> Date: Thu, 30 Oct 2025 17:12:33 -0700 Subject: [PATCH 12/12] embeddings: removed redundant TestAPIEmbeddings test (#12863) This PR removes a redundant test from TestAPIEmbeddings Contents of this test already exists in embed_test.go and model_arch_test.go --- integration/api_test.go | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/integration/api_test.go b/integration/api_test.go index 5d7acd94..39eea39c 100644 --- a/integration/api_test.go +++ b/integration/api_test.go @@ -381,30 +381,3 @@ func TestAPIShowModel(t *testing.T) { t.Errorf("%s missing modified_at: %#v", modelName, resp) } } - -func TestAPIEmbeddings(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() - req := api.EmbeddingRequest{ - Model: libraryEmbedModels[0], - Prompt: "why is the sky blue?", - Options: map[string]interface{}{ - "temperature": 0, - "seed": 123, - }, - } - - if err := PullIfMissing(ctx, client, req.Model); err != nil { - t.Fatalf("pull failed 
%s", err) - } - - resp, err := client.Embeddings(ctx, &req) - if err != nil { - t.Fatalf("embeddings call failed %s", err) - } - if len(resp.Embedding) == 0 { - t.Errorf("zero length embedding response") - } -}
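
The batch-size hint introduced in the changes above flows from the runner (ctx.SetBatchSize(len(batchInputs)) before building the graph) through Context.ComputeWithNotify and Context.Reserve into ggml_backend_sched_set_batch_size, which lets the scheduler and the CUDA backend decide on op offload and CUDA graph use without the tensor-name heuristics. The sketch below is not part of the patch series; it only illustrates how a caller of the ml.Context interface might pass the hint, and the Arange graph is a placeholder for a real forward pass (the package name and import path are assumptions for the sketch).

package example

import "github.com/ollama/ollama/ml"

// computeWithBatchHint is an illustrative caller of the new SetBatchSize hint.
// Only the SetBatchSize call is the point here; the Arange graph stands in for
// a real model forward pass.
func computeWithBatchHint(ctx ml.Context, batchSize int) ml.Tensor {
	// Give the scheduler the actual batch size so it can skip the name-based
	// heuristics (ggml_backend_sched_set_batch_size is applied before compute).
	ctx.SetBatchSize(batchSize)

	// Placeholder graph sized by the batch; shapes and values are illustrative.
	t := ctx.Arange(0, float32(batchSize), 1, ml.DTypeF32)
	ctx.Forward(t).Compute(t)
	return t
}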