diff --git a/model/renderers/qwen3vl.go b/model/renderers/qwen3vl.go index 7e49fea1..8ea4abbb 100644 --- a/model/renderers/qwen3vl.go +++ b/model/renderers/qwen3vl.go @@ -48,13 +48,22 @@ func marshalWithSpaces(v any) ([]byte, error) { type Qwen3VLRenderer struct { isThinking bool + + useImgTags bool } -func (r *Qwen3VLRenderer) renderContent(content api.Message, doVisionCount bool) string { +func (r *Qwen3VLRenderer) renderContent(content api.Message) string { // This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go var subSb strings.Builder for range content.Images { - subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>") + // TODO: (jmorganca): how to render this is different for different + // model backends, and so we should eventually parameterize this or + // only output a placeholder such as [img] + if r.useImgTags { + subSb.WriteString("[img]") + } else { + subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>") + } } // TODO: support videos @@ -88,7 +97,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap message := messages[i] if multiStepTool && message.Role == "user" { // Check if content starts with <tool_response> and ends with </tool_response> - content := r.renderContent(message, true) + content := r.renderContent(message) if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) { multiStepTool = false lastQueryIndex = i @@ -97,7 +106,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap } for i, message := range messages { - content := r.renderContent(message, true) + content := r.renderContent(message) lastMessage := i == len(messages)-1 prefill := lastMessage && message.Role == "assistant" diff --git a/model/renderers/qwen3vl_nonthinking_test.go b/model/renderers/qwen3vl_nonthinking_test.go index 3f50a965..d3377e39 100644 --- a/model/renderers/qwen3vl_nonthinking_test.go +++ b/model/renderers/qwen3vl_nonthinking_test.go @@ -9,11 +9,12 
@@ import ( func TestQwen3VLNonThinkingRenderer(t *testing.T) { tests := []struct { - name string - msgs []api.Message - images []api.ImageData - tools []api.Tool - expected string + name string + msgs []api.Message + images []api.ImageData + tools []api.Tool + useImgTags bool + expected string }{ { name: "prefill", @@ -90,6 +91,18 @@ I'll check the weather in San Francisco for you.Speak poetry after the fi expected: `<|im_start|>user <|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|> <|im_start|>assistant +Let me analyze this image.`, + }, + { + name: "Image with image tags", + msgs: []api.Message{ + {Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData("img2")}}, + {Role: "assistant", Content: "Let me analyze this image."}, + }, + useImgTags: true, + expected: `<|im_start|>user +[img]Describe this image.<|im_end|> +<|im_start|>assistant Let me analyze this image.`, }, { @@ -102,7 +115,18 @@ Let me analyze this image.`, <|im_start|>assistant `, }, - + { + name: "Multiple images with image tags", + msgs: []api.Message{ + {Role: "user", Content: "Describe these images.", Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")}}, + {Role: "assistant", Content: "Let me analyze this image."}, + }, + useImgTags: true, + expected: `<|im_start|>user +[img][img]Describe these images.<|im_end|> +<|im_start|>assistant +Let me analyze this image.`, + }, // // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args // { // name: "with tools and response", @@ -485,7 +509,7 @@ I'll check. 
} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - rendered, err := (&Qwen3VLRenderer{false}).Render(tt.msgs, tt.tools, nil) + rendered, err := (&Qwen3VLRenderer{isThinking: false, useImgTags: tt.useImgTags}).Render(tt.msgs, tt.tools, nil) if err != nil { t.Fatal(err) } diff --git a/model/renderers/qwen3vl_thinking_test.go b/model/renderers/qwen3vl_thinking_test.go index 2bb2c2d0..eb53e6a9 100644 --- a/model/renderers/qwen3vl_thinking_test.go +++ b/model/renderers/qwen3vl_thinking_test.go @@ -323,7 +323,7 @@ Speak poetry after the first sentence.Speak poetry after the seco } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - rendered, err := (&Qwen3VLRenderer{true}).Render(tt.msgs, tt.tools, nil) + rendered, err := (&Qwen3VLRenderer{isThinking: true}).Render(tt.msgs, tt.tools, nil) if err != nil { t.Fatal(err) } diff --git a/model/renderers/renderer.go b/model/renderers/renderer.go index 96a90825..d995579c 100644 --- a/model/renderers/renderer.go +++ b/model/renderers/renderer.go @@ -17,6 +17,11 @@ type ( } ) +// RenderImgTags is a global flag that tells renderers to use [img] tags +// for images. 
This is set by the Ollama server package on init, or left as +// false for other environments where renderers are used +var RenderImgTags bool + func (r *RendererRegistry) Register(name string, renderer RendererConstructor) { r.renderers[name] = renderer } @@ -46,10 +51,10 @@ func rendererForName(name string) Renderer { renderer := &Qwen3CoderRenderer{} return renderer case "qwen3-vl-instruct": - renderer := &Qwen3VLRenderer{false} + renderer := &Qwen3VLRenderer{isThinking: false, useImgTags: RenderImgTags} return renderer case "qwen3-vl-thinking": - renderer := &Qwen3VLRenderer{true} + renderer := &Qwen3VLRenderer{isThinking: true, useImgTags: RenderImgTags} return renderer default: return nil diff --git a/server/routes.go b/server/routes.go index e65b0ed7..80c00cb6 100644 --- a/server/routes.go +++ b/server/routes.go @@ -39,6 +39,7 @@ import ( "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/middleware" "github.com/ollama/ollama/model/parsers" + "github.com/ollama/ollama/model/renderers" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/template" @@ -91,6 +92,9 @@ func init() { } gin.SetMode(mode) + + // Tell renderers to use [img] tags + renderers.RenderImgTags = true } var (