diff --git a/model/renderers/qwen3vl.go b/model/renderers/qwen3vl.go
index 7e49fea1..8ea4abbb 100644
--- a/model/renderers/qwen3vl.go
+++ b/model/renderers/qwen3vl.go
@@ -48,13 +48,22 @@ func marshalWithSpaces(v any) ([]byte, error) {
type Qwen3VLRenderer struct {
isThinking bool
+
+ useImgTags bool
}
-func (r *Qwen3VLRenderer) renderContent(content api.Message, doVisionCount bool) string {
+func (r *Qwen3VLRenderer) renderContent(content api.Message) string {
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
var subSb strings.Builder
for range content.Images {
- subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
+ // TODO(jmorganca): image rendering differs between model
+ // backends, so we should eventually parameterize this or
+ // only output a placeholder such as [img]
+ if r.useImgTags {
+ subSb.WriteString("[img]")
+ } else {
+ subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
+ }
}
// TODO: support videos
@@ -88,7 +97,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
message := messages[i]
if multiStepTool && message.Role == "user" {
// Check if content starts with <tool_response> and ends with </tool_response>
- content := r.renderContent(message, true)
+ content := r.renderContent(message)
if !(strings.HasPrefix(content, "") && strings.HasSuffix(content, "")) {
multiStepTool = false
lastQueryIndex = i
@@ -97,7 +106,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
}
for i, message := range messages {
- content := r.renderContent(message, true)
+ content := r.renderContent(message)
lastMessage := i == len(messages)-1
prefill := lastMessage && message.Role == "assistant"
diff --git a/model/renderers/qwen3vl_nonthinking_test.go b/model/renderers/qwen3vl_nonthinking_test.go
index 3f50a965..d3377e39 100644
--- a/model/renderers/qwen3vl_nonthinking_test.go
+++ b/model/renderers/qwen3vl_nonthinking_test.go
@@ -9,11 +9,12 @@ import (
func TestQwen3VLNonThinkingRenderer(t *testing.T) {
tests := []struct {
- name string
- msgs []api.Message
- images []api.ImageData
- tools []api.Tool
- expected string
+ name string
+ msgs []api.Message
+ images []api.ImageData
+ tools []api.Tool
+ useImgTags bool
+ expected string
}{
{
name: "prefill",
@@ -90,6 +91,18 @@ I'll check the weather in San Francisco for you.Speak poetry after the fi
expected: `<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
<|im_start|>assistant
+Let me analyze this image.`,
+ },
+ {
+ name: "Image with image tags",
+ msgs: []api.Message{
+ {Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData("img2")}},
+ {Role: "assistant", Content: "Let me analyze this image."},
+ },
+ useImgTags: true,
+ expected: `<|im_start|>user
+[img]Describe this image.<|im_end|>
+<|im_start|>assistant
Let me analyze this image.`,
},
{
@@ -102,7 +115,18 @@ Let me analyze this image.`,
<|im_start|>assistant
`,
},
-
+ {
+ name: "Multiple images with image tags",
+ msgs: []api.Message{
+ {Role: "user", Content: "Describe these images.", Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")}},
+ {Role: "assistant", Content: "Let me analyze this image."},
+ },
+ useImgTags: true,
+ expected: `<|im_start|>user
+[img][img]Describe these images.<|im_end|>
+<|im_start|>assistant
+Let me analyze this image.`,
+ },
// // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args
// {
// name: "with tools and response",
@@ -485,7 +509,7 @@ I'll check.
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- rendered, err := (&Qwen3VLRenderer{false}).Render(tt.msgs, tt.tools, nil)
+ rendered, err := (&Qwen3VLRenderer{isThinking: false, useImgTags: tt.useImgTags}).Render(tt.msgs, tt.tools, nil)
if err != nil {
t.Fatal(err)
}
diff --git a/model/renderers/qwen3vl_thinking_test.go b/model/renderers/qwen3vl_thinking_test.go
index 2bb2c2d0..eb53e6a9 100644
--- a/model/renderers/qwen3vl_thinking_test.go
+++ b/model/renderers/qwen3vl_thinking_test.go
@@ -323,7 +323,7 @@ Speak poetry after the first sentence.Speak poetry after the seco
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- rendered, err := (&Qwen3VLRenderer{true}).Render(tt.msgs, tt.tools, nil)
+ rendered, err := (&Qwen3VLRenderer{isThinking: true}).Render(tt.msgs, tt.tools, nil)
if err != nil {
t.Fatal(err)
}
diff --git a/model/renderers/renderer.go b/model/renderers/renderer.go
index 96a90825..d995579c 100644
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -17,6 +17,11 @@ type (
}
)
+// RenderImgTags is a global flag that tells renderers to use [img] tags
+// for images. It is set to true by the Ollama server package on init and
+// left as false in other environments where renderers are used.
+var RenderImgTags bool
+
func (r *RendererRegistry) Register(name string, renderer RendererConstructor) {
r.renderers[name] = renderer
}
@@ -46,10 +51,10 @@ func rendererForName(name string) Renderer {
renderer := &Qwen3CoderRenderer{}
return renderer
case "qwen3-vl-instruct":
- renderer := &Qwen3VLRenderer{false}
+ renderer := &Qwen3VLRenderer{isThinking: false, useImgTags: RenderImgTags}
return renderer
case "qwen3-vl-thinking":
- renderer := &Qwen3VLRenderer{true}
+ renderer := &Qwen3VLRenderer{isThinking: true, useImgTags: RenderImgTags}
return renderer
default:
return nil
diff --git a/server/routes.go b/server/routes.go
index e65b0ed7..80c00cb6 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -39,6 +39,7 @@ import (
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/middleware"
"github.com/ollama/ollama/model/parsers"
+ "github.com/ollama/ollama/model/renderers"
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry"
"github.com/ollama/ollama/template"
@@ -91,6 +92,9 @@ func init() {
}
gin.SetMode(mode)
+
+ // Tell renderers to use [img] tags
+ renderers.RenderImgTags = true
}
var (