renderers: add global flag for setting [img] tags (#12669)

Adds a temporary global flag to renderers that causes renderers to always
render images as [img]. In a follow-up change, we will consider making this
the default, and this flag could eventually be removed.
This commit is contained in:
Jeffrey Morgan
2025-10-16 16:37:32 -07:00
committed by GitHub
parent e2a0b24435
commit 65fb3ff49d
5 changed files with 56 additions and 14 deletions

View File

@@ -48,13 +48,22 @@ func marshalWithSpaces(v any) ([]byte, error) {
type Qwen3VLRenderer struct { type Qwen3VLRenderer struct {
isThinking bool isThinking bool
useImgTags bool
} }
func (r *Qwen3VLRenderer) renderContent(content api.Message, doVisionCount bool) string { func (r *Qwen3VLRenderer) renderContent(content api.Message) string {
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go // This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
var subSb strings.Builder var subSb strings.Builder
for range content.Images { for range content.Images {
subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>") // TODO: (jmorganca): how to render this is different for different
// model backends, and so we should eventually parameterize this or
// only output a placeholder such as [img]
if r.useImgTags {
subSb.WriteString("[img]")
} else {
subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
}
} }
// TODO: support videos // TODO: support videos
@@ -88,7 +97,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
message := messages[i] message := messages[i]
if multiStepTool && message.Role == "user" { if multiStepTool && message.Role == "user" {
// Check if content starts with <tool_response> and ends with </tool_response> // Check if content starts with <tool_response> and ends with </tool_response>
content := r.renderContent(message, true) content := r.renderContent(message)
if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) { if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) {
multiStepTool = false multiStepTool = false
lastQueryIndex = i lastQueryIndex = i
@@ -97,7 +106,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
} }
for i, message := range messages { for i, message := range messages {
content := r.renderContent(message, true) content := r.renderContent(message)
lastMessage := i == len(messages)-1 lastMessage := i == len(messages)-1
prefill := lastMessage && message.Role == "assistant" prefill := lastMessage && message.Role == "assistant"

View File

@@ -9,11 +9,12 @@ import (
func TestQwen3VLNonThinkingRenderer(t *testing.T) { func TestQwen3VLNonThinkingRenderer(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
msgs []api.Message msgs []api.Message
images []api.ImageData images []api.ImageData
tools []api.Tool tools []api.Tool
expected string useImgTags bool
expected string
}{ }{
{ {
name: "prefill", name: "prefill",
@@ -90,6 +91,18 @@ I'll check the weather in San Francisco for you.<think>Speak poetry after the fi
expected: `<|im_start|>user expected: `<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|> <|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
<|im_start|>assistant <|im_start|>assistant
Let me analyze this image.`,
},
{
name: "Image with image tags",
msgs: []api.Message{
{Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData("img2")}},
{Role: "assistant", Content: "Let me analyze this image."},
},
useImgTags: true,
expected: `<|im_start|>user
[img]Describe this image.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`, Let me analyze this image.`,
}, },
{ {
@@ -102,7 +115,18 @@ Let me analyze this image.`,
<|im_start|>assistant <|im_start|>assistant
`, `,
}, },
{
name: "Multiple images with image tags",
msgs: []api.Message{
{Role: "user", Content: "Describe these images.", Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")}},
{Role: "assistant", Content: "Let me analyze this image."},
},
useImgTags: true,
expected: `<|im_start|>user
[img][img]Describe these images.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},
// // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args // // NOTE: solved with #12518: https://github.com/ollama/ollama/compare/main...drifkin/stable-tool-args
// { // {
// name: "with tools and response", // name: "with tools and response",
@@ -485,7 +509,7 @@ I'll check.
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
rendered, err := (&Qwen3VLRenderer{false}).Render(tt.msgs, tt.tools, nil) rendered, err := (&Qwen3VLRenderer{isThinking: false, useImgTags: tt.useImgTags}).Render(tt.msgs, tt.tools, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View File

@@ -323,7 +323,7 @@ Speak poetry after the first sentence.</think><think>Speak poetry after the seco
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
rendered, err := (&Qwen3VLRenderer{true}).Render(tt.msgs, tt.tools, nil) rendered, err := (&Qwen3VLRenderer{isThinking: true}).Render(tt.msgs, tt.tools, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View File

@@ -17,6 +17,11 @@ type (
} }
) )
// RenderImgTags is a global flag that tells renderers to use [img] tags
// for images. This is set by the Ollama server package on init, or left as
// false for other environments where renderers are used
var RenderImgTags bool
func (r *RendererRegistry) Register(name string, renderer RendererConstructor) { func (r *RendererRegistry) Register(name string, renderer RendererConstructor) {
r.renderers[name] = renderer r.renderers[name] = renderer
} }
@@ -46,10 +51,10 @@ func rendererForName(name string) Renderer {
renderer := &Qwen3CoderRenderer{} renderer := &Qwen3CoderRenderer{}
return renderer return renderer
case "qwen3-vl-instruct": case "qwen3-vl-instruct":
renderer := &Qwen3VLRenderer{false} renderer := &Qwen3VLRenderer{isThinking: false, useImgTags: RenderImgTags}
return renderer return renderer
case "qwen3-vl-thinking": case "qwen3-vl-thinking":
renderer := &Qwen3VLRenderer{true} renderer := &Qwen3VLRenderer{isThinking: true, useImgTags: RenderImgTags}
return renderer return renderer
default: default:
return nil return nil

View File

@@ -39,6 +39,7 @@ import (
"github.com/ollama/ollama/logutil" "github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/middleware" "github.com/ollama/ollama/middleware"
"github.com/ollama/ollama/model/parsers" "github.com/ollama/ollama/model/parsers"
"github.com/ollama/ollama/model/renderers"
"github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/server/internal/registry"
"github.com/ollama/ollama/template" "github.com/ollama/ollama/template"
@@ -91,6 +92,9 @@ func init() {
} }
gin.SetMode(mode) gin.SetMode(mode)
// Tell renderers to use [img] tags
renderers.RenderImgTags = true
} }
var ( var (