diff --git a/model/parsers/parsers.go b/model/parsers/parsers.go index ea92a915..4374f3e2 100644 --- a/model/parsers/parsers.go +++ b/model/parsers/parsers.go @@ -45,6 +45,9 @@ func ParserForName(name string) Parser { case "qwen3-vl-instruct": parser := &Qwen3VLParser{hasThinkingSupport: false} return parser + case "qwen3-vl-thinking": + parser := &Qwen3VLParser{hasThinkingSupport: true} + return parser case "passthrough": return &PassthroughParser{} case "harmony": diff --git a/model/parsers/qwen3vl.go b/model/parsers/qwen3vl.go index 965e3246..75ee6abe 100644 --- a/model/parsers/qwen3vl.go +++ b/model/parsers/qwen3vl.go @@ -22,7 +22,6 @@ const ( thinkingCloseTag = "" ) -// TODO(gguo): add a field for isThinking type Qwen3VLParser struct { state qwenParserState buffer strings.Builder @@ -34,21 +33,28 @@ func (p *Qwen3VLParser) HasToolSupport() bool { return true } -// TODO(gguo): changes this to reference an objects param func (p *Qwen3VLParser) HasThinkingSupport() bool { return p.hasThinkingSupport } -func (p *Qwen3VLParser) initialState() qwenParserState { - if p.HasThinkingSupport() { // has thinking, start from collecting thinking content - return CollectingThinkingContent +func (p *Qwen3VLParser) setInitialState(lastMessage *api.Message) { + prefill := lastMessage != nil && lastMessage.Role == "assistant" + if !p.HasThinkingSupport() { + p.state = CollectingContent + return } - return CollectingContent + + if prefill && lastMessage.Content != "" { + p.state = CollectingContent + return + } + + p.state = CollectingThinkingContent } func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool { p.tools = tools - p.state = p.initialState() + p.setInitialState(lastMessage) return tools } @@ -63,7 +69,8 @@ func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking strin events := p.parseEvents() var toolCalls []api.ToolCall - var sb strings.Builder + var contentSb strings.Builder + var thinkingSb strings.Builder for _, event 
:= range events { switch event := event.(type) { case qwenEventRawToolCall: @@ -74,15 +81,15 @@ func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking strin } toolCalls = append(toolCalls, toolCall) case qwenEventThinkingContent: - sb.WriteString(event.content) + thinkingSb.WriteString(event.content) case qwenEventContent: // TODO(drifkin): if the same turn contains multiple interleaved content // events, we naively append them together here. - sb.WriteString(event.content) + contentSb.WriteString(event.content) } } - return sb.String(), "", toolCalls, nil + return contentSb.String(), thinkingSb.String(), toolCalls, nil } func (p *Qwen3VLParser) parseEvents() []qwenEvent { @@ -155,7 +162,7 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { case CollectingToolContent: if strings.Contains(p.buffer.String(), toolCloseTag) { split := strings.SplitN(p.buffer.String(), toolCloseTag, 2) - before := split[0] + before := split[0] // TODO: should trailing whitespace also be trimmed here for tool calls, as is done for thinking content?
if len(before) == 0 { slog.Warn("qwen tool call closing tag found but no content before it") } @@ -169,10 +176,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { } else { return events, false } - case CollectingThinkingContent: // so we want to hip the unambiguous stuff + case CollectingThinkingContent: if strings.Contains(p.buffer.String(), thinkingCloseTag) { split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2) - before := split[0] + // drop trailing whitespace that precedes the thinking close tag + before := strings.TrimRightFunc(split[0], unicode.IsSpace) if len(before) == 0 { slog.Warn("qwen tool call closing tag found but no content before it") } @@ -184,7 +192,7 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { p.buffer.WriteString(after) p.state = CollectingContent return events, true - } else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 { // we see part of a close thinking tag + } else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 { beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen] trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag) ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen diff --git a/model/parsers/qwen3vl_thinking_test.go b/model/parsers/qwen3vl_thinking_test.go index a94344a4..d85a60fd 100644 --- a/model/parsers/qwen3vl_thinking_test.go +++ b/model/parsers/qwen3vl_thinking_test.go @@ -344,3 +344,205 @@ func TestQwen3VLThinkingToolParser(t *testing.T) { } } } + +func TestQwen3VLParserState(t *testing.T) { + cases := []struct { + desc string + hasThinking bool + last *api.Message + wantState qwenParserState + }{ + { + desc: "no thinking support => CollectingContent", + hasThinking: false, + last: nil, + wantState: CollectingContent, + }, + { + desc: "thinking support, no last message => CollectingThinkingContent", + hasThinking: true, + last: nil, + wantState: CollectingThinkingContent, + }, + { + desc: "thinking support, last assistant with empty 
content => CollectingThinkingContent", + hasThinking: true, + last: &api.Message{Role: "assistant", Content: ""}, + wantState: CollectingThinkingContent, + }, + { + desc: "thinking support, last assistant with content => CollectingContent", + hasThinking: true, + last: &api.Message{Role: "assistant", Content: "hello"}, + wantState: CollectingContent, + }, + { + desc: "thinking support, last is user => CollectingThinkingContent", + hasThinking: true, + last: &api.Message{Role: "user", Content: "hi"}, + wantState: CollectingThinkingContent, + }, + } + + for _, tc := range cases { + parser := Qwen3VLParser{hasThinkingSupport: tc.hasThinking} + parser.Init(nil, tc.last) + if parser.state != tc.wantState { + t.Errorf("%s: got state %v, want %v", tc.desc, parser.state, tc.wantState) + } + } +} + +func TestQwen3VLThinkingParserWithThinkingPrefill(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "thinking prefill", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventThinkingContent{content: "abc"}}}, + }, + }, + { + desc: "thinking prefill with content", + steps: []step{ + {input: "abc def", wantEvents: []qwenEvent{qwenEventContent{content: "def"}}}, + }, + }, + { + desc: "thinking prefill with fakeout", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{}}, + }, + }, + { + desc: "thinking prefill with spaces", + steps: []step{ + {input: " starting content", wantEvents: []qwenEvent{qwenEventContent{content: "starting content"}}}, + }, + }, + } + last := &api.Message{Role: "assistant", Thinking: "i am thinking"} // assistant prefill with thinking but empty Content, so Init leaves the parser in CollectingThinkingContent + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: true} + parser.Init([]api.Tool{}, last) + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if 
len(gotEvents) == 0 && len(step.wantEvents) == 0 { + // avoid deep equal on empty vs. nil slices + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +} + +func TestQwen3VLThinkingParserWithNonThinkingPrefill(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "thinking prefill", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventContent{content: "abc"}}}, + }, + }, + { + desc: "thinking prefill with content", + steps: []step{ + {input: "abc def", wantEvents: []qwenEvent{qwenEventContent{content: "ink> def"}}}, + }, + }, + { + desc: "thinking prefill with fakeout", + steps: []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventContent{content: ">"}}}, + }, + }, + { + desc: "thinking prefill with spaces", + steps: []step{ + {input: " starting content", wantEvents: []qwenEvent{qwenEventContent{content: " starting content"}}}, + }, + }, + } + last := &api.Message{Role: "assistant", Thinking: "i am thinking", Content: "i am content"} // assistant prefill already has Content, so Init starts the parser in CollectingContent even though Thinking is set + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: true} + parser.Init([]api.Tool{}, last) + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if len(gotEvents) == 0 && len(step.wantEvents) == 0 { + // avoid deep equal on empty vs. 
nil slices + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +} + +func TestQwen3VLThinkingParserStreamingAssistantPrefillContent(t *testing.T) { + // last message is assistant with content ⇒ start in CollectingContent + last := &api.Message{Role: "assistant", Content: "has content"} + parser := Qwen3VLParser{hasThinkingSupport: true} + parser.Init([]api.Tool{}, last) + + type step struct { + input string + wantEvents []qwenEvent + } + + steps := []step{ + {input: "abc", wantEvents: []qwenEvent{qwenEventContent{content: "abc"}}}, + {input: "{\"name\": \"x\", \"arguments\": {}}", wantEvents: []qwenEvent{qwenEventRawToolCall{raw: "{\"name\": \"x\", \"arguments\": {}}"}}}, + } + + for i, s := range steps { + parser.buffer.WriteString(s.input) + gotEvents := parser.parseEvents() + if len(gotEvents) == 0 && len(s.wantEvents) == 0 { + continue + } + if !reflect.DeepEqual(gotEvents, s.wantEvents) { + t.Fatalf("step %d: input %q: got %#v, want %#v", i, s.input, gotEvents, s.wantEvents) + } + } +} diff --git a/model/renderers/renderer.go b/model/renderers/renderer.go index e97b4581..96a90825 100644 --- a/model/renderers/renderer.go +++ b/model/renderers/renderer.go @@ -48,6 +48,9 @@ func rendererForName(name string) Renderer { case "qwen3-vl-instruct": renderer := &Qwen3VLRenderer{false} return renderer + case "qwen3-vl-thinking": + renderer := &Qwen3VLRenderer{true} + return renderer default: return nil }