diff --git a/server/routes.go b/server/routes.go
index cc891353..ae166214 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1673,6 +1673,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
OpeningTag: openingTag,
ClosingTag: closingTag,
}
+
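+ // Some templates append the opening think tag to the end of the rendered prompt;
+ // if so, seed the parser so the model's output is parsed as thinking from the start.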
+ if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
+ thinkingState.AddContent(openingTag)
+ }
}
var toolParser *tools.Parser
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index a57975f1..a3b83fc1 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -969,3 +969,233 @@ func TestGenerate(t *testing.T) {
}
})
}
+
+func TestChatWithPromptEndingInThinkTag(t *testing.T) {
+ gin.SetMode(gin.TestMode)
+
+ // Helper to create a standard thinking test setup
+ setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
+ mock := &mockRunner{
+ CompletionResponse: llm.CompletionResponse{
+ Done: true,
+ DoneReason: llm.DoneReasonStop,
+ PromptEvalCount: 1,
+ PromptEvalDuration: 1,
+ EvalCount: 1,
+ EvalDuration: 1,
+ },
+ }
+
+ s := &Server{
+ sched: &Scheduler{
+ pendingReqCh: make(chan *LlmRequest, 1),
+ finishedReqCh: make(chan *LlmRequest, 1),
+ expiredCh: make(chan *runnerRef, 1),
+ unloadedCh: make(chan any, 1),
+ loaded: make(map[string]*runnerRef),
+ newServerFn: newMockServer(mock),
+ getGpuFn: discover.GetGPUInfo,
+ getCpuFn: discover.GetCPUInfo,
+ reschedDelay: 250 * time.Millisecond,
+ loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+ time.Sleep(time.Millisecond)
+ req.successCh <- &runnerRef{llama: mock}
+ return false
+ },
+ },
+ }
+
+ go s.sched.Run(t.Context())
+
+ // Create a model with thinking support
+ _, digest := createBinFile(t, ggml.KV{
+ "general.architecture": "llama",
+ "llama.block_count": uint32(1),
+ "llama.context_length": uint32(8192),
+ "llama.embedding_length": uint32(4096),
+ "llama.attention.head_count": uint32(32),
+ "llama.attention.head_count_kv": uint32(8),
+ "tokenizer.ggml.tokens": []string{""},
+ "tokenizer.ggml.scores": []float32{0},
+ "tokenizer.ggml.token_type": []int32{0},
+ }, []*ggml.Tensor{
+ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ {Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
+ })
+
+ // Create a model with a thinking template that adds <think> at the end
+ w := createRequest(t, s.CreateHandler, api.CreateRequest{
+ Model: "test-thinking",
+ Files: map[string]string{"file.gguf": digest},
+ Template: `{{- range .Messages }}
+{{- if eq .Role "user" }}user: {{ .Content }}
+{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
+{{ end }}{{ end }}<think>`,
+ Stream: &stream,
+ })
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected status 200, got %d", w.Code)
+ }
+
+ return mock, s
+ }
+
+ mock, s := setupThinkingTest(t)
+
+ // Helper to test chat responses
+ testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
+ t.Run(name, func(t *testing.T) {
+ mock.CompletionResponse = llm.CompletionResponse{
+ Content: modelResponse,
+ Done: true,
+ DoneReason: llm.DoneReasonStop,
+ PromptEvalCount: 1,
+ PromptEvalDuration: 1,
+ EvalCount: 1,
+ EvalDuration: 1,
+ }
+ mock.CompletionFn = nil
+
+ streamRequest := false
+ req := api.ChatRequest{
+ Model: "test-thinking",
+ Messages: []api.Message{
+ {Role: "user", Content: userContent},
+ },
+ Stream: &streamRequest,
+ }
+ if think {
+ req.Think = &api.ThinkValue{Value: think}
+ }
+
+ w := createRequest(t, s.ChatHandler, req)
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected status 200, got %d", w.Code)
+ }
+
+ var resp api.ChatResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatal(err)
+ }
+
+ if resp.Message.Thinking != expectedThinking {
+ t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
+ }
+
+ if resp.Message.Content != expectedContent {
+ t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
+ }
+ })
+ }
+
+ // Test cases - Note: Template adds <think> at the end, and leading whitespace after the <think> and </think> tags is eaten by the parser
+ testChatRequest(t, "basic thinking response",
+ "Help me solve this problem",
+ " Let me think about this step by step... The answer is 42.",
+ "Let me think about this step by step... ",
+ "The answer is 42.",
+ true)
+
+ testChatRequest(t, "thinking with multiple sentences",
+ "Explain quantum computing",
+ " First, I need to understand the basics. Quantum bits can be in superposition. Quantum computing uses quantum mechanics principles.",
+ "First, I need to understand the basics. Quantum bits can be in superposition. ",
+ "Quantum computing uses quantum mechanics principles.",
+ true)
+
+ testChatRequest(t, "no thinking content",
+ "What is 2+2?",
+ " The answer is 4.",
+ "",
+ "The answer is 4.",
+ true)
+
+ testChatRequest(t, "thinking disabled but template still adds think tag",
+ "Simple question",
+ " My thoughts The answer.",
+ "",
+ " My thoughts The answer.",
+ false)
+
+ // Test streaming response with template-added <think>
+ t.Run("streaming with thinking", func(t *testing.T) {
+ var wg sync.WaitGroup
+ wg.Add(1)
+
+ mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
+ defer wg.Done()
+
+ // Verify the prompt ends with <think> due to the template
+ if !strings.HasSuffix(r.Prompt, "<think>") {
+ t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
+ }
+
+ // Simulate streaming chunks
+ responses := []llm.CompletionResponse{
+ {Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
+ {Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
+ {Content: " Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
+ {Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
+ }
+
+ for _, resp := range responses {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ default:
+ fn(resp)
+ time.Sleep(10 * time.Millisecond)
+ }
+ }
+ return nil
+ }
+
+ think := true
+ w := createRequest(t, s.ChatHandler, api.ChatRequest{
+ Model: "test-thinking",
+ Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
+ Think: &api.ThinkValue{Value: think},
+ Stream: &stream,
+ })
+
+ wg.Wait()
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected status 200, got %d", w.Code)
+ }
+
+ // Parse streaming responses
+ decoder := json.NewDecoder(w.Body)
+ var allThinking, allContent strings.Builder
+
+ for {
+ var resp api.ChatResponse
+ if err := decoder.Decode(&resp); err == io.EOF {
+ break
+ } else if err != nil {
+ t.Fatal(err)
+ }
+ allThinking.WriteString(resp.Message.Thinking)
+ allContent.WriteString(resp.Message.Content)
+ }
+
+ // Note: Leading whitespace after <think> is eaten by the parser
+ if got := allThinking.String(); got != "I need to consider multiple factors here... " {
+ t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
+ }
+
+ if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
+ t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
+ }
+ })
+}