mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-21 14:26:30 +00:00
server: add logprobs and top_logprobs support to Ollama's API (#12899)
Adds logprobs support to Ollama's API, including Ollama's OpenAI-compatible API. When the new boolean 'logprobs' parameter is set, Ollama returns the log probability of each generated token. An integer 'top_logprobs' value of at most 20 can also be specified; when it is, the API additionally returns that many of the most likely tokens at each token position. Co-authored-by: Baptiste Jamin <baptiste@crisp.chat>
This commit is contained in:
79
runner/common/logprob.go
Normal file
79
runner/common/logprob.go
Normal file
@@ -0,0 +1,79 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
// TokenDecoderFunc converts a single token ID into its text.
// CalculateLogprobs calls it once for the selected token and once for each
// top-K token it reports.
|
||||
type TokenDecoderFunc func(tokenID int) string
|
||||
|
||||
// CalculateLogprobs converts raw logits to log probabilities and finds top K tokens.
|
||||
// It uses numerically stable softmax to compute log probabilities.
|
||||
func CalculateLogprobs(logits []float32, selectedToken int, topK int, decoder TokenDecoderFunc) []llm.Logprob {
|
||||
if len(logits) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Step 1: Convert logits to log probabilities using numerically stable softmax
|
||||
maxLogit := logits[0]
|
||||
for _, logit := range logits[1:] {
|
||||
if logit > maxLogit {
|
||||
maxLogit = logit
|
||||
}
|
||||
}
|
||||
|
||||
var sumExp float64
|
||||
for _, logit := range logits {
|
||||
sumExp += math.Exp(float64(logit - maxLogit))
|
||||
}
|
||||
logSumExp := float32(math.Log(sumExp))
|
||||
|
||||
logProbs := make([]float32, len(logits))
|
||||
for i, logit := range logits {
|
||||
logProbs[i] = (logit - maxLogit) - logSumExp
|
||||
}
|
||||
|
||||
// Step 2: Get selected token's information
|
||||
selectedLogprob := logProbs[selectedToken]
|
||||
selectedText := decoder(selectedToken)
|
||||
|
||||
result := llm.Logprob{
|
||||
TokenLogprob: llm.TokenLogprob{
|
||||
Token: selectedText,
|
||||
Logprob: float64(selectedLogprob),
|
||||
},
|
||||
}
|
||||
|
||||
// Step 3: If topK requested, find the top K tokens
|
||||
if topK > 0 {
|
||||
type tokenLogprobPair struct {
|
||||
tokenID int
|
||||
logprob float32
|
||||
}
|
||||
|
||||
pairs := make([]tokenLogprobPair, len(logProbs))
|
||||
for i, lp := range logProbs {
|
||||
pairs[i] = tokenLogprobPair{tokenID: i, logprob: lp}
|
||||
}
|
||||
|
||||
sort.Slice(pairs, func(i, j int) bool {
|
||||
return pairs[i].logprob > pairs[j].logprob
|
||||
})
|
||||
|
||||
k := min(topK, len(pairs))
|
||||
topLogprobs := make([]llm.TokenLogprob, k)
|
||||
for i := range k {
|
||||
tokenText := decoder(pairs[i].tokenID)
|
||||
topLogprobs[i] = llm.TokenLogprob{
|
||||
Token: tokenText,
|
||||
Logprob: float64(pairs[i].logprob),
|
||||
}
|
||||
}
|
||||
result.TopLogprobs = topLogprobs
|
||||
}
|
||||
|
||||
return []llm.Logprob{result}
|
||||
}
|
||||
498
runner/common/logprob_test.go
Normal file
498
runner/common/logprob_test.go
Normal file
@@ -0,0 +1,498 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
func TestCalculateLogprobs(t *testing.T) {
|
||||
tokens := map[int]string{
|
||||
0: "hello",
|
||||
1: "hi",
|
||||
2: "hey",
|
||||
3: "world",
|
||||
}
|
||||
decoder := func(tokenID int) string {
|
||||
if text, ok := tokens[tokenID]; ok {
|
||||
return text
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logits []float32
|
||||
selectedToken int
|
||||
topK int
|
||||
wantLen int
|
||||
wantToken string
|
||||
}{
|
||||
{
|
||||
name: "Empty logits",
|
||||
logits: []float32{},
|
||||
selectedToken: 0,
|
||||
topK: 0,
|
||||
wantLen: 0,
|
||||
},
|
||||
{
|
||||
name: "Single token without top logprobs",
|
||||
logits: []float32{1.0, 0.5, 0.3, 0.1},
|
||||
selectedToken: 0,
|
||||
topK: 0,
|
||||
wantLen: 1,
|
||||
wantToken: "hello",
|
||||
},
|
||||
{
|
||||
name: "Single token with top logprobs",
|
||||
logits: []float32{1.0, 0.5, 0.3, 0.1},
|
||||
selectedToken: 0,
|
||||
topK: 3,
|
||||
wantLen: 1,
|
||||
wantToken: "hello",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := CalculateLogprobs(tt.logits, tt.selectedToken, tt.topK, decoder)
|
||||
if len(result) != tt.wantLen {
|
||||
t.Errorf("CalculateLogprobs() returned %d results, want %d", len(result), tt.wantLen)
|
||||
}
|
||||
if tt.wantLen > 0 && result[0].Token != tt.wantToken {
|
||||
t.Errorf("CalculateLogprobs() token = %s, want %s", result[0].Token, tt.wantToken)
|
||||
}
|
||||
if tt.topK > 0 && len(result) > 0 {
|
||||
if len(result[0].TopLogprobs) != tt.topK {
|
||||
t.Errorf("CalculateLogprobs() top logprobs count = %d, want %d", len(result[0].TopLogprobs), tt.topK)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLogprobsNumericalStability(t *testing.T) {
|
||||
tokens := map[int]string{
|
||||
0: "a",
|
||||
1: "b",
|
||||
2: "c",
|
||||
}
|
||||
decoder := func(tokenID int) string {
|
||||
if text, ok := tokens[tokenID]; ok {
|
||||
return text
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Test with very large logits to ensure numerical stability
|
||||
logits := []float32{1000.0, 999.0, 998.0}
|
||||
result := CalculateLogprobs(logits, 0, 3, decoder)
|
||||
|
||||
if len(result) != 1 {
|
||||
t.Fatalf("Expected 1 result, got %d", len(result))
|
||||
}
|
||||
|
||||
// Check that log probabilities are finite and reasonable
|
||||
if math.IsInf(result[0].Logprob, 0) || math.IsNaN(result[0].Logprob) {
|
||||
t.Errorf("Selected token logprob is not finite: %f", result[0].Logprob)
|
||||
}
|
||||
|
||||
for i, tlp := range result[0].TopLogprobs {
|
||||
if math.IsInf(tlp.Logprob, 0) || math.IsNaN(tlp.Logprob) {
|
||||
t.Errorf("Top logprob[%d] is not finite: %f", i, tlp.Logprob)
|
||||
}
|
||||
}
|
||||
|
||||
// Top logprobs should be in descending order
|
||||
for i := 1; i < len(result[0].TopLogprobs); i++ {
|
||||
if result[0].TopLogprobs[i].Logprob > result[0].TopLogprobs[i-1].Logprob {
|
||||
t.Errorf("Top logprobs not in descending order: %f > %f",
|
||||
result[0].TopLogprobs[i].Logprob, result[0].TopLogprobs[i-1].Logprob)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLogprobsProbabilityCorrectness(t *testing.T) {
|
||||
tokens := map[int]string{
|
||||
0: "hello",
|
||||
1: "world",
|
||||
2: "foo",
|
||||
3: "bar",
|
||||
}
|
||||
decoder := func(tokenID int) string {
|
||||
if text, ok := tokens[tokenID]; ok {
|
||||
return text
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logits []float32
|
||||
selectedToken int
|
||||
topK int
|
||||
}{
|
||||
{
|
||||
name: "Uniform logits",
|
||||
logits: []float32{1.0, 1.0, 1.0, 1.0},
|
||||
selectedToken: 0,
|
||||
topK: 4,
|
||||
},
|
||||
{
|
||||
name: "Different logits",
|
||||
logits: []float32{2.0, 1.0, 0.5, 0.1},
|
||||
selectedToken: 0,
|
||||
topK: 4,
|
||||
},
|
||||
{
|
||||
name: "Negative logits",
|
||||
logits: []float32{-1.0, -2.0, -3.0, -4.0},
|
||||
selectedToken: 0,
|
||||
topK: 4,
|
||||
},
|
||||
{
|
||||
name: "Mixed logits",
|
||||
logits: []float32{5.0, -5.0, 0.0, 2.5},
|
||||
selectedToken: 0,
|
||||
topK: 4,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := CalculateLogprobs(tt.logits, tt.selectedToken, tt.topK, decoder)
|
||||
|
||||
if len(result) != 1 {
|
||||
t.Fatalf("Expected 1 result, got %d", len(result))
|
||||
}
|
||||
|
||||
// Verify all probabilities are non-positive (log probabilities should be <= 0)
|
||||
if result[0].Logprob > 0 {
|
||||
t.Errorf("Selected token logprob should be <= 0, got %f", result[0].Logprob)
|
||||
}
|
||||
|
||||
for i, tlp := range result[0].TopLogprobs {
|
||||
if tlp.Logprob > 0 {
|
||||
t.Errorf("Top logprob[%d] should be <= 0, got %f", i, tlp.Logprob)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify that probabilities sum to approximately 1
|
||||
// Sum of exp(logprob) for all tokens should equal 1
|
||||
var probSum float64
|
||||
for _, lp := range result[0].TopLogprobs {
|
||||
probSum += math.Exp(lp.Logprob)
|
||||
}
|
||||
|
||||
// For uniform logits, each probability should be 1/n
|
||||
if tt.name == "Uniform logits" {
|
||||
expectedProb := 1.0 / float64(len(tt.logits))
|
||||
actualProb := math.Exp(result[0].Logprob)
|
||||
if math.Abs(actualProb-expectedProb) > 1e-6 {
|
||||
t.Errorf("For uniform logits, expected probability %f, got %f",
|
||||
expectedProb, actualProb)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify top logprobs are sorted in descending order
|
||||
for i := 1; i < len(result[0].TopLogprobs); i++ {
|
||||
if result[0].TopLogprobs[i].Logprob > result[0].TopLogprobs[i-1].Logprob {
|
||||
t.Errorf("Top logprobs not sorted: position %d (%f) > position %d (%f)",
|
||||
i, result[0].TopLogprobs[i].Logprob,
|
||||
i-1, result[0].TopLogprobs[i-1].Logprob)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify the selected token appears in top logprobs
|
||||
selectedText := decoder(tt.selectedToken)
|
||||
found := false
|
||||
for _, tlp := range result[0].TopLogprobs {
|
||||
if tlp.Token == selectedText {
|
||||
found = true
|
||||
// The logprob in top logprobs should match the selected token's logprob
|
||||
if math.Abs(tlp.Logprob-result[0].Logprob) > 1e-6 {
|
||||
t.Errorf("Selected token logprob mismatch: main=%f, in top=%f",
|
||||
result[0].Logprob, tlp.Logprob)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Selected token %q not found in top logprobs", selectedText)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLogprobsSoftmaxCorrectness(t *testing.T) {
|
||||
// Test that softmax calculation is correct by verifying probabilities sum to 1
|
||||
decoder := func(tokenID int) string {
|
||||
return string(rune('A' + tokenID))
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logits []float32
|
||||
}{
|
||||
{
|
||||
name: "Small vocabulary",
|
||||
logits: []float32{1.0, 2.0, 3.0},
|
||||
},
|
||||
{
|
||||
name: "Large differences",
|
||||
logits: []float32{10.0, 0.0, -10.0},
|
||||
},
|
||||
{
|
||||
name: "All equal",
|
||||
logits: []float32{5.0, 5.0, 5.0, 5.0, 5.0},
|
||||
},
|
||||
{
|
||||
name: "Very large values",
|
||||
logits: []float32{500.0, 499.0, 498.0},
|
||||
},
|
||||
{
|
||||
name: "Very small values",
|
||||
logits: []float32{-500.0, -499.0, -498.0},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Calculate logprobs for all tokens
|
||||
var totalProb float64
|
||||
for i := range tt.logits {
|
||||
result := CalculateLogprobs(tt.logits, i, 0, decoder)
|
||||
if len(result) != 1 {
|
||||
t.Fatalf("Expected 1 result, got %d", len(result))
|
||||
}
|
||||
prob := math.Exp(result[0].Logprob)
|
||||
totalProb += prob
|
||||
|
||||
// Verify each probability is between 0 and 1
|
||||
if prob < 0 || prob > 1 {
|
||||
t.Errorf("Token %d probability %f is out of range [0, 1]", i, prob)
|
||||
}
|
||||
}
|
||||
|
||||
// Total probability should be very close to 1.0 (allowing for floating point errors)
|
||||
if math.Abs(totalProb-1.0) > 1e-5 {
|
||||
t.Errorf("Total probability sum is %f, expected 1.0", totalProb)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLogprobsSelectedTokenCorrectness(t *testing.T) {
|
||||
decoder := func(tokenID int) string {
|
||||
return string(rune('A' + tokenID))
|
||||
}
|
||||
|
||||
logits := []float32{3.0, 1.0, 2.0, 0.5}
|
||||
|
||||
// Test that selecting different tokens gives the correct probabilities
|
||||
// and that the highest logit has the highest probability
|
||||
maxLogitIndex := 0
|
||||
maxLogitValue := logits[0]
|
||||
for i, logit := range logits[1:] {
|
||||
if logit > maxLogitValue {
|
||||
maxLogitValue = logit
|
||||
maxLogitIndex = i + 1
|
||||
}
|
||||
}
|
||||
|
||||
var maxProb float64
|
||||
var maxProbIndex int
|
||||
|
||||
for i := range logits {
|
||||
result := CalculateLogprobs(logits, i, 0, decoder)
|
||||
prob := math.Exp(result[0].Logprob)
|
||||
|
||||
if prob > maxProb {
|
||||
maxProb = prob
|
||||
maxProbIndex = i
|
||||
}
|
||||
|
||||
// Verify the token matches
|
||||
expectedToken := decoder(i)
|
||||
if result[0].Token != expectedToken {
|
||||
t.Errorf("Token %d: expected token %q, got %q", i, expectedToken, result[0].Token)
|
||||
}
|
||||
}
|
||||
|
||||
// The token with the highest logit should have the highest probability
|
||||
if maxProbIndex != maxLogitIndex {
|
||||
t.Errorf("Token with highest probability (%d) doesn't match token with highest logit (%d)",
|
||||
maxProbIndex, maxLogitIndex)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalculateLogprobsTopKOrdering(t *testing.T) {
|
||||
tokens := map[int]string{
|
||||
0: "first",
|
||||
1: "second",
|
||||
2: "third",
|
||||
3: "fourth",
|
||||
4: "fifth",
|
||||
}
|
||||
decoder := func(tokenID int) string {
|
||||
return tokens[tokenID]
|
||||
}
|
||||
|
||||
// Logits in non-sorted order
|
||||
logits := []float32{2.0, 5.0, 1.0, 4.0, 3.0}
|
||||
// Expected order by probability: 1 (5.0), 3 (4.0), 4 (3.0), 0 (2.0), 2 (1.0)
|
||||
expectedOrder := []string{"second", "fourth", "fifth", "first", "third"}
|
||||
|
||||
result := CalculateLogprobs(logits, 0, 5, decoder)
|
||||
|
||||
if len(result) != 1 {
|
||||
t.Fatalf("Expected 1 result, got %d", len(result))
|
||||
}
|
||||
|
||||
if len(result[0].TopLogprobs) != 5 {
|
||||
t.Fatalf("Expected 5 top logprobs, got %d", len(result[0].TopLogprobs))
|
||||
}
|
||||
|
||||
// Verify ordering matches expected
|
||||
for i, tlp := range result[0].TopLogprobs {
|
||||
if tlp.Token != expectedOrder[i] {
|
||||
t.Errorf("Position %d: expected token %q, got %q", i, expectedOrder[i], tlp.Token)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify probabilities are in descending order
|
||||
for i := 1; i < len(result[0].TopLogprobs); i++ {
|
||||
if result[0].TopLogprobs[i].Logprob > result[0].TopLogprobs[i-1].Logprob {
|
||||
t.Errorf("Probabilities not in descending order at position %d: %f > %f",
|
||||
i, result[0].TopLogprobs[i].Logprob, result[0].TopLogprobs[i-1].Logprob)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogprobsWithStopSequences(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
pendingResponses []string
|
||||
pendingLogprobs []llm.Logprob
|
||||
stop string
|
||||
expectedResponses []string
|
||||
expectedLogprobs int
|
||||
}{
|
||||
{
|
||||
name: "Single token stop",
|
||||
pendingResponses: []string{"Hello", " world", "!"},
|
||||
pendingLogprobs: []llm.Logprob{
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " world", Logprob: -0.2}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "!", Logprob: -0.3}},
|
||||
},
|
||||
stop: "!",
|
||||
expectedResponses: []string{"Hello", " world"},
|
||||
expectedLogprobs: 2,
|
||||
},
|
||||
{
|
||||
name: "Multi-token stop sequence",
|
||||
pendingResponses: []string{"Hello", " ", "there", "STOP"},
|
||||
pendingLogprobs: []llm.Logprob{
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.2}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "there", Logprob: -0.3}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "STOP", Logprob: -0.4}},
|
||||
},
|
||||
stop: "STOP",
|
||||
expectedResponses: []string{"Hello", " ", "there"},
|
||||
expectedLogprobs: 3,
|
||||
},
|
||||
{
|
||||
name: "Partial token stop",
|
||||
pendingResponses: []string{"Hello", " the", "re!"},
|
||||
pendingLogprobs: []llm.Logprob{
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " the", Logprob: -0.2}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "re!", Logprob: -0.3}},
|
||||
},
|
||||
stop: "there!",
|
||||
expectedResponses: []string{"Hello", " "},
|
||||
expectedLogprobs: 2,
|
||||
},
|
||||
{
|
||||
name: "Stop at beginning of last token",
|
||||
pendingResponses: []string{"Hello", " world", "END"},
|
||||
pendingLogprobs: []llm.Logprob{
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " world", Logprob: -0.2}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "END", Logprob: -0.3}},
|
||||
},
|
||||
stop: "END",
|
||||
expectedResponses: []string{"Hello", " world"},
|
||||
expectedLogprobs: 2,
|
||||
},
|
||||
{
|
||||
name: "Multi-token stop across tokens",
|
||||
pendingResponses: []string{"Text", " ", "with", " ", "stop", " ", "word"},
|
||||
pendingLogprobs: []llm.Logprob{
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "Text", Logprob: -0.1}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.2}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "with", Logprob: -0.3}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.4}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "stop", Logprob: -0.5}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.6}},
|
||||
{TokenLogprob: llm.TokenLogprob{Token: "word", Logprob: -0.7}},
|
||||
},
|
||||
stop: "stop word",
|
||||
expectedResponses: []string{"Text", " ", "with", " "},
|
||||
expectedLogprobs: 4,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Simulate the stop sequence detection and truncation
|
||||
origLen := len(tt.pendingResponses)
|
||||
responses, tokenTruncated := TruncateStop(tt.pendingResponses, tt.stop)
|
||||
newLen := len(responses)
|
||||
|
||||
// Simulate logprobs truncation
|
||||
logprobs := make([]llm.Logprob, len(tt.pendingLogprobs))
|
||||
copy(logprobs, tt.pendingLogprobs)
|
||||
|
||||
origLogprobsLen := len(logprobs)
|
||||
numTokensRemoved := origLen - newLen
|
||||
newLogprobsLen := origLogprobsLen - numTokensRemoved
|
||||
if newLogprobsLen < 0 {
|
||||
newLogprobsLen = 0
|
||||
}
|
||||
logprobs = logprobs[:newLogprobsLen]
|
||||
|
||||
// Verify responses were truncated correctly
|
||||
if len(responses) != len(tt.expectedResponses) {
|
||||
t.Errorf("Expected %d responses, got %d", len(tt.expectedResponses), len(responses))
|
||||
}
|
||||
|
||||
// Verify logprobs count matches truncated responses
|
||||
if len(logprobs) != tt.expectedLogprobs {
|
||||
t.Errorf("Expected %d logprobs after truncation, got %d", tt.expectedLogprobs, len(logprobs))
|
||||
}
|
||||
|
||||
// Verify logprobs count matches response count
|
||||
if len(logprobs) != len(responses) {
|
||||
t.Errorf("Logprobs count (%d) doesn't match responses count (%d)", len(logprobs), len(responses))
|
||||
}
|
||||
|
||||
// Verify the correct logprobs were kept (skip last token if it was truncated)
|
||||
// When tokenTruncated is true, the last response token may not match the logprob token
|
||||
checkLen := len(logprobs)
|
||||
if tokenTruncated && checkLen > 0 {
|
||||
checkLen-- // Skip checking the last token when it was partially truncated
|
||||
}
|
||||
|
||||
for i := range checkLen {
|
||||
if i < len(responses) && logprobs[i].Token != responses[i] {
|
||||
t.Errorf("Logprob[%d] token %q doesn't match response[%d] %q",
|
||||
i, logprobs[i].Token, i, responses[i])
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,12 @@ import (
|
||||
"github.com/ollama/ollama/runner/common"
|
||||
)
|
||||
|
||||
// response is the unit sent over a Sequence's responses channel: a piece of
// generated text together with the logprobs that were pending alongside it.
|
||||
type response struct {
|
||||
// content is the generated text for this message.
content string
|
||||
// logprobs holds the log probability entries sent with content; it is only
// populated when logprobs were requested for the sequence.
logprobs []llm.Logprob
|
||||
}
|
||||
|
||||
// input is an element of the prompt to process, either
|
||||
// a token or an image embedding (generated from a vision projector)
|
||||
type input struct {
|
||||
@@ -53,11 +59,14 @@ type Sequence struct {
|
||||
// tokens that have been generated but not returned yet (e.g. for stop sequences)
|
||||
pendingResponses []string
|
||||
|
||||
// logprobs for tokens that haven't been returned yet
|
||||
pendingLogprobs []llm.Logprob
|
||||
|
||||
// input cache being used by this sequence
|
||||
cache *InputCacheSlot
|
||||
|
||||
// channel to send responses over
|
||||
responses chan string
|
||||
responses chan response
|
||||
|
||||
// channel to stop decoding (such as if the remote connection is closed)
|
||||
quit chan bool
|
||||
@@ -84,6 +93,10 @@ type Sequence struct {
|
||||
|
||||
doneReason llm.DoneReason
|
||||
|
||||
// logprobs configuration
|
||||
logprobs bool
|
||||
topLogprobs int
|
||||
|
||||
// Metrics
|
||||
processingDuration time.Duration
|
||||
generationDuration time.Duration
|
||||
@@ -99,6 +112,8 @@ type NewSequenceParams struct {
|
||||
embedding bool
|
||||
shift bool
|
||||
truncate bool
|
||||
logprobs bool
|
||||
topLogprobs int
|
||||
}
|
||||
|
||||
var errorInputTooLong = errors.New("the input length exceeds the context length")
|
||||
@@ -155,7 +170,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
numPromptInputs: len(inputs),
|
||||
numPredict: params.numPredict,
|
||||
pendingResponses: make([]string, 0),
|
||||
responses: make(chan string, 100),
|
||||
responses: make(chan response, 100),
|
||||
quit: make(chan bool, 1),
|
||||
embedding: make(chan []float32, 1),
|
||||
samplingCtx: sc,
|
||||
@@ -163,9 +178,16 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
stop: params.stop,
|
||||
numKeep: params.numKeep,
|
||||
shift: params.shift,
|
||||
logprobs: params.logprobs,
|
||||
topLogprobs: params.topLogprobs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// calculateLogprobsLlama converts raw logits to log probabilities and finds top K tokens
|
||||
func calculateLogprobsLlama(logits []float32, selectedToken int, topK int, model *llama.Model) []llm.Logprob {
|
||||
return common.CalculateLogprobs(logits, selectedToken, topK, model.TokenToPiece)
|
||||
}
|
||||
|
||||
// inputs processes the prompt and images into a list of inputs
|
||||
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
||||
// generating image embeddings for each image
|
||||
@@ -294,7 +316,9 @@ func (s *Server) allNil() bool {
|
||||
|
||||
func flushPending(seq *Sequence) bool {
|
||||
joined := strings.Join(seq.pendingResponses, "")
|
||||
logprobs := seq.pendingLogprobs
|
||||
seq.pendingResponses = []string{}
|
||||
seq.pendingLogprobs = []llm.Logprob{}
|
||||
|
||||
// Check if there are any partial UTF-8 characters remaining.
|
||||
// We already check and queue as we are generating but some may
|
||||
@@ -311,7 +335,7 @@ func flushPending(seq *Sequence) bool {
|
||||
}
|
||||
|
||||
select {
|
||||
case seq.responses <- joined:
|
||||
case seq.responses <- response{content: joined, logprobs: logprobs}:
|
||||
return true
|
||||
case <-seq.quit:
|
||||
return false
|
||||
@@ -526,6 +550,15 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||
continue
|
||||
}
|
||||
|
||||
// Calculate logprobs if requested (after EOS check to avoid logprobs for EOS tokens)
|
||||
if seq.logprobs {
|
||||
logits := s.lc.GetLogitsIth(seq.iBatch)
|
||||
if logits != nil {
|
||||
logprobs := calculateLogprobsLlama(logits, token, seq.topLogprobs, s.model)
|
||||
seq.pendingLogprobs = append(seq.pendingLogprobs, logprobs...)
|
||||
}
|
||||
}
|
||||
|
||||
seq.inputs = []input{{token: token}}
|
||||
|
||||
seq.pendingResponses = append(seq.pendingResponses, piece)
|
||||
@@ -539,6 +572,17 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
|
||||
newLen := len(seq.pendingResponses)
|
||||
|
||||
// Truncate logprobs to match the truncated responses
|
||||
if seq.logprobs {
|
||||
origLogprobsLen := len(seq.pendingLogprobs)
|
||||
numTokensRemoved := origLen - newLen
|
||||
newLogprobsLen := origLogprobsLen - numTokensRemoved
|
||||
if newLogprobsLen < 0 {
|
||||
newLogprobsLen = 0
|
||||
}
|
||||
seq.pendingLogprobs = seq.pendingLogprobs[:newLogprobsLen]
|
||||
}
|
||||
|
||||
// Update the cache based on the tokens that will be returned:
|
||||
// - We have 1 token more than is currently in the cache because
|
||||
// the last one generated wasn't submitted to Decode
|
||||
@@ -618,6 +662,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
embedding: false,
|
||||
shift: req.Shift,
|
||||
truncate: req.Truncate,
|
||||
logprobs: req.Logprobs,
|
||||
topLogprobs: req.TopLogprobs,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, errorInputTooLong) {
|
||||
@@ -669,10 +715,11 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
case <-r.Context().Done():
|
||||
close(seq.quit)
|
||||
return
|
||||
case content, ok := <-seq.responses:
|
||||
case resp, ok := <-seq.responses:
|
||||
if ok {
|
||||
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
|
||||
Content: content,
|
||||
Content: resp.content,
|
||||
Logprobs: resp.logprobs,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
close(seq.quit)
|
||||
|
||||
@@ -41,6 +41,12 @@ import (
|
||||
_ "github.com/ollama/ollama/model/models"
|
||||
)
|
||||
|
||||
// response is the unit sent over a Sequence's responses channel: a piece of
// generated text together with the logprobs that were pending alongside it.
|
||||
type response struct {
|
||||
// content is the generated text for this message.
content string
|
||||
// logprobs holds the log probability entries sent with content; it is only
// populated when logprobs were requested for the sequence.
logprobs []llm.Logprob
|
||||
}
|
||||
|
||||
type Sequence struct {
|
||||
// ctxs are used for allocating tensors that last the lifetime of the sequence, such as
|
||||
// multimodal embeddings
|
||||
@@ -61,11 +67,14 @@ type Sequence struct {
|
||||
// tokens that have been generated but not returned yet (e.g. for stop sequences)
|
||||
pendingResponses []string
|
||||
|
||||
// logprobs for tokens that haven't been returned yet
|
||||
pendingLogprobs []llm.Logprob
|
||||
|
||||
// input cache being used by this sequence
|
||||
cache *InputCacheSlot
|
||||
|
||||
// channel to send responses over
|
||||
responses chan string
|
||||
responses chan response
|
||||
|
||||
// channel to stop decoding (such as if the remote connection is closed)
|
||||
quit chan bool
|
||||
@@ -93,6 +102,10 @@ type Sequence struct {
|
||||
|
||||
doneReason llm.DoneReason
|
||||
|
||||
// logprobs configuration
|
||||
logprobs bool
|
||||
topLogprobs int
|
||||
|
||||
// Metrics
|
||||
startedAt, lastUpdatedAt time.Time
|
||||
processingDuration time.Duration
|
||||
@@ -102,13 +115,15 @@ type Sequence struct {
|
||||
}
|
||||
|
||||
type NewSequenceParams struct {
|
||||
numPredict int
|
||||
stop []string
|
||||
numKeep int32
|
||||
sampler sample.Sampler
|
||||
embedding bool
|
||||
shift bool
|
||||
truncate bool
|
||||
numPredict int
|
||||
stop []string
|
||||
numKeep int32
|
||||
sampler sample.Sampler
|
||||
embedding bool
|
||||
shift bool
|
||||
truncate bool
|
||||
logprobs bool
|
||||
topLogprobs int
|
||||
}
|
||||
|
||||
var errorInputTooLong = errors.New("the input length exceeds the context length")
|
||||
@@ -181,7 +196,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
numPromptInputs: len(inputs),
|
||||
numPredict: params.numPredict,
|
||||
pendingResponses: make([]string, 0),
|
||||
responses: make(chan string, 100),
|
||||
responses: make(chan response, 100),
|
||||
quit: make(chan bool, 1),
|
||||
embedding: make(chan []float32, 1),
|
||||
sampler: params.sampler,
|
||||
@@ -189,9 +204,20 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
stop: params.stop,
|
||||
numKeep: params.numKeep,
|
||||
shift: params.shift,
|
||||
logprobs: params.logprobs,
|
||||
topLogprobs: params.topLogprobs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// calculateLogprobs converts raw logits to log probabilities and finds top K tokens
|
||||
func calculateLogprobs(logits []float32, selectedToken int32, topK int, textProcessor model.TextProcessor) []llm.Logprob {
|
||||
decoder := func(tokenID int) string {
|
||||
text, _ := textProcessor.Decode([]int32{int32(tokenID)})
|
||||
return text
|
||||
}
|
||||
return common.CalculateLogprobs(logits, int(selectedToken), topK, decoder)
|
||||
}
|
||||
|
||||
// inputs processes the prompt and images into a list of inputs
|
||||
// by splitting the prompt on [img-<n>] tags, tokenizing text and
|
||||
// decoding images
|
||||
@@ -371,7 +397,9 @@ func (s *Server) allNil() bool {
|
||||
|
||||
func flushPending(seq *Sequence) bool {
|
||||
joined := strings.Join(seq.pendingResponses, "")
|
||||
logprobs := seq.pendingLogprobs
|
||||
seq.pendingResponses = []string{}
|
||||
seq.pendingLogprobs = []llm.Logprob{}
|
||||
|
||||
// Check if there are any partial UTF-8 characters remaining.
|
||||
// We already check and queue as we are generating but some may
|
||||
@@ -388,7 +416,7 @@ func flushPending(seq *Sequence) bool {
|
||||
}
|
||||
|
||||
select {
|
||||
case seq.responses <- joined:
|
||||
case seq.responses <- response{content: joined, logprobs: logprobs}:
|
||||
return true
|
||||
case <-seq.quit:
|
||||
return false
|
||||
@@ -729,7 +757,8 @@ func (s *Server) computeBatch(activeBatch batchState) {
|
||||
// sample a token
|
||||
vocabSize := len(outputs) / activeBatch.batch.Outputs.Dim(0)
|
||||
logutil.Trace("computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(outputs), "len(activeBatch.batch.Outputs)", activeBatch.batch.Outputs.Dim(0), "vocabSize", vocabSize, "iBatches", iBatches)
|
||||
token, err := seq.sampler.Sample(outputs[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
|
||||
logits := outputs[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize]
|
||||
token, err := seq.sampler.Sample(logits)
|
||||
if err != nil {
|
||||
panic("failed to sample token")
|
||||
}
|
||||
@@ -751,6 +780,12 @@ func (s *Server) computeBatch(activeBatch batchState) {
|
||||
panic("failed to decode token")
|
||||
}
|
||||
|
||||
// Calculate logprobs if requested (after EOS check to avoid logprobs for EOS tokens)
|
||||
if seq.logprobs {
|
||||
logprobs := calculateLogprobs(logits, token, seq.topLogprobs, s.model.(model.TextProcessor))
|
||||
seq.pendingLogprobs = append(seq.pendingLogprobs, logprobs...)
|
||||
}
|
||||
|
||||
seq.pendingResponses = append(seq.pendingResponses, piece)
|
||||
sequence := strings.Join(seq.pendingResponses, "")
|
||||
|
||||
@@ -762,6 +797,17 @@ func (s *Server) computeBatch(activeBatch batchState) {
|
||||
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
|
||||
newLen := len(seq.pendingResponses)
|
||||
|
||||
// Truncate logprobs to match the truncated responses
|
||||
if seq.logprobs {
|
||||
origLogprobsLen := len(seq.pendingLogprobs)
|
||||
numTokensRemoved := origLen - newLen
|
||||
newLogprobsLen := origLogprobsLen - numTokensRemoved
|
||||
if newLogprobsLen < 0 {
|
||||
newLogprobsLen = 0
|
||||
}
|
||||
seq.pendingLogprobs = seq.pendingLogprobs[:newLogprobsLen]
|
||||
}
|
||||
|
||||
// Update the cache based on the tokens that will be returned:
|
||||
// - We have 1 token more than is currently in the cache because
|
||||
// the last one generated wasn't submitted to Decode
|
||||
@@ -845,13 +891,15 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
)
|
||||
|
||||
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
|
||||
numPredict: req.Options.NumPredict,
|
||||
stop: req.Options.Stop,
|
||||
numKeep: int32(req.Options.NumKeep),
|
||||
sampler: sampler,
|
||||
embedding: false,
|
||||
shift: req.Shift,
|
||||
truncate: req.Truncate,
|
||||
numPredict: req.Options.NumPredict,
|
||||
stop: req.Options.Stop,
|
||||
numKeep: int32(req.Options.NumKeep),
|
||||
sampler: sampler,
|
||||
embedding: false,
|
||||
shift: req.Shift,
|
||||
truncate: req.Truncate,
|
||||
logprobs: req.Logprobs,
|
||||
topLogprobs: req.TopLogprobs,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, errorInputTooLong) {
|
||||
@@ -903,10 +951,11 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
case <-r.Context().Done():
|
||||
close(seq.quit)
|
||||
return
|
||||
case content, ok := <-seq.responses:
|
||||
case resp, ok := <-seq.responses:
|
||||
if ok {
|
||||
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
|
||||
Content: content,
|
||||
Content: resp.content,
|
||||
Logprobs: resp.logprobs,
|
||||
}); err != nil {
|
||||
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
|
||||
close(seq.quit)
|
||||
|
||||
Reference in New Issue
Block a user