fix: qwen2.5 vl rope (#13486)

* qwen25vl: bump max pixels

* qwen25vl: mrope

fix qwen2.5vl window

* qwen25vl: vision rope
This commit is contained in:
Michael Yang
2025-12-15 17:30:33 -08:00
committed by GitHub
parent ffbe8e076d
commit 971d62595a
6 changed files with 195 additions and 216 deletions

View File

@@ -2,7 +2,6 @@ package qwen25vl
import (
"bytes"
"fmt"
"image"
"slices"
@@ -33,7 +32,7 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
@@ -54,19 +53,18 @@ func New(c fs.Config) (model.Model, error) {
}
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
image, _, err := image.Decode(bytes.NewReader(multimodalData))
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, nil, err
}
f32s, grid, err := m.ImageProcessor.ProcessImage(image)
f32s, grid, err := m.ImageProcessor.ProcessImage(img)
if err != nil {
return nil, nil, err
}
// Calculate tensor dimensions
patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
patchDim := m.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
@@ -85,11 +83,13 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
}
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
// Reset position cache
m.positionCache = m.positionCache[:0]
var result []*input.Input
var (
@@ -98,40 +98,37 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
visionEndToken int32 = 151653
)
nImg := 0
appendInput := func(i *input.Input, p int) int {
result = append(result, i)
m.positionCache = append(m.positionCache, int32(p))
return p + 1
}
var p int
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
p = appendInput(inp, p)
} else {
// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
// the image tokens with a prompt, so we add a prefix here
nImg++
pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
if err != nil {
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
}
for i := range pre {
result = append(result, &input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// First add the vision start token
result = append(result, &input.Input{Token: visionStartToken})
p = appendInput(&input.Input{Token: visionStartToken}, p)
// Add the image token with the multimodal tensor data at the first position
result = append(result, &input.Input{
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
appendInput(&input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
SameBatch: tokensPerGrid,
}, p)
// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
for range tokensPerGrid - 1 {
appendInput(&input.Input{Token: imageToken}, p)
}
result = append(result, &input.Input{Token: visionEndToken})
grid := inp.Multimodal[0].Data.(*Grid)
p = appendInput(&input.Input{Token: visionEndToken}, p+max(grid.Width/m.spatialMergeSize, grid.Height/m.spatialMergeSize))
}
}
@@ -139,9 +136,58 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
// Initial token embedding
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache)
positionSlice := func() [][]int32 {
s := [][]int32{
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
}
for i, position := range batch.Positions {
if position < int32(len(m.positionCache)) {
position = m.positionCache[position]
} else if len(m.positionCache) > 0 {
position = position - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
}
s[0][i] = position
s[1][i] = position
s[2][i] = position
}
return s
}()
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
for i := range img.Dim(1) {
w := grid.Width / m.spatialMergeSize
positionSlice[1][mi.Index+i] += int32(i / w)
positionSlice[2][mi.Index+i] += int32(i % w)
}
}
}
positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
// Process through transformer layers
for i, layer := range m.TextModel.Layers {
m.Cache.SetLayer(i)
var lastLayerOutputs ml.Tensor
if i == len(m.TextModel.Layers)-1 {
lastLayerOutputs = batch.Outputs
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextOptions)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
func init() {