Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-21 22:33:56 +00:00)
fix: qwen2.5 vl rope (#13486)
* qwen25vl: bump max pixels
* qwen25vl: mrope fix qwen2.5vl window
* qwen25vl: vision rope

@@ -2,7 +2,6 @@ package qwen25vl

 import (
     "bytes"
     "fmt"
     "image"
     "slices"

@@ -33,7 +32,7 @@ func New(c fs.Config) (model.Model, error) {
         Values: c.Strings("tokenizer.ggml.tokens"),
         Types:  c.Ints("tokenizer.ggml.token_type"),
         Merges: c.Strings("tokenizer.ggml.merges"),
-        AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+        AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
         BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
         AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
         EOS: append(
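
The functional change in this hunk is the AddBOS fallback: when a GGUF omits tokenizer.ggml.add_bos_token, the tokenizer no longer prepends a BOS token. A toy sketch of the difference, with a hypothetical tokenizer type and made-up token ids (this is not ollama's model.TextProcessor):

package main

import "fmt"

// toyTokenizer models only the AddBOS behavior touched by the change above.
type toyTokenizer struct {
    bos    int32
    addBOS bool
}

func (t toyTokenizer) Encode(ids []int32) []int32 {
    if t.addBOS {
        return append([]int32{t.bos}, ids...)
    }
    return ids
}

func main() {
    ids := []int32{9906, 1917} // made-up ids for some prompt

    before := toyTokenizer{bos: 151643, addBOS: true} // old fallback: true
    after := toyTokenizer{bos: 151643, addBOS: false} // new fallback: false

    fmt.Println(before.Encode(ids)) // [151643 9906 1917]
    fmt.Println(after.Encode(ids))  // [9906 1917]
}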

@@ -54,19 +53,18 @@ func New(c fs.Config) (model.Model, error) {
 }

 func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
-    image, _, err := image.Decode(bytes.NewReader(multimodalData))
+    img, _, err := image.Decode(bytes.NewReader(multimodalData))
     if err != nil {
         return nil, nil, err
     }

-    f32s, grid, err := m.ImageProcessor.ProcessImage(image)
+    f32s, grid, err := m.ImageProcessor.ProcessImage(img)
     if err != nil {
         return nil, nil, err
     }

     // Calculate tensor dimensions
-    patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-        m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+    patchDim := m.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
     numPatches := grid.Temporal * grid.Height * grid.Width

     pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
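
As a sanity check on those dimensions, here is the arithmetic under assumed Qwen2.5-VL vision settings (3 channels, temporal patch size 2, 14x14 spatial patches) and an illustrative 896x672 input; none of these numbers come from this diff:

package main

import "fmt"

func main() {
    // Assumed vision config; illustrative only.
    numChannels, temporalPatchSize, patchSize := 3, 2, 14

    // Each flattened patch carries numChannels*temporalPatchSize*patchSize^2 floats.
    patchDim := numChannels * temporalPatchSize * patchSize * patchSize
    fmt.Println(patchDim) // 1176

    // A 896x672 image yields a 64x48 grid of 14x14 patches, with Temporal = 1.
    gridT, gridH, gridW := 1, 672/patchSize, 896/patchSize
    numPatches := gridT * gridH * gridW
    fmt.Println(numPatches) // 3072 -> pixelValues is patchDim x numPatches
}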

@@ -85,11 +83,13 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
     }

     visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
-    return []input.Multimodal{{Tensor: visionOutputs}}, nil
+    return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
 }

 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
 func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+    // Reset position cache
+    m.positionCache = m.positionCache[:0]
     var result []*input.Input

     var (
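
Attaching the Grid as Data is what lets later stages (PostTokenize and Forward below) recover the image's spatial layout with a type assertion. A minimal sketch of the sidecar pattern, with the ollama types stubbed out:

package main

import "fmt"

// Stand-ins: Grid matches the fields used in this file; multimodal
// approximates input.Multimodal's Tensor/Data pairing.
type Grid struct{ Temporal, Height, Width int }

type multimodal struct {
    Tensor any // ml.Tensor in ollama
    Data   any // sidecar payload; here the *Grid
}

func main() {
    mm := multimodal{Data: &Grid{Temporal: 1, Height: 48, Width: 64}}

    // Downstream code recovers the grid the same way PostTokenize does:
    if grid, ok := mm.Data.(*Grid); ok {
        fmt.Println(grid.Width, grid.Height) // 64 48
    }
}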

@@ -98,40 +98,37 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
         visionEndToken   int32 = 151653
     )

     nImg := 0
+    appendInput := func(i *input.Input, p int) int {
+        result = append(result, i)
+        m.positionCache = append(m.positionCache, int32(p))
+        return p + 1
+    }
+
+    var p int
     for _, inp := range inputs {
         if inp.Multimodal == nil {
             // If not a multimodal input, add it to the result unchanged
-            result = append(result, inp)
+            p = appendInput(inp, p)
         } else {
             // Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
             // the image tokens with a prompt, so we add a prefix here
             nImg++
             pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
             if err != nil {
                 return nil, fmt.Errorf("failed to encode image prompt: %w", err)
             }
             for i := range pre {
                 result = append(result, &input.Input{Token: pre[i]})
             }

-            patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
-
             // First add the vision start token
-            result = append(result, &input.Input{Token: visionStartToken})
+            p = appendInput(&input.Input{Token: visionStartToken}, p)

             // Add the image token with the multimodal tensor data at the first position
-            result = append(result, &input.Input{
+            tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
+            appendInput(&input.Input{
                 Token:          imageToken,
                 Multimodal:     inp.Multimodal,
                 MultimodalHash: inp.MultimodalHash,
-                SameBatch:      patchesPerChunk,
-            })
+                SameBatch:      tokensPerGrid,
+            }, p)

             // Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
-            result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
+            for range tokensPerGrid - 1 {
+                appendInput(&input.Input{Token: imageToken}, p)
+            }

-            result = append(result, &input.Input{Token: visionEndToken})
+            grid := inp.Multimodal[0].Data.(*Grid)
+            p = appendInput(&input.Input{Token: visionEndToken}, p+max(grid.Width/m.spatialMergeSize, grid.Height/m.spatialMergeSize))
         }
     }

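
Worked through with an assumed spatialMergeSize of 2 (Qwen2.5-VL's usual merge factor, not stated in this diff) and the 64x48 patch grid from the earlier example, the position bookkeeping above comes out as follows:

package main

import "fmt"

func main() {
    const spatialMergeSize = 2 // assumed merge factor
    gridW, gridH := 64, 48     // patch grid from the earlier example

    // After 2x2 merging the image contributes w*h tokens, all recorded at
    // the same base position (SameBatch spans the whole block).
    w, h := gridW/spatialMergeSize, gridH/spatialMergeSize
    fmt.Println(w * h) // 768 image tokens

    // The vision end token is placed past the image's spatial extent,
    // mirroring p + max(grid.Width/merge, grid.Height/merge) above.
    p := 10 // say the vision start token sat at position 10
    fmt.Println(p + max(w, h)) // 42
}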

@@ -139,9 +136,58 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-    positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
+    // Initial token embedding
+    hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)

-    return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache)
+    positionSlice := func() [][]int32 {
+        s := [][]int32{
+            make([]int32, len(batch.Positions)),
+            make([]int32, len(batch.Positions)),
+            make([]int32, len(batch.Positions)),
+            make([]int32, len(batch.Positions)),
+        }
+        for i, position := range batch.Positions {
+            if position < int32(len(m.positionCache)) {
+                position = m.positionCache[position]
+            } else if len(m.positionCache) > 0 {
+                position = position - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
+            }
+
+            s[0][i] = position
+            s[1][i] = position
+            s[2][i] = position
+        }
+        return s
+    }()
+
+    for _, mi := range batch.Multimodal {
+        img := mi.Multimodal[0].Tensor
+        ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
+        if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
+            for i := range img.Dim(1) {
+                w := grid.Width / m.spatialMergeSize
+                positionSlice[1][mi.Index+i] += int32(i / w)
+                positionSlice[2][mi.Index+i] += int32(i % w)
+            }
+        }
+    }
+
+    positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
+
+    // Process through transformer layers
+    for i, layer := range m.TextModel.Layers {
+        m.Cache.SetLayer(i)
+
+        var lastLayerOutputs ml.Tensor
+        if i == len(m.TextModel.Layers)-1 {
+            lastLayerOutputs = batch.Outputs
+        }
+
+        hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextOptions)
+    }
+
+    hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
+    return m.Output.Forward(ctx, hiddenStates), nil
 }

 func init() {
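
The multimodal loop in the new Forward is where the M-RoPE positions take shape: stream 0 keeps the cached base position for every image token, streams 1 and 2 add the token's row and column inside the merged grid, and positions past the cache are extrapolated from its last entry. A toy trace under assumed numbers (base position 11, merged grid width 4):

package main

import "fmt"

func main() {
    base := int32(11) // cached position shared by the image-token block
    w := 4            // grid.Width / m.spatialMergeSize, assumed
    nTokens := 8      // first two grid rows, for brevity

    for i := range nTokens {
        t := base              // stream 0: base position
        h := base + int32(i/w) // stream 1: + row offset
        x := base + int32(i%w) // stream 2: + column offset
        fmt.Println(t, h, x)
    }
    // First row prints 11 11 11 ... 11 11 14, second row 11 12 11 ... 11 12 14:
    // the base stays put, height steps once per w tokens, width cycles.
}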