feat(model): add qwen3vl (#12665)
model/models/qwen3vl/model.go (new file, 204 lines)
@@ -0,0 +1,204 @@
package qwen3vl

import (
	"bytes"
	"image"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
)

type Model struct {
	model.Base
	model.TextProcessor

	*TextModel
	*VisionModel `gguf:"v"`

	ImageProcessor

	positionCache []int32
}
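
// EncodeMultimodal decodes the raw image bytes, preprocesses them into patch
// pixel values, and runs the vision encoder. The first returned entry carries
// the merged visual embeddings (with the patch grid as auxiliary data); the
// remaining entries carry intermediate "deepstack" features that Forward
// later injects into the early text layers.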
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
	if len(m.VisionModel.Layers) == 0 {
		return nil, model.ErrNoVisionModel
	}

	img, _, err := image.Decode(bytes.NewReader(multimodalData))
	if err != nil {
		return nil, err
	}

	pixelValues, grid, err := m.ProcessImage(ctx, img)
	if err != nil {
		return nil, err
	}

	// Run the vision encoder; it also returns per-layer deepstack features
	visionOutputs, deepstackVisualEmbeds := m.VisionModel.Forward(ctx, pixelValues, grid)
	mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
	for i := range deepstackVisualEmbeds {
		mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
	}

	return mm, nil
}
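
// Special vision token IDs; these correspond to the Qwen tokenizer's
// <|vision_start|> (151652), <|vision_end|> (151653), and
// <|image_pad|> (151655) tokens.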
var (
	tokenVision      int32 = 151655
	tokenVisionStart int32 = 151652
	tokenVisionEnd   int32 = 151653
)

type modelInput struct {
	*input.Input
	position int32
}
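
// PostTokenize arranges Qwen 3 VL's inputs for the forward pass: each image
// becomes <vision_start>, one placeholder token per column of its embedding
// tensor, and <vision_end>. The second placeholder carries the multimodal
// tensors along with SameBatch, so the whole image is scheduled into a single
// batch, and positionCache records every token's effective position, since
// image tokens receive grid positions rather than strictly increasing ones.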
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	m.positionCache = m.positionCache[:0]
	return slices.Collect(func(yield func(*input.Input) bool) {
		for i := range inputs {
			s := []modelInput{{Input: inputs[i]}}
			if mm := inputs[i].Multimodal; mm != nil {
				t := mm[0].Tensor
				// one placeholder per embedding column, plus the start and end tokens
				s = slices.Repeat([]modelInput{
					{
						position: int32(i + 1),
						Input:    &input.Input{Token: tokenVision},
					},
				}, t.Dim(1)+1+1)

				s[0] = modelInput{
					Input:    &input.Input{Token: tokenVisionStart},
					position: int32(i),
				}

				s[len(s)-1] = modelInput{
					Input:    &input.Input{Token: tokenVisionEnd},
					position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
				}

				s[1] = modelInput{
					Input: &input.Input{
						Token:          tokenVision,
						Multimodal:     inputs[i].Multimodal,
						MultimodalHash: inputs[i].MultimodalHash,
						SameBatch:      t.Dim(1),
					},
					position: int32(i + 1),
				}
			}

			for _, e := range s {
				// tokens with no explicit position continue from the previous one
				position := e.position
				if position == 0 && len(m.positionCache) > 0 {
					position = m.positionCache[len(m.positionCache)-1] + 1
				}

				m.positionCache = append(m.positionCache, position)
				if !yield(e.Input) {
					return
				}
			}
		}
	}), nil
}
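
// makeSlice2D is a small helper defined elsewhere in this package. From its
// use with slices.Collect below, it presumably yields count fresh slices of
// length n each; a minimal sketch consistent with that (an assumption, not
// the repository's actual definition):
//
//	func makeSlice2D[T any](count, n int) iter.Seq[[]T] {
//		return func(yield func([]T) bool) {
//			for range count {
//				if !yield(make([]T, n)) {
//					return
//				}
//			}
//		}
//	}
//
// Forward uses it to build three position rows, one per axis of Qwen's
// multimodal RoPE (temporal, height, width); the height and width rows are
// offset per image token further down.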
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions)))
	for i, id := range batch.Positions {
		// map sequence indices through the cached multimodal positions;
		// tokens past the cache continue from the last cached position
		if id < int32(len(m.positionCache)) {
			id = m.positionCache[id]
		} else if len(m.positionCache) > 0 {
			id = id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
		}

		positionSlice[0][i] = id
		positionSlice[1][i] = id
		positionSlice[2][i] = id
	}

	hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)

	var deepstackVisualEmbeds []ml.Tensor
	for _, mi := range batch.Multimodal {
		// splice the visual embeddings over their placeholder tokens
		visionOutputs := mi.Multimodal[0].Tensor
		ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
			// give each image token its row/column position on the merged grid
			for i := range visionOutputs.Dim(1) {
				w := grid.Width / m.spatialMergeSize
				positionSlice[1][mi.Index+i] += int32(i / w)
				positionSlice[2][mi.Index+i] += int32(i % w)
			}
		}

		deepstackVisualEmbeds = make([]ml.Tensor, len(mi.Multimodal[1:]))
		for i, mm := range mi.Multimodal[1:] {
			deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
			ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
		}
	}
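
	// Each deepstack tensor is added to the hidden states after the
	// correspondingly numbered text layer, injecting intermediate vision
	// features into the first len(deepstackVisualEmbeds) language layers.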
	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice))
	cos, sin := m.rotaryEmbedding(ctx, positions)
	for i, layer := range m.TextModel.Layers {
		if m.Cache != nil {
			m.Cache.SetLayer(i)
		}

		// on the final layer, only the positions listed in batch.Outputs are needed
		var outputs ml.Tensor
		if i == len(m.TextModel.Layers)-1 {
			outputs = batch.Outputs
		}

		hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options)
		if i < len(deepstackVisualEmbeds) {
			hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
		}
	}

	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, 1e-06)
	return m.Output.Forward(ctx, hiddenStates), nil
}
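
// New builds the model from GGUF metadata: the byte-pair-encoding tokenizer
// configuration plus the text and vision towers and the image preprocessor.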
func New(c fs.Config) (model.Model, error) {
	m := Model{
		TextProcessor: model.NewBytePairEncoding(
			&model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types:  c.Ints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
				EOS: append(
					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
					c.Ints("tokenizer.ggml.eos_token_ids")...,
				),
			},
			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		),
		TextModel:      newTextModel(c),
		VisionModel:    newVisionModel(c),
		ImageProcessor: newImageProcessor(c),
	}
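
	// Cache shifting is not supported for these multimodal positions; the
	// shift callback drops the cached positions and returns ErrNotSupported.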
	m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) {
		m.positionCache = nil
		return nil, kvcache.ErrNotSupported
	})
	return &m, nil
}
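
// Both the dense and MoE variants register the same constructor; any
// architectural differences are presumably handled inside newTextModel
// based on the GGUF configuration.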
func init() {
	model.Register("qwen3vl", New)
	model.Register("qwen3vlmoe", New)
}