From adff143bcda0c7ab4ca3a85dc3db5a81552368c7 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 22 May 2025 11:30:49 -0700
Subject: [PATCH] fix: mllama quality (#10807)

* fix mllama convert

- transform attn_gate and ffn_gate
- swap attention heads for vision models

* fix mllama

the mlp gate which was applied in the wrong place
---
 convert/convert_mllama.go           | 65 +++++++++++++++++++----------
 model/models/mllama/model_vision.go | 22 ++--------
 2 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/convert/convert_mllama.go b/convert/convert_mllama.go
index 12478be7..69d7f588 100644
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@@ -94,7 +94,9 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	var text []Tensor
 	for _, t := range ts {
-		if t.Name() == "v.position_embd.gate" {
+		if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
+			text = append(text, t)
+		} else if t.Name() == "v.position_embd.gate" {
 			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
 				tt := t.Clone()
 				tt.SetRepacker(m.repack(name))
@@ -105,23 +107,21 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 					WriterTo: tt,
 				})
 			}
-		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
-			t.SetRepacker(m.repack(t.Name()))
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
 		} else {
-			text = append(text, t)
+			if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
+				t.SetRepacker(m.repack(t.Name()))
+			} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+				t.SetRepacker(m.repack(t.Name()))
+			} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
+				t.SetRepacker(m.repack(t.Name()))
+			}
+
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
 		}
 	}
 
@@ -137,16 +137,35 @@ func (m *mllamaModel) repack(name string) Repacker {
 
 		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 
-		t, err = tensor.Tanh(t)
-		if err != nil {
-			return nil, err
-		}
+		if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
+			heads := m.VisionModel.AttentionHeads
+			if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+				return nil, err
+			}
 
-		if name == "v.position_embd.gate" {
-			t, err = tensor.Sub(float32(1), t)
+			if err := t.T(0, 2, 1, 3); err != nil {
+				return nil, err
+			}
+
+			if err := t.Reshape(dims...); err != nil {
+				return nil, err
+			}
+
+			if err := t.Transpose(); err != nil {
+				return nil, err
+			}
+		} else {
+			t, err = tensor.Tanh(t)
 			if err != nil {
 				return nil, err
 			}
+
+			if name == "v.position_embd.gate" {
+				t, err = tensor.Sub(float32(1), t)
+				if err != nil {
+					return nil, err
+				}
+			}
 		}
 
 		t = tensor.Materialize(t)
diff --git a/model/models/mllama/model_vision.go b/model/models/mllama/model_vision.go
index 77ea5373..2d424947 100644
--- a/model/models/mllama/model_vision.go
+++ b/model/models/mllama/model_vision.go
@@ -16,8 +16,6 @@ type VisionSelfAttention struct {
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
-
-	Gate ml.Tensor `gguf:"attn_gate"`
 }
 
 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -25,27 +23,16 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
 
 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
-	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 
 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
-	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 
 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
-	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
 
-	scores := key.Mulmat(ctx, query)
-	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
-	scores = scores.Softmax(ctx)
-
-	attention := value.Mulmat(ctx, scores)
-	attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
-	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
-
-	hiddenState = sa.Output.Forward(ctx, attention)
-	return hiddenState
+	return sa.Output.Forward(ctx, attention)
 }
 
 type VisionMLP struct {
@@ -76,21 +63,18 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
-
 	if e.AttentionGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
 	}
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState
-
 	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = hiddenState.Add(ctx, residual)
 	if e.MLPGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
 	}
-
+	hiddenState = hiddenState.Add(ctx, residual)
 	return hiddenState
 }
 
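A note on the Q/K repack above: the Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...) followed by T(0, 2, 1, 3) reorders the rows of each attention head, interleaving the two halves of the head dimension. The sketch below shows that reordering on a flat weight slice, assuming a row-major [rows x cols] layout and using plain Go instead of the tensor package; permuteQK and its parameters are illustrative names only, and the repacker's trailing Transpose and Materialize steps are omitted.

package main

import "fmt"

// permuteQK reorders the rows of a [rows x cols] Q/K weight matrix head by
// head, interleaving the two halves of each head: the row at p*half+d moves
// to 2*d+p. This is the effect of reshape(heads, 2, rows/heads/2, cols)
// followed by swapping axes 1 and 2, as in the repacker above.
// rows must be divisible by 2*heads.
func permuteQK(w []float32, rows, cols, heads int) []float32 {
	half := rows / heads / 2 // rows per rotary half within one head
	out := make([]float32, len(w))
	for h := 0; h < heads; h++ {
		base := h * 2 * half
		for p := 0; p < 2; p++ { // p selects the first or second half
			for d := 0; d < half; d++ {
				src := (base + p*half + d) * cols // row d of half p in head h
				dst := (base + 2*d + p) * cols    // its interleaved position
				copy(out[dst:dst+cols], w[src:src+cols])
			}
		}
	}
	return out
}

func main() {
	// Tiny example: one head with head dim 4 (half = 2) and one column.
	w := []float32{0, 1, 2, 3}
	fmt.Println(permuteQK(w, 4, 1, 1)) // [0 2 1 3]
}

For the gate tensors, the repacker takes the other branch shown in the hunk: tanh is applied to the attn_gate/ffn_gate values, and v.position_embd.gate is additionally repacked as 1 - tanh(gate).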