models: Prune unused outputs earlier in the forward pass

Currently, Rows is called as the last step of a model's computation to select the values for the output tokens. However, if we move it earlier in the forward pass, we can skip computations whose results are never used. This is similar to how models are defined in llama.cpp. Changing the model definition in this way improves token generation performance by approximately 8%.
2025-12-23 15:08:27 +00:00 · 2025-02-18 17:16:57 -08:00
parent e5bcc51ae1
commit 5c5535c064
3 changed files with 46 additions and 23 deletions
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -93,15 +93,13 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
 		return nil, err
 	}

-	// TODO: attention mask, cross attention mask
-	hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache))
-
 	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
 	if err != nil {
 		return nil, err
 	}

-	return hiddenState.Rows(ctx, outputs), nil
+	// TODO: attention mask, cross attention mask
+	return m.TextModel.Forward(ctx, inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
 }

 func init() {