multi-regexp pretokenizer (#12325)

This commit is contained in:
Michael Yang
2025-09-23 13:21:47 -07:00
committed by GitHub
parent 64883e3c4c
commit a40d427bce
12 changed files with 124 additions and 34 deletions

View File

@@ -35,7 +35,6 @@ func newEmbed(c fs.Config) (model.Model, error) {
}
m := embedModel{
BytePairEncoding: model.NewBytePairEncoding(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
@@ -48,6 +47,7 @@ func newEmbed(c fs.Config) (model.Model, error) {
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
Model: &Model{
Layers: layers,