multi-regexp pretokenizer (#12325)

2025-12-23 23:18:26 +00:00 · 2025-09-23 13:21:47 -07:00
parent 64883e3c4c
commit a40d427bce
12 changed files with 124 additions and 34 deletions
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -139,7 +139,6 @@ func New(c fs.Config) (model.Model, error) {
 	m := Model{
 		Layers: make([]DecoderLayer, c.Uint("block_count")),
 		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
@@ -152,6 +151,7 @@ func New(c fs.Config) (model.Model, error) {
 					c.Ints("tokenizer.ggml.eos_token_ids")...,
 				),
 			},
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
 		),
 		Options: Options{
 			hiddenSize: int(c.Uint("embedding_length")),