@@ -503,7 +503,9 @@ namespace vl
503503 public:
504504 Tokenizer (const Config &config)
505505 : Tokenizer(config, &_chat_encoder)
506- {}
506+ {
507+ auto_add_bos = false ;
508+ }
507509
508510 Tokenizer (const Config &config, BaseHistoryEncoder *chat_encoder)
509511 : BaseTokenizer(config, chat_encoder, nullptr , nullptr )
@@ -515,17 +517,19 @@ namespace vl
515517 {
516518 tp = new tokenizer::BPEProcessor2 (
517519 {
518- " [\\ p{Han}]+" ,
519520 // FIXME: support && (regex character-class intersection, used by the commented-out patterns below to exclude \\p{Han})
521+ // "[\\p{Han}]+",
520522 // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
521- // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
522- " [^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}]+[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
523- " [^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}]*[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
523+ // "[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
524+
525+ " (?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])" ,
526+ " [^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+" ,
527+
524528 " \\ p{N}{1,3}" ,
525529 " ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*" ,
526530 " \\ s*[\\ r\\ n]+" ,
527531 " \\ s+(?!\\ S)" ,
528- " \\ +" ,
532+ // "\\s +",
529533 }
530534 );
531535 size_t size = tp->Load (buffer, n_vocab);
0 commit comments