@@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
191191 return {};
192192 }
193193
194- // "\s+(?!\S)|\s+) "
194+ // "\s+(?!\S)|\s+"
195195 std::u32string_view Match_GPT2_Pattern_4 () {
196196 if ((m_text.size () >= 1 ) && (IsZ (m_text[0 ]))) {
197197 size_t i = 1 ;
198198 for (; i < m_text.size (); ++i) {
199199 if (!IsZ (m_text[i])) break ;
200200 }
201- if ((i > 1 ) && (i != m_text.size ())) { // \s+( ?!\S)
201+ if ((i > 1 ) && (i != m_text.size ())) { // ?!\S
202202 i--;
203203 std::u32string_view res = m_text.substr (0 , i);
204204 m_text = m_text.substr (i);
@@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
504504 OrtxStatus Compile (const std::string& regex) {
505505 // NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
506506 auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
507- {R"( (?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))" ,
508- &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
509507 {R"( [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)" ,
510508 &PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
511509 {R"( [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)" ,
512510 &PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
511+ {R"( (?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))" ,
512+ &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
513513 {R"( (?i:'s|'t|'re|'ve|'m|'ll|'d))" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
514- {R"( 's|'t|'re|'ve|'m|'ll|'d)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
515- {R"( [^\r\n\p{L}\p{N}]?\p{L}+)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
516- {R"( \p{N}{1,3})" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
514+ {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
517515 {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
518- {R"( \s*[ \r\n] +)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5 },
519- {R"( ?\p{L}+| ?\p{N}+ )" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_2 },
516+ {R"( [^ \r\n\p{L}\p{N}]?\p{L} +)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2 },
517+ {R"( 's|'t|'re|'ve|'m|'ll|'d )" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_1 },
520518 {R"( ?[^\s\p{L}\p{N}]+)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
521- {R"( \s+(?!\S)|\s +)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_4 },
519+ {R"( ?\p{L}+| ?\p{N} +)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_2 },
522520 {R"( [\p{L}]+|[\p{N}])" , &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
523521 {R"( [^\s\p{L}\p{N}]+)" , &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
524- {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
522+ {R"( \s+(?!\S)|\s+)" , &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
523+ {R"( \p{N}{1,3})" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
524+ {R"( \s*[\r\n]+)" , &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
525525 {R"( \p{N})" , &PreTokenizerWithRegEx::Match_General_Pattern_1},
526526 };
527527
528528 std::string regex_compound = regex;
529+ std::map<size_t , RegexMatchFunc> patterns_map; // ordered map: keeps the matchers sorted by each pattern's position in the original regex
529530 for (const auto & [pattern, func] : patterns) {
530531 auto pos = regex_compound.find (pattern);
531532 if (pos != std::string::npos) {
@@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
539540 continue ;
540541 }
541542 }
542-
543- activated_matchers_.push_back (func);
543+ auto original_pos = regex.find (pattern);
544+ assert (original_pos != std::string::npos);
545+ patterns_map[original_pos] = func;
544546 std::string regex_prefix;
545547 auto pattern_size = pattern.size ();
546548 if (pos > 0 ) { // remove the '|' at the end of the prefix
@@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
557559 regex_compound = regex_prefix + regex_compound.substr (pos + pattern_size);
558560 }
559561 }
562+ for (const auto & [_, func] : patterns_map) {
563+ activated_matchers_.push_back (func);
564+ }
560565
561566 if (regex_compound.size () > 0 ) {
562567 try {
0 commit comments