Skip to content

Commit bfeb3dd

Browse files
authored
Fix phi-4 regex pattern handling in the tokenizer (#905)
1 parent d50723a commit bfeb3dd

File tree

2 files changed

+20
-15
lines changed

2 files changed

+20
-15
lines changed

operators/tokenizer/bpe_utils.hpp

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
191191
return {};
192192
}
193193

194-
// "\s+(?!\S)|\s+)"
194+
// "\s+(?!\S)|\s+"
195195
std::u32string_view Match_GPT2_Pattern_4() {
196196
if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
197197
size_t i = 1;
198198
for (; i < m_text.size(); ++i) {
199199
if (!IsZ(m_text[i])) break;
200200
}
201-
if ((i > 1) && (i != m_text.size())) { //\s+(?!\S)
201+
if ((i > 1) && (i != m_text.size())) { // ?!\S
202202
i--;
203203
std::u32string_view res = m_text.substr(0, i);
204204
m_text = m_text.substr(i);
@@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
504504
OrtxStatus Compile(const std::string& regex) {
505505
// NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
506506
auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
507-
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
508-
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
509507
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
510508
&PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
511509
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
512510
&PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
511+
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
512+
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
513513
{R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
514-
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
515-
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
516-
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
514+
{R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
517515
{R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
518-
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
519-
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
516+
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
517+
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
520518
{R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
521-
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
519+
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
522520
{R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
523521
{R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
524-
{R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
522+
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
523+
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
524+
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
525525
{R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
526526
};
527527

528528
std::string regex_compound = regex;
529+
std::map<size_t, RegexMatchFunc> patterns_map; // using a map to keep the pattern matchers ordered
529530
for (const auto& [pattern, func] : patterns) {
530531
auto pos = regex_compound.find(pattern);
531532
if (pos != std::string::npos) {
@@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
539540
continue;
540541
}
541542
}
542-
543-
activated_matchers_.push_back(func);
543+
auto original_pos = regex.find(pattern);
544+
assert(original_pos != std::string::npos);
545+
patterns_map[original_pos] = func;
544546
std::string regex_prefix;
545547
auto pattern_size = pattern.size();
546548
if (pos > 0) { // remove the '|' at the end of the prefix
@@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
557559
regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
558560
}
559561
}
562+
for (const auto& [_, func] : patterns_map) {
563+
activated_matchers_.push_back(func);
564+
}
560565

561566
if (regex_compound.size() > 0) {
562567
try {

test/test_pp_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
1111
is_pp_api_available = False
1212
hf_token_id = None
13-
phi4_model_local_path = None
13+
phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
1414
try:
1515
from transformers import AutoImageProcessor, AutoTokenizer
1616
from onnxruntime_extensions import pp_api
@@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
201201
def test_Phi4_tokenizer(self):
202202
model_id = phi4_model_local_path
203203
test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
204-
hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
204+
hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
205205
inputs = hf_enc(test_sentence)["input_ids"]
206206
tokenizer = pp_api.Tokenizer(model_id)
207207
ortx_inputs = tokenizer.tokenize(test_sentence)

0 commit comments

Comments (0)