Skip to content

Commit bfeb3dd

Browse files
authored
Fix phi-4 regex pattern handling in the tokenizer (#905)
1 parent d50723a commit bfeb3dd

File tree

2 files changed

+20
-15
lines changed

2 files changed

+20
-15
lines changed

operators/tokenizer/bpe_utils.hpp

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
191191
return {};
192192
}
193193

194-
// "\s+(?!\S)|\s+)"
194+
// "\s+(?!\S)|\s+"
195195
std::u32string_view Match_GPT2_Pattern_4() {
196196
if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
197197
size_t i = 1;
198198
for (; i < m_text.size(); ++i) {
199199
if (!IsZ(m_text[i])) break;
200200
}
201-
if ((i > 1) && (i != m_text.size())) { //\s+(?!\S)
201+
if ((i > 1) && (i != m_text.size())) { // ?!\S
202202
i--;
203203
std::u32string_view res = m_text.substr(0, i);
204204
m_text = m_text.substr(i);
@@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
504504
OrtxStatus Compile(const std::string& regex) {
505505
// NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
506506
auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
507-
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
508-
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
509507
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
510508
&PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
511509
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
512510
&PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
511+
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
512+
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
513513
{R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
514-
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
515-
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
516-
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
514+
{R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
517515
{R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
518-
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
519-
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
516+
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
517+
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
520518
{R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
521-
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
519+
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
522520
{R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
523521
{R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
524-
{R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
522+
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
523+
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
524+
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
525525
{R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
526526
};
527527

528528
std::string regex_compound = regex;
529+
std::map<size_t, RegexMatchFunc> patterns_map; // using a map to keep the pattern matchers ordered
529530
for (const auto& [pattern, func] : patterns) {
530531
auto pos = regex_compound.find(pattern);
531532
if (pos != std::string::npos) {
@@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
539540
continue;
540541
}
541542
}
542-
543-
activated_matchers_.push_back(func);
543+
auto original_pos = regex.find(pattern);
544+
assert(original_pos != std::string::npos);
545+
patterns_map[original_pos] = func;
544546
std::string regex_prefix;
545547
auto pattern_size = pattern.size();
546548
if (pos > 0) { // remove the '|' at the end of the prefix
@@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
557559
regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
558560
}
559561
}
562+
for (const auto& [_, func] : patterns_map) {
563+
activated_matchers_.push_back(func);
564+
}
560565

561566
if (regex_compound.size() > 0) {
562567
try {

test/test_pp_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
1111
is_pp_api_available = False
1212
hf_token_id = None
13-
phi4_model_local_path = None
13+
phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
1414
try:
1515
from transformers import AutoImageProcessor, AutoTokenizer
1616
from onnxruntime_extensions import pp_api
@@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
201201
def test_Phi4_tokenizer(self):
202202
model_id = phi4_model_local_path
203203
test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
204-
hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
204+
hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
205205
inputs = hf_enc(test_sentence)["input_ids"]
206206
tokenizer = pp_api.Tokenizer(model_id)
207207
ortx_inputs = tokenizer.tokenize(test_sentence)

0 commit comments

Comments (0)