@@ -144,44 +144,10 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
144144
145145// RegexPreTokenizer ///////////////////////////////////////////////////////////
146146
namespace {

// Helpers that make Hugging Face Split patterns RE2-compatible by:
//   1) removing the negative look-ahead "\s+(?!\S)" (-> "\s+$")
//   2) expanding the inline case-insensitive contractions
//      "(?i:'s|'t|'re|'ve|'m|'ll|'d)" into an explicit alternation.

/// Replaces every non-overlapping occurrence of `needle` in `input` with
/// `replacement`, scanning left to right.
///
/// @param input        String that is modified in place.
/// @param needle       Text to search for; an empty needle is a no-op.
/// @param replacement  Text substituted for each occurrence.
static void replace_all_in_place(
    std::string& input,
    const std::string& needle,
    const std::string& replacement) {
  // An empty needle matches at every position and would never terminate.
  if (needle.empty()) {
    return;
  }
  std::string::size_type search_pos = 0;
  while ((search_pos = input.find(needle, search_pos)) != std::string::npos) {
    input.replace(search_pos, needle.size(), replacement);
    // Resume after the inserted text so a replacement that itself contains
    // the needle is not rewritten again.
    search_pos += replacement.size();
  }
}

/// Returns a copy of `pattern` with the two constructs described in the
/// header comment above rewritten; any other text is returned untouched.
///
/// @param pattern  Regex source text (taken by value and edited locally).
/// @return         The rewritten pattern.
static std::string make_re2_compatible(std::string pattern) {
  // The literals must match the pattern text exactly: no padding whitespace,
  // otherwise the needles never match and the rewrite silently does nothing.
  const std::string lookahead_trailing_space = R"(\s+(?!\S))";
  const std::string trailing_space_replacement = R"(\s+$)";
  replace_all_in_place(
      pattern, lookahead_trailing_space, trailing_space_replacement);

  const std::string ci_contractions = R"((?i:'s|'t|'re|'ve|'m|'ll|'d))";
  // Per-letter character classes keep the expansion truly case-insensitive:
  // mixed-case forms such as 'Re or 'lL match, exactly as under (?i:...).
  const std::string contractions_expanded =
      "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])";
  replace_all_in_place(pattern, ci_contractions, contractions_expanded);
  return pattern;
}

} // namespace
180-
// Returns the regex object that create_regex() produces for `pattern`.
// Precondition: `pattern` is non-empty (checked by assert only).
// NOTE(review): this hunk shows both sides of a change. The "-" return line
// rewrote the pattern with make_re2_compatible() before compiling it; the "+"
// return line hands `pattern` to create_regex() unchanged.
// TK_UNWRAP_THROW presumably throws when create_regex() reports an error;
// confirm against the macro definition.
181147std::unique_ptr<IRegex> RegexPreTokenizer::create_regex_ (
182148 const std::string& pattern) {
183149 assert (!pattern.empty ());
184- return TK_UNWRAP_THROW (create_regex (make_re2_compatible ( pattern) ));
150+ return TK_UNWRAP_THROW (create_regex (pattern));
185151}
186152
187153std::vector<std::string> RegexPreTokenizer::pre_tokenize (
0 commit comments