Skip to content

Commit 2944de1

Browse files
shoumikhin authored and facebook-github-bot committed
Add Voxtral test. (meta-pytorch#136)
Summary: X-link: pytorch/executorch#14918 . Reviewed By: larryliu0820 Differential Revision: D84081392
1 parent 1b55911 commit 2944de1

File tree

1 file changed: +1 / -35 lines changed

src/pre_tokenizer.cpp

Lines changed: 1 addition & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -144,44 +144,10 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
144 144

145 145
// RegexPreTokenizer ///////////////////////////////////////////////////////////
146 146

147-
namespace {
148-
149-
// Make Hugging Face Split patterns RE2-compatible by:
150-
// 1) removing the negative look-ahead "\s+(?!\S)" (→ "\s+$")
151-
// 2) expanding the inline case-insensitive contractions
152-
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)" into explicit alternations.
153-
static void replace_all_in_place(
154-
std::string& input,
155-
const std::string& needle,
156-
const std::string& replacement) {
157-
if (needle.empty()) {
158-
return;
159-
}
160-
size_t search_pos = 0;
161-
while ((search_pos = input.find(needle, search_pos)) != std::string::npos) {
162-
input.replace(search_pos, needle.size(), replacement);
163-
search_pos += replacement.size();
164-
}
165-
}
166-
167-
static std::string make_re2_compatible(std::string pattern) {
168-
const std::string lookahead_trailing_space = R"(\s+(?!\S))";
169-
const std::string trailing_space_replacement = R"(\s+$)";
170-
replace_all_in_place(
171-
pattern, lookahead_trailing_space, trailing_space_replacement);
172-
const std::string ci_contractions = R"((?i:'s|'t|'re|'ve|'m|'ll|'d))";
173-
const std::string contractions_expanded =
174-
"(?:'s|'S|'t|'T|'re|'RE|'ve|'VE|'m|'M|'ll|'LL|'d|'D)";
175-
replace_all_in_place(pattern, ci_contractions, contractions_expanded);
176-
return pattern;
177-
}
178-
179-
} // namespace
180-
181 147
std::unique_ptr<IRegex> RegexPreTokenizer::create_regex_(
182 148
const std::string& pattern) {
183 149
assert(!pattern.empty());
184-
return TK_UNWRAP_THROW(create_regex(make_re2_compatible(pattern)));
150+
return TK_UNWRAP_THROW(create_regex(pattern));
185 151
}
186 152

187 153
std::vector<std::string> RegexPreTokenizer::pre_tokenize(

0 commit comments

Comments
 (0)