@@ -37,7 +37,24 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
37
37
throw std::runtime_error (
38
38
" Missing pattern for PreTokenizer of type Split" );
39
39
}
40
- return PreTokenizer::Ptr (new RegexPreTokenizer (*pattern));
40
+
41
+ // Validate behavior parameter
42
+ std::string behavior_str = behavior ? *behavior : " " ;
43
+ if (!behavior_str.empty () && behavior_str != " MergedWithPrevious" ) {
44
+ throw std::runtime_error (
45
+ " Unsupported behavior '" + behavior_str +
46
+ " ' for Split PreTokenizer. Only 'MergedWithPrevious' is supported." );
47
+ }
48
+
49
+ // Validate invert parameter
50
+ bool invert_flag = invert ? *invert : false ;
51
+ if (invert_flag) {
52
+ throw std::runtime_error (
53
+ " invert=true is not supported for Split PreTokenizer. Only invert=false is supported." );
54
+ }
55
+
56
+ return PreTokenizer::Ptr (new RegexPreTokenizer (
57
+ *pattern, is_delimiter ? *is_delimiter : false , behavior_str));
41
58
}
42
59
if (type == " Digits" ) {
43
60
if (individual_digits) {
@@ -79,7 +96,27 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
79
96
if (type == " Split" ) {
80
97
try {
81
98
pattern = json_config.at (" pattern" ).at (" Regex" );
99
+ is_delimiter = false ;
100
+ } catch (json::out_of_range&) {
101
+ // "Regex" is not there, check "String", which is a delimiter
102
+ std::string delimiter = json_config.at (" pattern" ).at (" String" );
103
+ // For string patterns, escape regex special characters to treat them as
104
+ // literal strings (same as Rust's regex::escape)
105
+ pattern = IRegex::escape (delimiter);
106
+ is_delimiter = true ;
107
+ }
108
+
109
+ // Parse behavior and invert fields
110
+ try {
111
+ behavior = json_config.at (" behavior" );
112
+ } catch (json::out_of_range&) {
113
+ // behavior is optional, default to empty string
114
+ }
115
+
116
+ try {
117
+ invert = json_config.at (" invert" );
82
118
} catch (json::out_of_range&) {
119
+ // invert is optional, default to false
83
120
}
84
121
} else if (type == " Digits" ) {
85
122
try {
@@ -115,9 +152,66 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
115
152
const std::string& input) const {
116
153
if (!regex_)
117
154
return {};
155
+
118
156
std::vector<std::string> results;
119
- for (const auto & match : regex_->find_all (input)) {
120
- results.push_back (input.substr (match.start , match.end - match.start ));
157
+ auto matches = regex_->find_all (input);
158
+
159
+ if (!is_delimiter_) {
160
+ // Original behavior: return the matches themselves
161
+ for (const auto & match : matches) {
162
+ results.push_back (input.substr (match.start , match.end - match.start ));
163
+ }
164
+ } else {
165
+ // Delimiter behavior
166
+ if (matches.empty ()) {
167
+ // No matches found, return the entire input
168
+ results.push_back (input);
169
+ return results;
170
+ }
171
+
172
+ if (behavior_ == " MergedWithPrevious" ) {
173
+ // MergedWithPrevious: Include delimiter with previous token
174
+ // Example: "the-final--countdown" with delimiter "-"
175
+ // -> ["the-", "final-", "-", "countdown"]
176
+ size_t last_end = 0 ;
177
+
178
+ for (size_t i = 0 ; i < matches.size (); ++i) {
179
+ const auto & match = matches[i];
180
+
181
+ // Add text before the match plus the delimiter
182
+ if (match.start > last_end) {
183
+ std::string token = input.substr (last_end, match.end - last_end);
184
+ results.push_back (token);
185
+ } else {
186
+ // Only delimiter, no preceding text
187
+ std::string delimiter =
188
+ input.substr (match.start , match.end - match.start );
189
+ results.push_back (delimiter);
190
+ }
191
+
192
+ last_end = match.end ;
193
+ }
194
+
195
+ // Add remaining text after the last match (if any)
196
+ if (last_end < input.length ()) {
197
+ results.push_back (input.substr (last_end));
198
+ }
199
+ } else {
200
+ // Default delimiter behavior (split on delimiters)
201
+ size_t last_end = 0 ;
202
+ for (const auto & match : matches) {
203
+ // Add text before the match (if any)
204
+ if (match.start > last_end) {
205
+ results.push_back (input.substr (last_end, match.start - last_end));
206
+ }
207
+ last_end = match.end ;
208
+ }
209
+
210
+ // Add remaining text after the last match (if any)
211
+ if (last_end < input.length ()) {
212
+ results.push_back (input.substr (last_end));
213
+ }
214
+ }
121
215
}
122
216
return results;
123
217
}
0 commit comments