Skip to content

Commit 9f55139

Browse files
committed
Rollback due to incomplete detokenizer fix
1 parent ced723d commit 9f55139

File tree

1 file changed

+10
-8
lines changed

1 file changed

+10
-8
lines changed

base/unicode.cpp

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -232,7 +232,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
232232
};
233233

234234
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
235-
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
235+
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
236+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
236237
};
237238

238239
size_t _prev_end = offset_ini;
@@ -294,9 +295,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
294295
continue;
295296
}
296297
// regex: <space>?[^\s\p{L}\p{N}]+
297-
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
298+
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
298299
pos += (cpt == ' ');
299-
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
300+
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
300301
flags2 = _get_flags(++pos);
301302
}
302303
_add_token(pos);
@@ -350,7 +351,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
350351
};
351352

352353
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
353-
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
354+
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
355+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
354356
};
355357

356358
size_t _prev_end = offset_ini;
@@ -392,8 +394,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
392394
}
393395
}
394396

395-
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
396-
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
397+
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
398+
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
397399
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
398400
pos++;
399401
while (_get_flags(pos).is_letter) {
@@ -419,9 +421,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
419421

420422
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
421423
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
422-
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
424+
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
423425
pos += (cpt == ' ');
424-
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
426+
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
425427
flags2 = _get_flags(++pos);
426428
}
427429
uint32_t cpt2 = _get_cpt(pos);

0 commit comments

Comments
 (0)