@@ -232,7 +232,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
232232 };
233233
// Bounds-checked lookup of the flags for the code point at index `pos`.
// When offset_ini <= pos < offset_end, the result is unicode_cpt_flags(cpts[pos]).
// For any other `pos`, a fallback value is returned instead of indexing `cpts`.
// NOTE(review): this span appears to be a rendered diff. The line marked `-` returns a
// default-constructed codepoint_flags as the fallback; the lines marked `+` return a
// function-local static constructed from codepoint_flags::UNDEFINED. Confirm against the real file.
234234 auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
235- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags{};
235+ static const codepoint_flags undef (codepoint_flags::UNDEFINED);
236+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : undef;
236237 };
237238
238239 size_t _prev_end = offset_ini;
@@ -294,9 +295,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
294295 continue ;
295296 }
296297 // regex: <space>?[^\s\p{L}\p{N}]+
297- if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) {
298+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
298299 pos += (cpt == ' ' );
299- while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) {
300+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
300301 flags2 = _get_flags (++pos);
301302 }
302303 _add_token (pos);
@@ -350,7 +351,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
350351 };
351352
// Bounds-checked lookup of the flags for the code point at index `pos`.
// When offset_ini <= pos < offset_end, the result is unicode_cpt_flags(cpts[pos]).
// For any other `pos`, a fallback value is returned instead of indexing `cpts`.
// NOTE(review): this span appears to be a rendered diff. The line marked `-` returns a
// default-constructed codepoint_flags as the fallback; the lines marked `+` return a
// function-local static constructed from codepoint_flags::UNDEFINED. Confirm against the real file.
352353 auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
353- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags{};
354+ static const codepoint_flags undef (codepoint_flags::UNDEFINED);
355+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : undef;
354356 };
355357
356358 size_t _prev_end = offset_ini;
@@ -392,8 +394,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
392394 }
393395 }
394396
395- // regex: [^\r\n\p{L}\p{N}]?\p{L}+
396- if (!(cpt == ' \r ' || cpt == ' \n ' || flags.is_number )) {
397+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: is the first \p{L} correct?
398+ if (!(cpt == ' \r ' || cpt == ' \n ' || /* flags.is_letter | */ flags.is_number )) {
397399 if (flags.is_letter || _get_flags (pos+1 ).is_letter ) { // one or more letters
398400 pos++;
399401 while (_get_flags (pos).is_letter ) {
@@ -419,9 +421,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
419421
420422 // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
421423 auto flags2 = (cpt == ' ' ? _get_flags (pos+1 ) : flags);
422- if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags. as_uint ( )) {
424+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2. is_undefined )) {
423425 pos += (cpt == ' ' );
424- while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) {
426+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
425427 flags2 = _get_flags (++pos);
426428 }
427429 uint32_t cpt2 = _get_cpt (pos);
0 commit comments