@@ -226,7 +226,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
226226 assert (offset_end <= cpts.size ());
227227 start = offset_end;
228228
229- auto _get_cpt = [&] (const size_t pos) -> char32_t {
229+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
230230 return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0 ;
231231 };
232232
@@ -253,18 +253,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253253 };
254254
255255 for (size_t pos = offset_ini; pos < offset_end; /* pos++*/ ) {
256- const char32_t cpt = _get_cpt (pos);
256+ const uint32_t cpt = _get_cpt (pos);
257257 const auto flags = _get_flags (pos);
258258
259259 // regex: 's|'t|'re|'ve|'m|'ll|'d
260260 if (cpt == ' \' ' && pos+1 < offset_end) {
261- char32_t cpt_next = _get_cpt (pos+1 );
261+ uint32_t cpt_next = _get_cpt (pos+1 );
262262 if (cpt_next == ' s' || cpt_next == ' t' || cpt_next == ' m' || cpt_next == ' d' ) {
263263 pos += _add_token (pos+2 );
264264 continue ;
265265 }
266266 if (pos+2 < offset_end) {
267- char32_t cpt_next_next = _get_cpt (pos+2 );
267+ uint32_t cpt_next_next = _get_cpt (pos+2 );
268268 if ((cpt_next == ' r' && cpt_next_next == ' e' ) ||
269269 (cpt_next == ' v' && cpt_next_next == ' e' ) ||
270270 (cpt_next == ' l' && cpt_next_next == ' l' )) {
@@ -344,7 +344,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
344344 assert (offset_end <= cpts.size ());
345345 start = offset_end;
346346
347- auto _get_cpt = [&] (const size_t pos) -> char32_t {
347+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
348348 return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0 ;
349349 };
350350
@@ -371,18 +371,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371371 };
372372
373373 for (size_t pos = offset_ini; pos < offset_end; /* pos++*/ ) {
374- const char32_t cpt = _get_cpt (pos);
374+ const uint32_t cpt = _get_cpt (pos);
375375 const auto flags = _get_flags (pos);
376376
377377 // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
378378 if (cpt == ' \' ' && pos+1 < offset_end) {
379- char32_t cpt_next = unicode_tolower (_get_cpt (pos+1 ));
379+ uint32_t cpt_next = unicode_tolower (_get_cpt (pos+1 ));
380380 if (cpt_next == ' s' || cpt_next == ' t' || cpt_next == ' m' || cpt_next == ' d' ) {
381381 pos += _add_token (pos+2 );
382382 continue ;
383383 }
384384 if (pos+2 < offset_end) {
385- char32_t cpt_next_next = unicode_tolower (_get_cpt (pos+2 ));
385+ uint32_t cpt_next_next = unicode_tolower (_get_cpt (pos+2 ));
386386 if ((cpt_next == ' r' && cpt_next_next == ' e' ) ||
387387 (cpt_next == ' v' && cpt_next_next == ' e' ) ||
388388 (cpt_next == ' l' && cpt_next_next == ' l' )) {
@@ -424,7 +424,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
424424 while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
425425 flags2 = _get_flags (++pos);
426426 }
427- char32_t cpt2 = _get_cpt (pos);
427+ uint32_t cpt2 = _get_cpt (pos);
428428 while (cpt2 == ' \r ' || cpt2 == ' \n ' ) {
429429 cpt2 = _get_cpt (++pos);
430430 }
@@ -435,7 +435,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
435435 size_t num_whitespaces = 0 ;
436436 size_t last_end_r_or_n = 0 ;
437437 while (_get_flags (pos+num_whitespaces).is_whitespace ) {
438- char32_t cpt2 = _get_cpt (pos+num_whitespaces);
438+ uint32_t cpt2 = _get_cpt (pos+num_whitespaces);
439439 if (cpt2 == ' \r ' || cpt2 == ' \n ' ) {
440440 last_end_r_or_n = pos + num_whitespaces + 1 ;
441441 }
@@ -626,7 +626,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
626626 return map.at (utf8);
627627}
628628
629- char32_t unicode_tolower (char32_t cp) {
629+ uint32_t unicode_tolower (uint32_t cp) {
630630 auto it = unicode_map_lowercase.find (cp);
631631 return it == unicode_map_lowercase.end () ? cp : it->second ;
632632}
0 commit comments