@@ -612,31 +612,31 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
612612 bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han (cpt)) ||
613613 (!(cpt == ' \r ' || cpt == ' \n ' || flags.is_letter || flags.is_number ) &&
614614 _get_flags (pos + 1 ).is_letter && !unicode_cpt_is_han (_get_cpt (pos + 1 )));
615-
615+
616616 if (is_letter_pattern) {
617617 // Handle optional leading non-letter/non-number character
618618 bool has_leading_char = false ;
619619 if (!(cpt == ' \r ' || cpt == ' \n ' || flags.is_letter || flags.is_number )) {
620620 has_leading_char = true ;
621621 pos++;
622622 }
623-
623+
624624 // Match letter sequence (excluding Han characters)
625625 bool has_letters = false ;
626626 while (_get_flags (pos).is_letter && !unicode_cpt_is_han (_get_cpt (pos))) {
627627 has_letters = true ;
628628 pos++;
629629 }
630-
630+
631631 // Only proceed if we found letters (after potentially skipping leading char)
632632 if (has_letters || (!has_leading_char && _get_flags (pos).is_letter && !unicode_cpt_is_han (_get_cpt (pos)))) {
633633 if (!has_letters) pos++; // consume the first letter if we didn't already
634-
634+
635635 // Continue consuming letters
636636 while (_get_flags (pos).is_letter && !unicode_cpt_is_han (_get_cpt (pos))) {
637637 pos++;
638638 }
639-
639+
640640 // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d)
641641 if (_get_cpt (pos) == ' \' ' && pos + 1 < offset_end) {
642642 uint32_t cpt_next = unicode_tolower (_get_cpt (pos + 1 ));
@@ -651,7 +651,7 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
651651 }
652652 }
653653 }
654-
654+
655655 _add_token (pos);
656656 continue ;
657657 } else if (has_leading_char) {
// Reports whether `cpt` lies inside one of the CJK ideograph blocks listed
// below. The Kimi-K2 custom splitter in this file uses this predicate to keep
// Han characters out of its "letter sequence" matches.
//
// @param cpt  Unicode code point to classify.
// @return     true when `cpt` falls in any listed block (bounds inclusive),
//             false otherwise.
//
// NOTE(review): only the blocks listed here are recognised. Blocks outside
// these ranges (for example anything at U+30000 and above) are reported as
// non-Han; confirm that this matches the tokenizer's intended definition.
bool unicode_cpt_is_han(uint32_t cpt) {
    struct cpt_range {
        uint32_t first; // inclusive lower bound
        uint32_t last;  // inclusive upper bound
    };

    // The ranges are disjoint, so ordering does not affect the result; the
    // main Unified Ideographs block is simply checked first.
    static const cpt_range han_ranges[] = {
        { 0x4E00,  0x9FFF  }, // CJK Unified Ideographs (most common)
        { 0x3400,  0x4DBF  }, // CJK Extension A
        { 0x20000, 0x2A6DF }, // CJK Extension B
        { 0x2A700, 0x2B73F }, // CJK Extension C
        { 0x2B740, 0x2B81F }, // CJK Extension D
        { 0x2B820, 0x2CEAF }, // CJK Extension E
        { 0x2CEB0, 0x2EBEF }, // CJK Extension F
        { 0xF900,  0xFAFF  }, // CJK Compatibility Ideographs
        { 0x2F800, 0x2FA1F }, // CJK Compatibility Ideographs Supplement
    };

    for (const cpt_range & range : han_ranges) {
        if (cpt >= range.first && cpt <= range.last) {
            return true;
        }
    }

    return false;
}
881881
0 commit comments