1 file changed: +3 -3 lines changed
@@ -557,7 +557,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
557557 return bpe_offsets;
558558}
559559
560- // K2 system regex patterns (from tokenization_kimi.py):
560+ // K2 system regex patterns (from tokenization_kimi.py):
561561// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
562562static std::vector<size_t > unicode_regex_split_custom_kimi_k2 (const std::string & text, const std::vector<size_t > & offsets) {
563563 std::vector<size_t > bpe_offsets;
@@ -610,7 +610,7 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
610610 // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
611611 // Check if current char is a letter OR if current char could be a leading char and next char is a letter
612612 bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han (cpt)) ||
613- (!(cpt == ' \r ' || cpt == ' \n ' || flags.is_letter || flags.is_number ) &&
613+ (!(cpt == ' \r ' || cpt == ' \n ' || flags.is_letter || flags.is_number ) &&
614614 _get_flags (pos + 1 ).is_letter && !unicode_cpt_is_han (_get_cpt (pos + 1 )));
615615
616616 if (is_letter_pattern) {
@@ -861,7 +861,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
861861 // CJK Extension C
862862 if (cpt >= 0x2A700 && cpt <= 0x2B73F ) return true ;
863863
864- // CJK Extension D
864+ // CJK Extension D
865865 if (cpt >= 0x2B740 && cpt <= 0x2B81F ) return true ;
866866
867867 // CJK Extension E
You can’t perform that action at this time.
0 commit comments