Skip to content

Commit ecb345f

Browse files
fix whitespaces
1 parent b023e53 commit ecb345f

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

src/unicode.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
557557
return bpe_offsets;
558558
}
559559

560-
// K2 system regex patterns (from tokenization_kimi.py):
560+
// K2 system regex patterns (from tokenization_kimi.py):
561561
// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
562562
static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
563563
std::vector<size_t> bpe_offsets;
@@ -610,7 +610,7 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
610610
// [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
611611
// Match if the current char is a non-Han letter, OR if it is a valid leading char (not \r, \n, a letter, or a number) and the next char is a non-Han letter
612612
bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
613-
(!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
613+
(!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
614614
_get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));
615615

616616
if (is_letter_pattern) {
@@ -861,7 +861,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
861861
// CJK Extension C
862862
if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true;
863863

864-
// CJK Extension D
864+
// CJK Extension D
865865
if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true;
866866

867867
// CJK Extension E

0 commit comments

Comments
 (0)