@@ -428,23 +428,21 @@ struct llm_tokenizer_bpe : llm_tokenizer {
428428            case  LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
429429                //  Same as GPT-4o tokenizer except for Han characters [\\p{Han}]+
430430                regex_exprs = {
431-                     //  1. Add the high-priority  Han character rule. Backslashes must be escaped. 
431+                     //  1. Han characters 
432432                    " [\\ p{Han}]+"  ,
433- 
434-                     //  2 & 3. Use the adapted word patterns from GPT4O/Tekken, which emulate the uppercase/lowercase logic in a C++-compatible way. 
435-                     //  We also adapt the case-insensitive contraction to be C++ compatible.
436-                     " [^\\ r\\ n\\ p{L}\\ p{N}]?((?=[\\ p{L}])([^a-z]))*((?=[\\ p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?"  ,
437-                     " [^\\ r\\ n\\ p{L}\\ p{N}]?((?=[\\ p{L}])([^a-z]))+((?=[\\ p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?"  ,
438- 
439-                     //  4. Add the number rule.
433+                     //  2. Words ending in lowercase (non-Han) with contractions
434+                     " [^\\ r\\ n\\ p{L}\\ p{N}]?(?:(?![\\ p{Han}])[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}])*(?:(?![\\ p{Han}])[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}])+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?"  ,
435+                     //  3. Words starting with uppercase (non-Han) with contractions
436+                     " [^\\ r\\ n\\ p{L}\\ p{N}]?(?:(?![\\ p{Han}])[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}])+(?:(?![\\ p{Han}])[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}])*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?"  ,
437+                     //  4. Numbers (1-3 digits)
440438                    " \\ p{N}{1,3}"  ,
441- 
442-                     //  5. Use the Kimi K2 symbol rule precisely (no trailing '/').
439+                     //  5. Punctuation and symbols
443440                    "  ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*"  ,
444- 
445-                     //  6, 7, 8. Add the identical whitespace rules.
441+                     //  6. Newlines, together with any whitespace preceding them
446442                    " \\ s*[\\ r\\ n]+"  ,
443+                     //  7. Whitespace that is not followed by a non-whitespace character
447444                    " \\ s+(?!\\ S)"  ,
445+                     //  8. General whitespace
448446                    " \\ s+"  ,
449447                };
450448                break ;
0 commit comments