Skip to content

Commit d80d7fc

Browse files
Kimi-K2
1 parent bf674c3 commit d80d7fc

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

src/llama-vocab.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
404404
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
405405
};
406406
break;
407+
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
408+
regex_exprs = {
409+
// Kimi-K2 trigger pattern: this expression activates the custom K2 handler in unicode.cpp.
410+
// The custom handler implements all of the K2 pre-tokenizer patterns, with proper Han character exclusion.
411+
"\\p{Han}+",
412+
};
413+
break;
407414
case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
408415
regex_exprs = {
409416
"\\p{N}+",
@@ -1665,6 +1672,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
16651672
tokenizer_pre == "hunyuan") {
16661673
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
16671674
clean_spaces = false;
1675+
} else if (
1676+
tokenizer_pre == "kimi-k2") {
1677+
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
1678+
clean_spaces = false;
16681679
} else {
16691680
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
16701681
}

0 commit comments

Comments
 (0)