@@ -840,6 +840,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
840840 if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51" :
841841 # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
842842 res = "lfm2"
843+ if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890" :
844+ # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
845+ res = "kimi-k2"
843846
844847 if res is None :
845848 logger .warning ("\n " )
@@ -5739,7 +5742,58 @@ class DeepseekV2Model(TextModel):
57395742 model_arch = gguf .MODEL_ARCH .DEEPSEEK2
57405743
def set_vocab(self):
    """Write the tokenizer vocabulary for this model to the GGUF writer.

    The regular GPT-2 style export (``self._set_vocab_gpt2``) is tried first.
    If it raises, the tokenizer is loaded via ``transformers.AutoTokenizer``
    and, when its pre-tokenizer is identified as ``"kimi-k2"``, the token
    list, token types and BPE merges are rebuilt from the tokenizer's
    mergeable ranks and written out with tokenizer model ``"gpt2"``.

    Raises:
        NotImplementedError: if the fallback tokenizer's pre-tokenizer is
            anything other than ``"kimi-k2"``. The error is chained to the
            exception that made the GPT-2 export fail.
    """
    try:
        self._set_vocab_gpt2()
        return
    except Exception as err:
        # Deliberate best-effort: any failure here triggers the fallback
        # below. Keep the cause so it is not lost if the fallback cannot
        # handle this tokenizer either.
        gpt2_error = err
        logger.debug("GPT-2 vocab export failed (%r), trying AutoTokenizer fallback", err)

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
    tokpre = self.get_vocab_base_pre(tokenizer)

    if tokpre != "kimi-k2":
        raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") from gpt2_error

    # Build merges list using the approach similar to HunYuanMoE
    merges = []
    vocab = {}
    # NOTE(review): relies on the private `_mergeable_ranks` attribute of the
    # remote tokenizer implementation (presumably tiktoken-based) - confirm.
    mergeable_ranks = tokenizer.model._mergeable_ranks
    for token, rank in mergeable_ranks.items():
        vocab[QwenModel.token_bytes_to_string(token)] = rank
        if len(token) == 1:
            # Single-byte tokens are base symbols; they have no merge rule.
            continue
        merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
        if len(merged) == 2:
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

    # Build token list
    vocab_size = self.hparams["vocab_size"]
    special_tokens = tokenizer.special_tokens
    # Hoisted out of the loop: set membership instead of scanning
    # special_tokens.values() once per vocabulary id.
    special_ids = set(special_tokens.values())
    reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
    tokens: list[str] = []
    toktypes: list[int] = []

    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Ids without a token get a placeholder so that list positions
            # keep matching token ids.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.UNUSED)
            continue
        tokens.append(reverse_vocab[i])
        if i in special_ids:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.NORMAL)

    self.gguf_writer.add_tokenizer_model("gpt2")
    self.gguf_writer.add_tokenizer_pre(tokpre)
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_types(toktypes)
    self.gguf_writer.add_token_merges(merges)

    # Merges were written explicitly above, so SpecialVocab must not load them again.
    special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
    special_vocab.add_to_gguf(self.gguf_writer)
57435797
57445798 def set_gguf_parameters (self ):
57455799
0 commit comments