@@ -5563,7 +5563,53 @@ class DeepseekV2Model(TextModel):
55635563 model_arch = gguf .MODEL_ARCH .DEEPSEEK2
55645564
def set_vocab(self):
    """Write this model's tokenizer vocabulary to ``self.gguf_writer``.

    Checkpoints whose ``vocab_size`` hyper-parameter equals 163840 are
    treated as Kimi-K2: their tokenizer is loaded through ``transformers``
    and the token list, token types and BPE merges are rebuilt from the
    tokenizer's mergeable-rank table. Every other checkpoint is delegated
    to ``self._set_vocab_gpt2()``.
    """
    vocab_size = self.hparams["vocab_size"]
    if vocab_size != 163840:
        self._set_vocab_gpt2()
        return

    # Kimi-K2 model
    from transformers import AutoTokenizer

    # NOTE(review): trust_remote_code=True allows transformers to execute the
    # tokenizer code shipped inside the model directory; only convert
    # checkpoints from a trusted source.
    tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
    tokpre = "kimi-k2"  # TODO: add identifier hash

    # Build merges list using the approach similar to HunYuanMoE
    merges = []
    vocab = {}
    mergeable_ranks = tokenizer.model._mergeable_ranks
    for token, rank in mergeable_ranks.items():
        vocab[QwenModel.token_bytes_to_string(token)] = rank
        if len(token) == 1:
            # Length-1 tokens cannot be the result of a merge.
            continue
        merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
        # Only a split into exactly two parts is recorded as a merge rule.
        if len(merged) == 2:
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

    # Build token list
    special_tokens = tokenizer.special_tokens
    # Hoisted once so the per-id membership test below is O(1) instead of a
    # linear scan over the special-token ids on every iteration.
    special_ids = set(special_tokens.values())
    token_to_id = {**vocab, **special_tokens}
    reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in token_to_id.items()}
    tokens: list[str] = []
    toktypes: list[int] = []

    # Iterate over the model's declared vocabulary size (the value this branch
    # was selected on) so the emitted list always has exactly that many
    # entries; ids without a token are filled with unused placeholders.
    for i in range(vocab_size):
        if i not in reverse_vocab:
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.UNUSED)
            continue
        tokens.append(reverse_vocab[i])
        if i in special_ids:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.NORMAL)

    self.gguf_writer.add_tokenizer_model("gpt2")
    self.gguf_writer.add_tokenizer_pre(tokpre)
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_types(toktypes)
    self.gguf_writer.add_token_merges(merges)

    # Merges were written above, so SpecialVocab is told not to load them.
    special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
    special_vocab.add_to_gguf(self.gguf_writer)
55675613
55685614 def set_gguf_parameters (self ):
55695615
0 commit comments