@@ -840,6 +840,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct
+            res = "kimi-k2"
 
         if res is None:
             logger.warning("\n")
@@ -5563,7 +5566,76 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
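+
+        # Kimi-K2 ships a tiktoken-based tokenizer rather than a plain BPE
+        # tokenizer.json, so the fast path above fails for it; fall back to
+        # the repo's custom tokenizer code.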
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
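+        # get_vocab_base_pre() hashes the tokenizer's output on a test string
+        # to pick a pre-tokenizer name ("kimi-k2" was registered above).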
+        tokpre = self.get_vocab_base_pre(tokenizer)
+        merges = []
+        vocab = {}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        if tokpre == "kimi-k2":
+            # Adapted from the Hunyuan tokenizer conversion
+            # 1. Reverse-engineer the merges list from mergeable_ranks
+            from tiktoken.load import load_tiktoken_bpe
+            mergeable_ranks = load_tiktoken_bpe(tokenizer.vocab_file)
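+            # tiktoken ships only a bytes -> rank table; each merge is recovered
+            # by re-running BPE on a token's bytes using lower-ranked merges only,
+            # which leaves exactly the two parts that formed it (QwenModel.bpe).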
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:  # TODO: this is an assert in Qwen, why?
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # 2. Generate the tokens and toktypes lists
+            vocab_size = self.hparams["vocab_size"]
+            assert tokenizer.vocab_size == vocab_size
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
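+            # Ids absent from the vocab become [PADn] placeholders; special
+            # tokens are marked CONTROL, everything else NORMAL.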
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            # 3. Add special tokens and chat templates
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+            # FIX: Kimi-K2 does not add a BOS token
+            self.gguf_writer.add_bos_token(False)
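+            # (this writes tokenizer.ggml.add_bos_token, so llama.cpp will not
+            # prepend a BOS token at inference time)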
+        else:
+            raise NotImplementedError(f"{self.dir_model} is not supported yet!")
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
 
     def set_gguf_parameters(self):
 
@@ -6973,6 +7037,8 @@ def set_vocab(self):
         special_vocab.add_to_gguf(self.gguf_writer)
         # FIX for BOS token: Overwrite incorrect id read from config.json
         self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+        # FIX: Hunyuan does not add a BOS token
+        self.gguf_writer.add_bos_token(False)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()