@@ -780,6 +780,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
             res = "deepseek-v3"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct
+            res = "deepseek-v3"
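+            # NOTE: Kimi-K2 is mapped onto the existing "deepseek-v3" pre-tokenizer rules rather than a dedicated entry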
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
@@ -5562,8 +5565,69 @@ def prepare_tensors(self):
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        print("yeahdongcn: __init__")
+        # For handling tied embeddings
+        self._tok_embd = None
+
+
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        print("yeahdongcn: set_vocab")
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        print(f"yeahdongcn: tokenizer={tokenizer}")
+        # mergeable_ranks = tokenizer.mergeable_ranks
+        # for token, rank in mergeable_ranks.items():
+        #     vocab[QwenModel.token_bytes_to_string(token)] = rank
+        #     if len(token) == 1:
+        #         continue
+        #     merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+        #     if len(merged) == 2:  # todo this is an assert in Qwen, why?
+        #         merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+        # Hardcoded so the merges list is not empty
+        merges.append("<|endoftext|> <|endoftext|>")
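+        # TODO: replace this placeholder once the merges can be rebuilt from mergeable_ranks (commented-out block above)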
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        print(f"yeahdongcn: vocab_size={vocab_size}")
+        print(f"yeahdongcn: tokenizer.vocab_size={tokenizer.vocab_size}")
+        # assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
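+        # Walk ids 0..vocab_size-1 so the token list is written in id order;
+        # ids missing from the vocab become "[PAD{i}]" placeholders marked UNUSED.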
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
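+        #    ("gpt2" marks the vocab as BPE for llama.cpp's tokenizer loader)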
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
     def set_gguf_parameters(self):
 
@@ -5610,6 +5674,9 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
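+        # Cache the token embeddings for the tied-embeddings handling set up in __init__.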
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
         # rename e_score_correction_bias tensors
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")