@@ -2737,13 +2737,15 @@ def set_vocab(self):

            text = piece.encode("utf-8")
            score = 0.0
-            if len(piece) != 0 and token_id < 64789:
+            # Referencing the tokenizer's Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # a token id is only valid for the SentencePiece model if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                score = tokenizer.tokenizer.sp_model.get_score(token_id)

            if len(piece) == 0:
                text = f"[PAD{token_id}]".encode("utf-8")

-            if token_id >= 64789:
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                toktype = SentencePieceTokenTypes.UNKNOWN
                tokens.append(text)
                scores.append(score)
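The boundary being replaced here can be checked against the SentencePiece model directly. A minimal sketch using the sentencepiece package, assuming a local checkout of THUDM/chatglm3-6b containing tokenizer.model (the path is hypothetical); the removed hardcoded 64789 corresponds to what vocab_size() returns for this model:

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor(model_file="chatglm3-6b/tokenizer.model")

    n = sp.vocab_size()  # derived from the model rather than hardcoded
    for token_id in (0, n - 1, n):
        if token_id < n:
            # ids inside the SentencePiece model have a piece and a score
            print(token_id, sp.id_to_piece(token_id), sp.get_score(token_id))
        else:
            # ids at or past vocab_size() are added special tokens; the converter
            # marks them SentencePieceTokenTypes.UNKNOWN and leaves score at 0.0
            print(token_id, "outside sp_model vocab")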
@@ -2773,7 +2775,7 @@ def set_vocab(self):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
-        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        self.gguf_writer.add_name(self.dir_model.name)
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
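Using self.dir_model.name instead of a fixed string means the GGUF name follows whichever model directory was passed to the converter. A small sketch of the pathlib behavior being relied on (the directory path is illustrative):

    from pathlib import Path

    # .name is the last path component, so a checkout at ./THUDM/chatglm3-6b
    # is recorded in the GGUF metadata as "chatglm3-6b"
    dir_model = Path("THUDM/chatglm3-6b")
    print(dir_model.name)  # chatglm3-6b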
@@ -2789,16 +2791,12 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_add_bos_token(False)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.endswith(".rotary_pos_emb.inv_freq"):
-            return []
-
        del bid  # unused

-        name = re.sub(r'transformer\.', '', name)
-
-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []

+        name = name.removeprefix("transformer.")
        return [(self.map_tensor_name(name), data_torch)]

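Note that str.removeprefix is not a drop-in equivalent of the old re.sub call: re.sub(r'transformer\.', '', name) deletes every occurrence of "transformer." anywhere in the string, while removeprefix (Python >= 3.9) strips it at most once and only at the start, which matches the intent for tensor names of the form "transformer.<rest>". A quick illustration with a representative ChatGLM-style tensor name:

    import re

    name = "transformer.encoder.layers.0.self_attention.query_key_value.weight"

    # old behavior: removes the substring wherever it appears
    print(re.sub(r'transformer\.', '', name))
    # new behavior: strips only a leading prefix
    print(name.removeprefix("transformer."))

    # the two agree on ordinary tensor names, but removeprefix is safer for a
    # name that merely contains "transformer." somewhere other than the start
    print("model.transformer.bias".removeprefix("transformer."))  # unchanged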