@@ -538,7 +538,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
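For context on this hunk: `trust_remote_code=True` lets `AutoTokenizer` import and run the tokenizer class bundled inside the model repository (ChatGLM-style checkpoints ship their own tokenizer implementation) instead of requiring a class already built into `transformers`. A minimal sketch of the call, with `./glm-4-9b-chat` as a hypothetical local checkout:

# Sketch only, not part of the diff. "./glm-4-9b-chat" is a hypothetical local model
# directory whose tokenizer is implemented by custom code bundled with the checkpoint.
from transformers import AutoTokenizer

dir_model = "./glm-4-9b-chat"

# Without trust_remote_code=True, transformers will not run the repo-provided tokenizer
# code and raises an error asking you to opt in; with it, the bundled class is loaded.
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)

print(len(tokenizer.get_vocab()), len(tokenizer.get_added_vocab()))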
@@ -5022,16 +5025,60 @@ def set_vocab(self):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"])
+        vocab_size = hparams.get("padded_vocab_size", hparams.get("vocab_size"))
         assert max(tokenizer.get_vocab().values()) < vocab_size
 
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.gguf_writer.add_tokenizer_model("gpt2")
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+
+        special_vocab = gguf.SpecialVocab(
+            self.dir_model,
+            load_merges=False,
+            n_vocab=vocab_size
+        )
         # only add special tokens when they were not already loaded from config.json
+
+        # TODO: llama.cpp maps each special-token role (e.g. eos) to exactly one token id, while a
+        # transformers tokenizer may associate several tokens with eos_token_id. Only a one-to-one
+        # mapping is supported, so the consecutive "eos" assignments below overwrite one another and
+        # the model can fail to terminate properly. The exact call chain in llama.cpp is still unclear.
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|observation|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"])
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
         # this one is usually not in config.json anyway
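The loop added in this hunk rebuilds the vocabulary directly from the Hugging Face tokenizer instead of calling `get_vocab_base()`: ids absent from the vocab become `[PAD{i}]` placeholders typed UNUSED, added tokens are normalized if needed and typed CONTROL or USER_DEFINED, and everything else is NORMAL. Below is a self-contained sketch of that classification pass, using a toy vocab and plain strings in place of `gguf.TokenType` so it runs without transformers or gguf installed:

# Toy data standing in for tokenizer.get_vocab() / get_added_vocab(); everything here is made up.
vocab_size = 8
reverse_vocab = {0: "hello", 1: "world", 2: "<|endoftext|>", 3: "\u2581foo", 5: "<|user|>"}
added_vocab = {"<|endoftext|>", "<|user|>", "\u2581foo"}   # tokens added on top of the base vocab
special_added = {"<|endoftext|>", "<|user|>"}              # added tokens flagged as special

tokens, toktypes = [], []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")                 # ids 4, 6, 7 become padding placeholders
        toktypes.append("UNUSED")
    else:
        token = reverse_vocab[i]
        if token in added_vocab:
            if token in special_added:
                toktypes.append("CONTROL")
            else:
                token = token.replace("\u2581", " ")   # pre-normalize user-defined spaces
                toktypes.append("USER_DEFINED")
        else:
            toktypes.append("NORMAL")
        tokens.append(token)

print(list(zip(tokens, toktypes)))

Note also that `SpecialVocab._set_special_token` appears to store a single id per role, so of the three consecutive "eos" assignments only the last one (`<|endoftext|>`) survives; that is exactly the one-to-one limitation the TODO comment describes.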
@@ -5045,7 +5092,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
         self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")))
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
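A note on the `add_block_count` change: `dict.get` evaluates its default argument eagerly, so the old `self.hparams["num_hidden_layers"]` fallback raises `KeyError` on a ChatGLM-style config that defines `num_layers` but not `num_hidden_layers`; the `.get(...)` form tolerates either key. A small sketch with toy config dicts (not the real model files):

# Illustrative configs only: ChatGLM-style vs HF GLM-4-style layer-count keys.
chatglm_style = {"num_layers": 28}
hf_style = {"num_hidden_layers": 40}

def block_count_old(hparams):
    # The default expression is evaluated before the lookup, so this raises
    # KeyError for chatglm_style even though "num_layers" is present.
    return hparams.get("num_layers", hparams["num_hidden_layers"])

def block_count_new(hparams):
    # Both lookups are tolerant; whichever key exists is used.
    return hparams.get("num_layers", hparams.get("num_hidden_layers"))

print(block_count_new(chatglm_style))   # 28
print(block_count_new(hf_style))        # 40
try:
    block_count_old(chatglm_style)
except KeyError as err:
    print("old form raises KeyError:", err)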