@@ -538,7 +538,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
@@ -738,9 +738,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
 
         if res is None:
             logger.warning("\n")
@@ -5025,7 +5022,7 @@ def set_vocab(self):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size",hparams.get("vocab_size"))
+        vocab_size = hparams.get("padded_vocab_size", hparams.get("vocab_size"))
         assert max(tokenizer.get_vocab().values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
@@ -5052,16 +5049,12 @@ def set_vocab(self):
 
                     if added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        # NOTE: this was added for Gemma.
-                        # Encoding and decoding the tokens above isn't sufficient for this case.
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+
                 else:
                     toktypes.append(gguf.TokenType.NORMAL)
                 tokens.append(token)
 
-        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
@@ -5076,11 +5069,9 @@ def set_vocab(self):
         # TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens.
         # Currently, llama.cpp only supports a one-to-one mapping.
         # This can lead to an issue where the model fails to terminate properly.
-        # I'm still unclear about how llama.cpp handles special_token and what the exact call chain is!
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|observation|>"])
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # You can see a temporary workaround here: https://github.com/ggml-org/llama.cpp/issues/9606
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
         # this one is usually not in config.json anyway
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
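
For context on the last hunk: the ids written as "eos", "eot" and "unk" are plain lookups into the tokenizer's added vocabulary, as the comment about llama.cpp's one-to-one special-token mapping explains. A minimal sketch of those lookups, assuming the THUDM/glm-4-9b-hf checkpoint referenced in the diff (illustration only, not part of the patch):

    from transformers import AutoTokenizer

    # Load the GLM-4 HF tokenizer; "THUDM/glm-4-9b-hf" is the checkpoint referenced in the diff above.
    tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-hf")

    # get_added_vocab() maps added-token strings to their ids, which is what the converter queries.
    added_vocab = tokenizer.get_added_vocab()

    for name, tok in (("eos", "<|endoftext|>"), ("eot", "<|user|>"), ("unk", "<|endoftext|>")):
        print(f"{name}: {tok!r} -> id {added_vocab[tok]}")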