@@ -608,13 +608,12 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         added_tokens_decoder = tokenizer.added_tokens_decoder
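
The assert kept above guards the id-to-token table built from the vocab: every id must index into a list of length vocab_size, and any gap is filled with a placeholder token (the same pattern the second hunk inlines below). A minimal standalone sketch of that pattern, with a made-up dict and size purely for illustration:

# Hypothetical token -> id mapping and padded vocab size, for illustration only.
vocab = {"hello": 0, "world": 2}
vocab_size = 4

# Same guard as in get_vocab_base(): every id must fit below vocab_size.
assert max(vocab.values()) < vocab_size

# Invert to id -> token and fill the holes with [PAD<i>] placeholders.
reverse_vocab = {id_: tok for tok, id_ in vocab.items()}
tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
print(tokens)  # ['hello', '[PAD1]', 'world', '[PAD3]']
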
@@ -3212,7 +3211,45 @@ def set_vocab(self):
         self._set_vocab_gpt2()
 
     def _set_vocab_interns1(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
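
The encode/decode round-trip in the added block re-normalizes added tokens whose added_tokens_decoder entry has normalized set to False, because llama.cpp expects CONTROL and USER_DEFINED tokens to arrive pre-normalized. A rough standalone sketch of that round-trip; "gpt2" is only a stand-in model id here (the converter loads the model's own tokenizer from self.dir_model), and the token string is hypothetical:

from transformers import AutoTokenizer

# Placeholder model id; a byte-level BPE like gpt2 usually round-trips losslessly,
# while tokenizers with a normalizer (e.g. NFC or whitespace handling) may rewrite the text.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

token = " <extra_token>"  # hypothetical added token with a leading space
normalized = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if normalized != token:
    print(f"{token!r} re-normalized to {normalized!r}")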