@@ -1210,6 +1210,54 @@ def _try_set_pooling_type(self) -> None:
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens.
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
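
For orientation: only the duplicated helper body moves into the base class here; the `set_vocab` overrides in the Qwen3 classes below keep dispatching to it. A minimal sketch of that dispatch pattern, with assumed names (the detection flag is hypothetical and not part of this diff):

# Sketch only: the base class owns the shared InternS1 vocab path,
# and subclasses decide when to take it.
class TextModelSketch:
    def _set_vocab_interns1(self):
        print("shared InternS1 vocab path")

class Qwen3ModelSketch(TextModelSketch):
    is_interns1 = True  # hypothetical detection flag

    def set_vocab(self):
        if self.is_interns1:
            self._set_vocab_interns1()  # shared base-class path added above
            return
        print("default Qwen vocab path")

Qwen3ModelSketch().set_vocab()  # -> shared InternS1 vocab path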
@@ -3603,69 +3651,6 @@ def set_vocab(self):
 
         super().set_vocab()
 
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens.
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding='utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding='utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-        token2ids_map = {data['content']: int(token) for token, data in added_tokens_decoder.items() if data['special']}
-        for token in additional_special_tokens:
-            if token in token2ids_map:
-                special_vocab._set_special_token(token, token2ids_map[token])
-        special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
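
The heart of the helper, in both the added and removed copies above, is the loop that keeps the GGUF token list dense: every id up to vocab_size gets an entry, and ids missing from the tokenizer vocab are filled with [PAD{i}] placeholders typed UNUSED. A standalone toy illustration of that idea (string stand-ins replace the gguf.TokenType enum; no transformers dependency):

# Toy data with a deliberate gap at id 1 and a vocab_size beyond the last id.
vocab = {"hello": 0, "world": 2}
vocab_size = 4  # e.g. hparams["vocab_size"], which may exceed len(vocab)
reverse_vocab = {i: t for t, i in vocab.items()}

tokens, toktypes = [], []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")   # placeholder keeps the list dense
        toktypes.append("UNUSED")    # gguf.TokenType.UNUSED in the real code
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append("NORMAL")    # the real loop also emits CONTROL/USER_DEFINED

print(tokens)  # ['hello', '[PAD1]', 'world', '[PAD3]']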
@@ -3684,69 +3669,6 @@ def set_vocab(self):
 
         super().set_vocab()
 
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens.
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding='utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding='utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-        token2ids_map = {data['content']: int(token) for token, data in added_tokens_decoder.items() if data['special']}
-        for token in additional_special_tokens:
-            if token in token2ids_map:
-                special_vocab._set_special_token(token, token2ids_map[token])
-        special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
 
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
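
One subtle step worth calling out: for added tokens whose AddedToken entry is not marked normalized, the helper round-trips the text through encode/decode so the stored string matches what the tokenizer actually produces, since llama.cpp assumes CONTROL and USER_DEFINED tokens are pre-normalized. A hedged sketch of that round-trip in isolation, assuming "path/to/model" is a locally available Hugging Face checkpoint:

# Sketch of the encode/decode round-trip used in _set_vocab_interns1.
# "path/to/model" and the example token are placeholders, not from the diff.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)
token = "<example_added_token>"
normalized = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if normalized != token:
    print(f"{token!r} round-trips to {normalized!r}")  # mirrors the logger.info above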