@@ -1216,6 +1216,55 @@ def _try_set_pooling_type(self) -> None:
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
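+        # trust_remote_code is required here: the Intern-S1 checkpoints appear to ship custom tokenizer code.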
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
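+        # The config's vocab_size may exceed the tokenizer's (padded embeddings), but every real token id must fit.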
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
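+        # Walk the full id range; ids with no tokenizer entry become [PAD{i}] placeholders marked UNUSED.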
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens.
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
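+        # "gpt2" is the GGUF identifier for BPE-style tokenizers such as this one.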
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
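+        # 151643 is "<|endoftext|>" in the Qwen vocab; hard-coded because the config may not expose a BOS id.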
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
@@ -2932,7 +2981,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
29322981 if "language_model." in name :
29332982 name = name .replace ("language_model." , "" ) # for InternVL
29342983 if name .startswith ("mlp" ) or name .startswith ("multi_modal_projector" ) \
2935- or name .startswith ("vision_model" ) or name .startswith ("audio_tower" ):
2984+ or name .startswith ("vision_model" ) or name .startswith ("audio_tower" ) \
2985+ or name .startswith ("model.vision_tower" ) or name .startswith ("model.multi_modal_projector" ):
29362986 # skip vision and audio tensors
29372987 return []
29382988 yield from super ().modify_tensors (data_torch , name , bid )
@@ -3604,6 +3654,19 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
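+        # Reload the raw config here: self.hparams may have been narrowed (e.g. to a text_config),
+        # which would drop the top-level "architectures" entry needed to detect Intern-S1.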
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with Intern-S1-mini
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
@@ -3620,73 +3683,7 @@ def set_vocab(self):
             self._set_vocab_interns1()
             return
 
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding='utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding='utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-        token2ids_map = {data['content']: int(token) for token, data in added_tokens_decoder.items() if data['special']}
-        for token in additional_special_tokens:
-            if token in token2ids_map:
-                special_vocab._set_special_token(token, token2ids_map[token])
-        special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        super().set_vocab()
 
 
 @ModelBase.register("GPT2LMHeadModel")