@@ -2917,7 +2917,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
29172917 if "language_model." in name :
29182918 name = name .replace ("language_model." , "" ) # for InternVL
29192919 if name .startswith ("mlp" ) or name .startswith ("multi_modal_projector" ) \
2920- or name .startswith ("vision_model" ) or name .startswith ("audio_tower" ):
2920+ or name .startswith ("vision_model" ) or name .startswith ("audio_tower" ) \
2921+ or name .startswith ("model.vision_tower" ) or name .startswith ("model.multi_modal_projector" ):
29212922 # skip vision and audio tensors
29222923 return []
29232924 yield from super ().modify_tensors (data_torch , name , bid )
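
For reference, the two new prefixes feed the same chained `startswith` filter as the existing vision/audio skips. A minimal standalone sketch of the filter's behavior, outside the patch; the tensor names below are hypothetical, not taken from a real checkpoint:

```python
# Sketch of the prefix filter above; tensor names are hypothetical examples.
SKIP_PREFIXES = (
    "mlp", "multi_modal_projector", "vision_model", "audio_tower",
    "model.vision_tower", "model.multi_modal_projector",
)

def is_skipped(name: str) -> bool:
    # str.startswith accepts a tuple, mirroring the chained checks in modify_tensors
    return name.startswith(SKIP_PREFIXES)

print(is_skipped("model.vision_tower.encoder.layers.0.attn.qkv.weight"))  # True: dropped
print(is_skipped("model.layers.0.self_attn.q_proj.weight"))               # False: converted
```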
@@ -3589,6 +3590,82 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with intern-s1-mini
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
+        additional_special_tokens = []
+        if special_tokens_map_file.is_file():
+            with open(special_tokens_map_file, encoding='utf-8') as f:
+                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
+        tokenizer_cfg_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_cfg_file.is_file():
+            with open(tokenizer_cfg_file, encoding='utf-8') as f:
+                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
+                token2ids_map = {data['content']: int(token) for token, data in added_tokens_decoder.items() if data['special']}
+                for token in additional_special_tokens:
+                    if token in token2ids_map:
+                        special_vocab._set_special_token(token, token2ids_map[token])
+        special_vocab._set_special_token('eos', 151645)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
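
The core of `_set_vocab_interns1` above is the dense id walk: every id from 0 to vocab_size-1 must map to a token string in the GGUF token list, so holes in the tokenizer's vocab are filled with `[PAD{i}]` placeholders typed as UNUSED. A toy sketch of just that gap-filling step, with invented data:

```python
# Toy sketch of the gap-filling loop in _set_vocab_interns1: the GGUF token
# list must be dense, so missing ids become [PAD{i}] placeholders.
reverse_vocab = {0: "<s>", 1: "hello", 3: "world"}  # id 2 deliberately missing
vocab_size = 4

tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
print(tokens)  # ['<s>', 'hello', '[PAD2]', 'world']
```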
@@ -3605,10 +3682,7 @@ def set_vocab(self):
             self._set_vocab_interns1()
             return
 
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
+        super().set_vocab()
 
     def _set_vocab_interns1(self):
         tokens: list[str] = []
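
The tail of `_set_vocab_interns1` cross-references two tokenizer files: `special_tokens_map.json` lists the additional special tokens by string, while `added_tokens_decoder` in `tokenizer_config.json` maps ids to token metadata. A toy sketch of that resolution; ids and contents are invented except for the Qwen eos/bos ids 151645/151643 hard-coded in the patch:

```python
# Toy sketch of the special-token resolution at the end of _set_vocab_interns1.
# added_tokens_decoder is keyed by the token id (as a string) in tokenizer_config.json.
added_tokens_decoder = {
    "151643": {"content": "<|endoftext|>", "special": True},
    "151645": {"content": "<|im_end|>", "special": True},
    "92537": {"content": "<not_special>", "special": False},  # filtered out below
}
additional_special_tokens = ["<|im_end|>"]

# keep only special entries, inverting id -> content into content -> id
token2ids_map = {d["content"]: int(i) for i, d in added_tokens_decoder.items() if d["special"]}
for tok in additional_special_tokens:
    if tok in token2ids_map:
        print(tok, "->", token2ids_map[tok])  # <|im_end|> -> 151645
```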