@@ -1900,6 +1900,7 @@ def prepare_tensors(self):
     "MixtralForCausalLM",
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
+    "VoxtralForConditionalGeneration",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1912,6 +1913,11 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def set_vocab(self):
+        path_tekken_json = self.dir_model / "tekken.json"
+        path_tokenizer_json = self.dir_model / "tokenizer.json"
+        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
+            return self.set_vocab_tekken()
+
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
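
The dispatch above prefers Mistral's native `tekken.json` only when the Hugging Face `tokenizer.json` is absent, so HF-format checkpoints keep their existing vocab path. A minimal sketch of the same rule in isolation (the directory path is hypothetical, for illustration only):

```python
from pathlib import Path

# hypothetical checkpoint directory
dir_model = Path("models/Voxtral-Mini-3B-2507")

# prefer the Mistral-native tokenizer only when no HF tokenizer.json exists
if (dir_model / "tekken.json").is_file() and not (dir_model / "tokenizer.json").is_file():
    print("using tekken vocab")
else:
    print("using sentencepiece/BPE fallback")
```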
@@ -1944,6 +1950,52 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+    def set_vocab_tekken(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+        self.gguf_writer.add_chat_template(template)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
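
After conversion (e.g. `python convert_hf_to_gguf.py <model_dir> --outfile model.gguf`), everything `set_vocab_tekken()` wrote lands in the standard `tokenizer.ggml.*` metadata keys, so a converted file can be spot-checked with gguf-py's reader. A minimal sketch, assuming a hypothetical output filename; the `parts`/`data` indexing is the low-level `GGUFReader` field API:

```python
from gguf import GGUFReader

# hypothetical output file from convert_hf_to_gguf.py
reader = GGUFReader("Voxtral-Mini-3B-2507-F16.gguf")

# string fields are stored as raw byte parts; decode the first data element
pre = reader.fields["tokenizer.ggml.pre"]
print(pre.parts[pre.data[0]].tobytes().decode("utf-8"))  # expect: tekken

# scalar fields follow the same pattern
bos = reader.fields["tokenizer.ggml.bos_token_id"]
print(bos.parts[bos.data[0]][0])  # BOS id written by add_bos_token_id()
```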
@@ -1971,12 +2023,13 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name \
+        is_multimodal_tensor = "vision_tower" in name \
             or "vision_model" in name \
+            or "audio_tower" in name \
             or "model.connector" in name \
             or "multi_modal_projector" in name
 
-        if is_vision_tensor:
+        if is_multimodal_tensor:
             return [] # skip vision tensors
         elif self.hf_arch == "LlamaModel":
             name = "model." + name
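
Renaming `is_vision_tensor` to `is_multimodal_tensor` matters here: the same skip list now also catches Voxtral's `audio_tower` weights, so only the text-model tensors flow on to the LLaMA tensor mapping. The predicate in isolation, with made-up tensor names for illustration:

```python
# substring filters mirrored from modify_tensors() above
MULTIMODAL_MARKERS = (
    "vision_tower",
    "vision_model",
    "audio_tower",
    "model.connector",
    "multi_modal_projector",
)

def is_multimodal_tensor(name: str) -> bool:
    return any(marker in name for marker in MULTIMODAL_MARKERS)

# hypothetical tensor names, for illustration only
assert is_multimodal_tensor("audio_tower.layers.0.self_attn.q_proj.weight")
assert not is_multimodal_tensor("model.layers.0.self_attn.q_proj.weight")
```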
@@ -7231,9 +7284,10 @@ class WhisperEncoderModel(MmprojModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["hidden_size"] = self.hparams["d_model"]
-        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
-        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
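
The new guard keeps the Whisper-style key remapping (`d_model`, `encoder_ffn_dim`, `encoder_attention_heads`) from clobbering audio configs that already carry the generic `hidden_size`/`intermediate_size` names, as Voxtral's does. A worked sketch of the fallback with Whisper-large-style dimensions (values assumed here, not taken from the diff):

```python
# Whisper-style encoder config; the concrete numbers are assumed for illustration
hparams = {
    "d_model": 1280,
    "encoder_ffn_dim": 5120,
    "encoder_attention_heads": 20,
}

# same fallback as __init__ above: only remap when the generic keys are missing
if "hidden_size" not in hparams and "intermediate_size" not in hparams:
    hparams["hidden_size"] = hparams["d_model"]
    hparams["intermediate_size"] = hparams["encoder_ffn_dim"]
    hparams["num_attention_heads"] = hparams["encoder_attention_heads"]

print(hparams["hidden_size"], hparams["intermediate_size"])  # 1280 5120
```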
@@ -7272,9 +7326,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
 
+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False  # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
+        self.gguf_writer.add_audio_stack_factor(4)  # == intermediate_size // hidden_size
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
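
Unlike Ultravox, which reads `stack_factor` from its config, Voxtral hard-codes the audio stack factor to 4; per the inline comment this equals `intermediate_size // hidden_size` for the Whisper-large-shaped encoder. The arithmetic, with the dimensions assumed in the sketch above:

```python
# assumed Whisper-large-style encoder dimensions (see the earlier sketch)
hidden_size = 1280
intermediate_size = 5120

# the value hard-coded in VoxtralWhisperEncoderModel.set_gguf_parameters()
assert intermediate_size // hidden_size == 4
```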