@@ -1124,6 +1124,8 @@ class MmprojModel(ModelBase):
11241124 preprocessor_config : dict [str , Any ]
11251125 global_config : dict [str , Any ]
11261126
1127+ n_block_keys = ["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ]
1128+
11271129 has_vision_encoder : bool = True # by default
11281130 has_audio_encoder : bool = False
11291131
@@ -1160,8 +1162,7 @@ def __init__(self, *args, **kwargs):
11601162
11611163 # TODO @ngxson : this is a hack to support both vision and audio encoders
11621164 have_multiple_encoders = self .has_audio_encoder and self .has_vision_encoder
1163- self .block_count = 128 if have_multiple_encoders else \
1164- self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ], True )
1165+ self .block_count = 128 if have_multiple_encoders else self .find_hparam (self .n_block_keys , True )
11651166 self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .MMPROJ , self .block_count )
11661167
11671168 # load preprocessor config
@@ -1185,33 +1186,51 @@ def set_gguf_parameters(self):
11851186 self .gguf_writer .add_vision_projection_dim (self .n_embd_text )
11861187
11871188 # vision config
1188- self .gguf_writer .add_vision_image_size (self .find_hparam (["image_size" ]))
1189- self .gguf_writer .add_vision_patch_size (self .find_hparam (["patch_size" ]))
1190- self .gguf_writer .add_vision_embedding_length (self .find_hparam (["hidden_size" ]))
1191- self .gguf_writer .add_vision_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1192- self .gguf_writer .add_vision_block_count (self .block_count )
1193- self .gguf_writer .add_vision_head_count (self .find_hparam (["num_attention_heads" ]))
1189+ self .gguf_writer .add_vision_image_size (self .find_vparam (["image_size" ]))
1190+ self .gguf_writer .add_vision_patch_size (self .find_vparam (["patch_size" ]))
1191+ self .gguf_writer .add_vision_embedding_length (self .find_vparam (["hidden_size" ]))
1192+ self .gguf_writer .add_vision_feed_forward_length (self .find_vparam (["intermediate_size" ]))
1193+ self .gguf_writer .add_vision_block_count (self .find_vparam ( self . n_block_keys ) )
1194+ self .gguf_writer .add_vision_head_count (self .find_vparam (["num_attention_heads" ]))
11941195
11951196 # preprocessor config
11961197 self .gguf_writer .add_vision_image_mean (self .preprocessor_config ["image_mean" ])
11971198 self .gguf_writer .add_vision_image_std (self .preprocessor_config ["image_std" ])
11981199
1199- elif self .has_audio_encoder :
1200+ if self .has_audio_encoder :
12001201 self .gguf_writer .add_clip_has_audio_encoder (True )
12011202 self .gguf_writer .add_audio_projection_dim (self .n_embd_text )
12021203
12031204 # audio config
1204- self .gguf_writer .add_audio_embedding_length (self .find_hparam (["hidden_size" ]))
1205- self .gguf_writer .add_audio_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1206- self .gguf_writer .add_audio_block_count (self .block_count )
1207- self .gguf_writer .add_audio_head_count (self .find_hparam (["num_attention_heads" ]))
1205+ self .gguf_writer .add_audio_embedding_length (self .find_aparam (["hidden_size" ]))
1206+ self .gguf_writer .add_audio_feed_forward_length (self .find_aparam (["intermediate_size" ]))
1207+ self .gguf_writer .add_audio_block_count (self .find_aparam ( self . n_block_keys ) )
1208+ self .gguf_writer .add_audio_head_count (self .find_aparam (["num_attention_heads" ]))
12081209
12091210 else :
12101211 raise ValueError ("MmprojModel must have either vision or audio encoder" )
12111212
12121213 def write_vocab (self ):
12131214 raise ValueError ("MmprojModel does not support vocab writing" )
12141215
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
    """Look up a hyperparameter in the vision encoder config.

    The lookup itself is delegated to ``self._find_param`` with
    ``self.hparams_vision`` as the dictionary to search.

    Args:
        keys: Candidate key names, forwarded unchanged to ``_find_param``.
        optional: Forwarded unchanged to ``_find_param``.

    Returns:
        Whatever ``self._find_param`` returns for the vision config.

    Raises:
        AssertionError: If ``self.hparams_vision`` is ``None``.
    """
    # Hand ``keys`` over untouched: scanning it here (e.g. against
    # ``self.hparams``) would exhaust a one-shot iterator before the real
    # lookup in ``_find_param`` gets to see it.
    assert self.hparams_vision is not None
    return self._find_param(self.hparams_vision, keys, optional)
1220+
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
    """Look up a hyperparameter in the audio encoder config.

    The lookup itself is delegated to ``self._find_param`` with
    ``self.hparams_audio`` as the dictionary to search.

    Args:
        keys: Candidate key names, forwarded unchanged to ``_find_param``.
        optional: Forwarded unchanged to ``_find_param``.

    Returns:
        Whatever ``self._find_param`` returns for the audio config.

    Raises:
        AssertionError: If ``self.hparams_audio`` is ``None``.
    """
    # Hand ``keys`` over untouched: scanning it here (e.g. against
    # ``self.hparams``) would exhaust a one-shot iterator before the real
    # lookup in ``_find_param`` gets to see it.
    assert self.hparams_audio is not None
    return self._find_param(self.hparams_audio, keys, optional)
1225+
1226+ def _find_param (self , obj : dict [str , Any ], keys : Iterable [str ], optional : bool = False ) -> Any :
1227+ key = next ((k for k in keys if k in obj ), None )
1228+ if key is not None :
1229+ return obj [key ]
1230+ if optional :
1231+ return None
1232+ raise KeyError (f"could not find any of: { keys } " )
1233+
12151234
12161235@ModelBase .register ("GPTNeoXForCausalLM" )
12171236class GPTNeoXModel (TextModel ):
@@ -2743,9 +2762,9 @@ def set_gguf_parameters(self):
27432762 self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN2VL )
27442763 elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni' :
27452764 if model_type == 'qwen2_5_omni' :
2746- self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
2747- else :
27482765 self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25O )
2766+ else :
2767+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
27492768 self .gguf_writer .add_vision_use_silu (True )
27502769 # find n_wa_pattern (window attention pattern)
27512770 fullatt_block_indexes = hparams .get ("fullatt_block_indexes" )
@@ -2808,6 +2827,19 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
28082827 has_vision_encoder = True
28092828 has_audio_encoder = True
28102829
def __init__(self, *args, **kwargs):
    """Initialise via the parent class, then alias audio hparam names.

    After the parent constructor runs, three entries of
    ``self.hparams_audio`` are copied under the key names
    ``hidden_size``, ``intermediate_size`` and ``num_attention_heads``.

    Raises:
        AssertionError: If ``self.hparams_audio`` is ``None``.
        KeyError: If one of the source keys is missing from the audio config.
    """
    super().__init__(*args, **kwargs)
    audio_cfg = self.hparams_audio
    assert audio_cfg is not None
    # (alias written, source key read) — applied in this order.
    aliases = (
        ("hidden_size", "d_model"),
        ("intermediate_size", "encoder_ffn_dim"),
        ("num_attention_heads", "encoder_attention_heads"),
    )
    for alias, source in aliases:
        audio_cfg[alias] = audio_cfg[source]
2836+
def set_gguf_parameters(self):
    """Write the parent's GGUF parameters, then two audio-specific ones.

    Adds the mel-bin count (``num_mel_bins``, required) and the attention
    layer-norm epsilon (``layer_norm_eps``, falling back to ``1e-5`` when
    absent), both read from ``self.hparams_audio``.

    Raises:
        AssertionError: If ``self.hparams_audio`` is ``None``.
        KeyError: If ``num_mel_bins`` is missing from the audio config.
    """
    super().set_gguf_parameters()
    audio_cfg = self.hparams_audio
    assert audio_cfg is not None
    writer = self.gguf_writer
    mel_bins = audio_cfg["num_mel_bins"]
    writer.add_audio_num_mel_bins(mel_bins)
    layernorm_eps = audio_cfg.get("layer_norm_eps", 1e-5)
    writer.add_audio_attention_layernorm_eps(layernorm_eps)
2842+
28112843 def get_vision_config (self ) -> dict [str , Any ] | None :
28122844 return self .global_config ["thinker_config" ].get ("vision_config" )
28132845
0 commit comments