@@ -432,6 +432,9 @@ def load_hparams(dir_model: Path):
432432 if "llm_config" in config :
433433 # rename for InternVL
434434 config ["text_config" ] = config ["llm_config" ]
435+ if "thinker_config" in config :
436+ # rename for Qwen2.5-Omni
437+ config ["text_config" ] = config ["thinker_config" ]["text_config" ]
435438 return config
436439
437440 @classmethod
@@ -1124,15 +1127,16 @@ class MmprojModel(ModelBase):
11241127 has_vision_encoder : bool = True # by default
11251128 has_audio_encoder : bool = False
11261129
1130+ # for models having multiple encoders, we need to separate their hparams
1131+ hparams_vision : dict [str , Any ] | None = None
1132+ hparams_audio : dict [str , Any ] | None = None
1133+
11271134 def __init__ (self , * args , ** kwargs ):
11281135 super ().__init__ (* args , ** kwargs )
11291136
11301137 if self .model_arch != gguf .MODEL_ARCH .MMPROJ :
11311138 raise TypeError ("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ" )
11321139
1133- if self .has_vision_encoder and self .has_audio_encoder :
1134- raise NotImplementedError ("both vision + audio not supported yet" )
1135-
11361140 # get n_embd of the text model
11371141 if "text_config" not in self .hparams :
11381142 self .hparams ["text_config" ] = {}
@@ -1143,22 +1147,33 @@ def __init__(self, *args, **kwargs):
11431147 assert self .n_embd_text > 0 , "n_embd not found in hparams"
11441148
11451149 # move vision config to the top level, while preserving the original hparams in global_config
1146- self .global_config = self .hparams
1147-
1148- if "vision_config" in self .hparams :
1149- self .hparams = self .hparams ["vision_config" ]
1150- elif "audio_config" in self .hparams :
1151- self .hparams = self .hparams ["audio_config" ]
1152- else :
1150+ import copy
1151+ self .global_config = copy .deepcopy (self .hparams )
1152+ self .hparams_vision = self .get_vision_config ()
1153+ self .hparams_audio = self .get_audio_config ()
1154+
1155+ if self .hparams_vision is None and self .hparams_audio is None :
11531156 raise ValueError ("vision_config / audio_config not found in hparams" )
11541157
1155- self .block_count = self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ])
1158+ # for compat with vision-only models
1159+ self .hparams = self .hparams_vision or self .hparams_audio or self .hparams
1160+
1161+ # TODO @ngxson : this is a hack to support both vision and audio encoders
1162+ have_multiple_encoders = self .has_audio_encoder and self .has_vision_encoder
1163+ self .block_count = 128 if have_multiple_encoders else \
1164+ self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ], True )
11561165 self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .MMPROJ , self .block_count )
11571166
11581167 # load preprocessor config
11591168 with open (self .dir_model / "preprocessor_config.json" , "r" , encoding = "utf-8" ) as f :
11601169 self .preprocessor_config = json .load (f )
11611170
1171+ def get_vision_config (self ) -> dict [str , Any ] | None :
1172+ return self .global_config .get ("vision_config" )
1173+
1174+ def get_audio_config (self ) -> dict [str , Any ] | None :
1175+ return self .global_config .get ("audio_config" )
1176+
11621177 def set_type (self ):
11631178 self .gguf_writer .add_type (gguf .GGUFType .MMPROJ )
11641179
@@ -2674,7 +2689,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26742689 yield from super ().modify_tensors (data_torch , name , bid )
26752690
26762691
2677- @ModelBase .register ("Qwen2VLModel" , "Qwen2VLForConditionalGeneration" , "Qwen2_5_VLForConditionalGeneration" )
2692+ @ModelBase .register (
2693+ "Qwen2VLModel" ,
2694+ "Qwen2VLForConditionalGeneration" ,
2695+ "Qwen2_5_VLForConditionalGeneration" ,
2696+ "Qwen2_5OmniModel" ,
2697+ )
26782698class Qwen2VLModel (TextModel ):
26792699 model_arch = gguf .MODEL_ARCH .QWEN2VL
26802700
@@ -2692,8 +2712,11 @@ def set_vocab(self):
26922712
26932713 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
26942714 del bid # unused
2695- if name .startswith ("visual." ):
2696- # skip visual tensors
2715+ if name .startswith ("thinker." ):
2716+ name = name .replace ("thinker." , "" )
2717+ if name .startswith ("visual" ) or name .startswith ("audio" ) or \
2718+ name .startswith ("talker" ) or name .startswith ("token2wav" ):
2719+ # skip multimodal tensors
26972720 return []
26982721 return [(self .map_tensor_name (name ), data_torch )]
26992722
@@ -2702,21 +2725,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27022725class Qwen2VLVisionModel (MmprojModel ):
27032726 def __init__ (self , * args , ** kwargs ):
27042727 super ().__init__ (* args , ** kwargs )
2705- self .hparams ["image_size" ] = self .hparams .get ("image_size" , 560 )
2728+ assert self .hparams_vision is not None
2729+ self .hparams_vision ["image_size" ] = self .hparams_vision .get ("image_size" , 560 )
27062730 # rename config.json values
2707- self .hparams ["num_attention_heads" ] = self .hparams .get ("num_heads" )
2708- self .hparams ["num_hidden_layers" ] = self .hparams .get ("depth" )
2709- if "embed_dim" in self .hparams : # qwen2vl
2710- self .hparams ["intermediate_size" ] = self .hparams .get ("hidden_size" )
2711- self .hparams ["hidden_size" ] = self .hparams .get ("embed_dim" )
2731+ self .hparams_vision ["num_attention_heads" ] = self .hparams_vision .get ("num_heads" )
2732+ self .hparams_vision ["num_hidden_layers" ] = self .hparams_vision .get ("depth" )
2733+ if "embed_dim" in self .hparams_vision : # qwen2vl
2734+ self .hparams_vision ["intermediate_size" ] = self .hparams_vision .get ("hidden_size" )
2735+ self .hparams_vision ["hidden_size" ] = self .hparams_vision .get ("embed_dim" )
27122736
27132737 def set_gguf_parameters (self ):
27142738 super ().set_gguf_parameters ()
2715- hparams = self .hparams
2716- if self .global_config ['model_type' ] == 'qwen2_vl' :
2739+ assert self .hparams_vision is not None
2740+ hparams = self .hparams_vision
2741+ model_type = self .global_config ['model_type' ]
2742+ if model_type == 'qwen2_vl' :
27172743 self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN2VL )
2718- elif self .global_config ['model_type' ] == 'qwen2_5_vl' :
2719- self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
2744+ elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni' :
2745+ if model_type == 'qwen2_5_omni' :
2746+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25O )
2747+ else :
2748+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
27202749 self .gguf_writer .add_vision_use_silu (True )
27212750 # find n_wa_pattern (window attention pattern)
27222751 fullatt_block_indexes = hparams .get ("fullatt_block_indexes" )
@@ -2774,6 +2803,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27742803 return [] # skip other tensors
27752804
27762805
2806+ @ModelBase .register ("Qwen2_5OmniModel" )
2807+ class Qwen25OmniModel (Qwen2VLVisionModel ):
2808+ has_vision_encoder = True
2809+ has_audio_encoder = True
2810+
2811+ def get_vision_config (self ) -> dict [str , Any ] | None :
2812+ return self .global_config ["thinker_config" ].get ("vision_config" )
2813+
2814+ def get_audio_config (self ) -> dict [str , Any ] | None :
2815+ return self .global_config ["thinker_config" ].get ("audio_config" )
2816+
2817+ def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
2818+ if name .startswith ("thinker." ):
2819+ name = name .replace ("thinker." , "" )
2820+
2821+ if name .startswith ("audio_tower" ):
2822+ # process audio tensors
2823+ if "audio_bos_eos_token" in name :
2824+ # this tensor is left unused in transformers code
2825+ # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
2826+ return []
2827+ return [(self .map_tensor_name (name ), data_torch )]
2828+
2829+ return super ().modify_tensors (data_torch , name , bid )
2830+
2831+
27772832@ModelBase .register ("InternVisionModel" )
27782833class InternVisionModel (MmprojModel ):
27792834 def set_gguf_parameters (self ):
0 commit comments