@@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):
4545
class ModelType(IntEnum):
    """Category of model a converter class is registered under.

    The members are the keys of ``ModelBase._model_classes``, which keeps one
    name -> class registry per category.
    """

    # Text model converters.
    TEXT = 1
    # Multimodal projector converters (subclasses of ``MmprojModel``).
    MMPROJ = 2
4949
5050
5151AnyModel = TypeVar ("AnyModel" , bound = "type[ModelBase]" )
@@ -54,7 +54,7 @@ class ModelType(IntEnum):
5454class ModelBase :
5555 _model_classes : dict [ModelType , dict [str , type [ModelBase ]]] = {
5656 ModelType .TEXT : {},
57- ModelType .VISION : {},
57+ ModelType .MMPROJ : {},
5858 }
5959
6060 dir_model : Path
@@ -88,7 +88,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
8888 small_first_shard : bool = False , hparams : dict [str , Any ] | None = None , remote_hf_model_id : str | None = None ):
8989 if type (self ) is ModelBase or \
9090 type (self ) is TextModel or \
91- type (self ) is VisionModel :
91+ type (self ) is MmprojModel :
9292 raise TypeError (f"{ type (self ).__name__ !r} should not be directly instantiated" )
9393
9494 self .dir_model = dir_model
@@ -439,7 +439,7 @@ def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
439439 assert names
440440
441441 def func (modelcls : AnyModel ) -> AnyModel :
442- model_type = ModelType .VISION if modelcls .model_arch == gguf .MODEL_ARCH .CLIP_VISION else ModelType .TEXT
442+ model_type = ModelType .MMPROJ if modelcls .model_arch == gguf .MODEL_ARCH .MMPROJ else ModelType .TEXT
443443 for name in names :
444444 cls ._model_classes [model_type ][name ] = modelcls
445445 return modelcls
@@ -1115,24 +1115,27 @@ def _try_set_pooling_type(self) -> None:
11151115 self .gguf_writer .add_pooling_type (pooling_type )
11161116
11171117
1118- class VisionModel (ModelBase ):
1119- model_type = ModelType .VISION
1120- model_arch = gguf .MODEL_ARCH .CLIP_VISION
1118+ class MmprojModel (ModelBase ):
1119+ model_type = ModelType .MMPROJ
1120+ model_arch = gguf .MODEL_ARCH .MMPROJ
11211121 preprocessor_config : dict [str , Any ]
11221122 global_config : dict [str , Any ]
1123- has_vision_encoder : bool = True
1123+
1124+ has_vision_encoder : bool = True # by default
11241125 has_audio_encoder : bool = False
11251126
11261127 def __init__ (self , * args , ** kwargs ):
11271128 super ().__init__ (* args , ** kwargs )
11281129
1129- if self .model_arch != gguf .MODEL_ARCH .CLIP_VISION :
1130- raise TypeError ("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION" )
1130+ if self .model_arch != gguf .MODEL_ARCH .MMPROJ :
1131+ raise TypeError ("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ" )
1132+
1133+ if self .has_vision_encoder and self .has_audio_encoder :
1134+ raise NotImplementedError ("both vision + audio not supported yet" )
11311135
11321136 # get n_embd of the text model
11331137 if "text_config" not in self .hparams :
11341138 self .hparams ["text_config" ] = {}
1135- # TODO @ngxson : separate VisionModel and AudioModel
11361139 if "audio_config" not in self .hparams :
11371140 self .hparams ["audio_config" ] = {}
11381141 text_config = {** self .hparams , ** self .hparams ["text_config" ]}
@@ -1150,37 +1153,49 @@ def __init__(self, *args, **kwargs):
11501153 raise ValueError ("vision_config / audio_config not found in hparams" )
11511154
11521155 self .block_count = self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ])
1153- self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .CLIP_VISION , self .block_count )
1156+ self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .MMPROJ , self .block_count )
11541157
11551158 # load preprocessor config
11561159 with open (self .dir_model / "preprocessor_config.json" , "r" , encoding = "utf-8" ) as f :
11571160 self .preprocessor_config = json .load (f )
11581161
11591162 def set_type (self ):
1160- self .gguf_writer .add_type (gguf .GGUFType .CLIP_VISION )
1163+ self .gguf_writer .add_type (gguf .GGUFType .MMPROJ )
11611164
11621165 def set_gguf_parameters (self ):
11631166 self .gguf_writer .add_file_type (self .ftype )
1164- self . gguf_writer . add_vision_projection_dim ( self . n_embd_text )
1167+
11651168 if self .has_vision_encoder :
1166- self .gguf_writer .add_vision_has_vision_encoder (True )
1167- if self .has_audio_encoder :
1168- self .gguf_writer .add_vision_has_audio_encoder (True )
1169-
1170- # vision config
1171- self .gguf_writer .add_vision_image_size (self .find_hparam (["image_size" ]))
1172- self .gguf_writer .add_vision_patch_size (self .find_hparam (["patch_size" ]))
1173- self .gguf_writer .add_vision_embedding_length (self .find_hparam (["hidden_size" ]))
1174- self .gguf_writer .add_vision_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1175- self .gguf_writer .add_vision_block_count (self .block_count )
1176- self .gguf_writer .add_vision_head_count (self .find_hparam (["num_attention_heads" ]))
1177-
1178- # preprocessor config
1179- self .gguf_writer .add_vision_image_mean (self .preprocessor_config ["image_mean" ])
1180- self .gguf_writer .add_vision_image_std (self .preprocessor_config ["image_std" ])
1169+ self .gguf_writer .add_clip_has_vision_encoder (True )
1170+ self .gguf_writer .add_vision_projection_dim (self .n_embd_text )
1171+
1172+ # vision config
1173+ self .gguf_writer .add_vision_image_size (self .find_hparam (["image_size" ]))
1174+ self .gguf_writer .add_vision_patch_size (self .find_hparam (["patch_size" ]))
1175+ self .gguf_writer .add_vision_embedding_length (self .find_hparam (["hidden_size" ]))
1176+ self .gguf_writer .add_vision_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1177+ self .gguf_writer .add_vision_block_count (self .block_count )
1178+ self .gguf_writer .add_vision_head_count (self .find_hparam (["num_attention_heads" ]))
1179+
1180+ # preprocessor config
1181+ self .gguf_writer .add_vision_image_mean (self .preprocessor_config ["image_mean" ])
1182+ self .gguf_writer .add_vision_image_std (self .preprocessor_config ["image_std" ])
1183+
1184+ elif self .has_audio_encoder :
1185+ self .gguf_writer .add_clip_has_audio_encoder (True )
1186+ self .gguf_writer .add_audio_projection_dim (self .n_embd_text )
1187+
1188+ # audio config
1189+ self .gguf_writer .add_audio_embedding_length (self .find_hparam (["hidden_size" ]))
1190+ self .gguf_writer .add_audio_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1191+ self .gguf_writer .add_audio_block_count (self .block_count )
1192+ self .gguf_writer .add_audio_head_count (self .find_hparam (["num_attention_heads" ]))
1193+
1194+ else :
1195+ raise ValueError ("MmprojModel must have either vision or audio encoder" )
11811196
11821197 def write_vocab (self ):
1183- raise ValueError ("VisionModel does not support vocab writing" )
1198+ raise ValueError ("MmprojModel does not support vocab writing" )
11841199
11851200
11861201@ModelBase .register ("GPTNeoXForCausalLM" )
@@ -1964,7 +1979,7 @@ def prepare_tensors(self):
19641979 "LlavaForConditionalGeneration" , # pixtral
19651980 "Mistral3ForConditionalGeneration" , # mistral small 3.1
19661981)
1967- class LlavaVisionModel (VisionModel ):
1982+ class LlavaVisionModel (MmprojModel ):
19681983 img_break_tok_id = - 1
19691984
19701985 def __init__ (self , * args , ** kwargs ):
@@ -1990,7 +2005,7 @@ def set_gguf_parameters(self):
19902005 super ().set_gguf_parameters ()
19912006 hparams = self .hparams
19922007 if hparams ["model_type" ] == "pixtral" :
1993- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .PIXTRAL )
2008+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .PIXTRAL )
19942009 self .gguf_writer .add_vision_attention_layernorm_eps (hparams ["layer_norm_eps" ])
19952010
19962011 # hidden_act
@@ -2029,7 +2044,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
20292044
20302045
20312046@ModelBase .register ("Idefics3ForConditionalGeneration" , "SmolVLMForConditionalGeneration" )
2032- class SmolVLMModel (VisionModel ):
2047+ class SmolVLMModel (MmprojModel ):
20332048 def __init__ (self , * args , ** kwargs ):
20342049 super ().__init__ (* args , ** kwargs )
20352050 if self .hparams ["model_type" ] == "smolvlm_vision" :
@@ -2041,7 +2056,7 @@ def __init__(self, *args, **kwargs):
20412056
20422057 def set_gguf_parameters (self ):
20432058 super ().set_gguf_parameters ()
2044- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .IDEFICS3 )
2059+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .IDEFICS3 )
20452060 self .gguf_writer .add_vision_attention_layernorm_eps (self .hparams .get ("layer_norm_eps" , 1e-5 ))
20462061 self .gguf_writer .add_vision_projector_scale_factor (self .global_config .get ("scale_factor" , 2 ))
20472062 self .gguf_writer .add_vision_use_gelu (True )
@@ -2107,10 +2122,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
21072122
21082123
21092124@ModelBase .register ("Llama4ForConditionalGeneration" )
2110- class Llama4VisionModel (VisionModel ):
2125+ class Llama4VisionModel (MmprojModel ):
21112126 def set_gguf_parameters (self ):
21122127 super ().set_gguf_parameters ()
2113- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .LLAMA4 )
2128+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .LLAMA4 )
21142129 self .gguf_writer .add_vision_attention_layernorm_eps (self .hparams ["norm_eps" ])
21152130 self .gguf_writer .add_vision_projector_scale_factor (int (1.0 / self .hparams ["pixel_shuffle_ratio" ]))
21162131 assert self .hparams ["hidden_act" ] == "gelu"
@@ -2683,7 +2698,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26832698
26842699
26852700@ModelBase .register ("Qwen2VLModel" , "Qwen2VLForConditionalGeneration" , "Qwen2_5_VLForConditionalGeneration" )
2686- class Qwen2VLVisionModel (VisionModel ):
2701+ class Qwen2VLVisionModel (MmprojModel ):
26872702 def __init__ (self , * args , ** kwargs ):
26882703 super ().__init__ (* args , ** kwargs )
26892704 self .hparams ["image_size" ] = self .hparams .get ("image_size" , 560 )
@@ -2698,9 +2713,9 @@ def set_gguf_parameters(self):
26982713 super ().set_gguf_parameters ()
26992714 hparams = self .hparams
27002715 if self .global_config ['model_type' ] == 'qwen2_vl' :
2701- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .QWEN2VL )
2716+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN2VL )
27022717 elif self .global_config ['model_type' ] == 'qwen2_5_vl' :
2703- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .QWEN25VL )
2718+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
27042719 self .gguf_writer .add_vision_use_silu (True )
27052720 # find n_wa_pattern (window attention pattern)
27062721 fullatt_block_indexes = hparams .get ("fullatt_block_indexes" )
@@ -2759,11 +2774,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27592774
27602775
27612776@ModelBase .register ("InternVisionModel" )
2762- class InternVisionModel (VisionModel ):
2777+ class InternVisionModel (MmprojModel ):
27632778 def set_gguf_parameters (self ):
27642779 super ().set_gguf_parameters ()
27652780 hparams = self .hparams
2766- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .INTERNVL )
2781+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .INTERNVL )
27672782 self .gguf_writer .add_vision_attention_layernorm_eps (hparams ["layer_norm_eps" ])
27682783 # hidden_act
27692784 if hparams ["hidden_act" ] == "silu" :
@@ -4021,11 +4036,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
40214036
40224037
40234038@ModelBase .register ("Gemma3ForConditionalGeneration" )
4024- class Gemma3VisionModel (VisionModel ):
4039+ class Gemma3VisionModel (MmprojModel ):
40254040 def set_gguf_parameters (self ):
40264041 super ().set_gguf_parameters ()
40274042 hparams = self .hparams
4028- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .GEMMA3 )
4043+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .GEMMA3 )
40294044 # default values below are taken from HF transformers code
40304045 self .gguf_writer .add_vision_attention_layernorm_eps (hparams .get ("layer_norm_eps" , 1e-6 ))
40314046 self .gguf_writer .add_vision_use_gelu (True )
@@ -5982,24 +5997,22 @@ def __init__(self, *args, **kwargs):
59825997
59835998
59845999@ModelBase .register ("UltravoxModel" )
5985- class UltravoxAudioModel (VisionModel ):
6000+ class UltravoxAudioModel (MmprojModel ):
6001+ has_vision_encoder = False # no vision encoder
6002+ has_audio_encoder = True
6003+
59866004 def __init__ (self , * args , ** kwargs ):
59876005 super ().__init__ (* args , ** kwargs )
5988- self .has_vision_encoder = False
5989- self .has_audio_encoder = True
5990- self .hparams ["image_size" ] = self .hparams ["num_mel_bins" ]
5991- self .hparams ["patch_size" ] = self .hparams ["num_mel_bins" ]
59926006 self .hparams ["hidden_size" ] = self .hparams ["d_model" ]
59936007 self .hparams ["intermediate_size" ] = self .hparams ["encoder_ffn_dim" ]
59946008 self .hparams ["num_attention_heads" ] = self .hparams ["encoder_attention_heads" ]
5995- self .preprocessor_config ["image_mean" ] = [0 , 0 , 0 ]
5996- self .preprocessor_config ["image_std" ] = [0 , 0 , 0 ]
59976009
59986010 def set_gguf_parameters (self ):
59996011 super ().set_gguf_parameters ()
6000- self .gguf_writer .add_vision_projector_type (gguf .VisionProjectorType .ULTRAVOX )
6001- self .gguf_writer .add_vision_attention_layernorm_eps (self .hparams .get ("layer_norm_eps" , 1e-5 ))
6002- self .gguf_writer .add_uint32 (gguf .Keys .ClipVision .Projector .STACK_FACTOR , self .global_config ["stack_factor" ])
6012+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .ULTRAVOX )
6013+ self .gguf_writer .add_audio_num_mel_bins (self .hparams ["num_mel_bins" ])
6014+ self .gguf_writer .add_audio_attention_layernorm_eps (self .hparams .get ("layer_norm_eps" , 1e-5 ))
6015+ self .gguf_writer .add_audio_stack_factor (self .global_config ["stack_factor" ])
60036016
60046017 def tensor_force_quant (self , name , new_name , bid , n_dims ):
60056018 del bid , new_name , n_dims # unused
@@ -6195,13 +6208,15 @@ def split_str_to_n_bytes(split_str: str) -> int:
61956208
61966209
def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
    """Select the architecture name for ``model_type`` from the model hparams.

    The sub-config matching the requested model type (``text_config`` for
    ``ModelType.TEXT``, ``vision_config`` for ``ModelType.MMPROJ``) is checked
    first; if it declares a non-None ``architectures`` list, its first entry
    is returned. Otherwise the first entry of the top-level ``architectures``
    list is used.

    The top-level list is only read when it is actually needed, so a config
    that declares ``architectures`` solely in the selected sub-config no
    longer fails.

    Args:
        hparams: Parsed model configuration.
        model_type: Which kind of model is being converted.

    Returns:
        The selected architecture name.

    Raises:
        KeyError: if neither the selected sub-config nor the top level
            provides ``architectures``.
        IndexError: if the ``architectures`` list that gets used is empty.
    """
    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
    # maybe we should fallback to text model's arch in that case, since not many models have both
    if model_type == ModelType.TEXT:
        sub_config = hparams.get("text_config", {})
    elif model_type == ModelType.MMPROJ:
        sub_config = hparams.get("vision_config", {})
    else:
        sub_config = {}

    # if "architectures" is found in the sub-config, use that instead
    sub_architectures = sub_config.get("architectures")
    if sub_architectures is not None:
        return sub_architectures[0]

    return hparams["architectures"][0]
62076222
@@ -6264,7 +6279,7 @@ def main() -> None:
62646279
62656280 with torch .inference_mode ():
62666281 output_type = ftype_map [args .outtype ]
6267- model_type = ModelType .VISION if args .mmproj else ModelType .TEXT
6282+ model_type = ModelType .MMPROJ if args .mmproj else ModelType .TEXT
62686283 hparams = ModelBase .load_hparams (dir_model )
62696284 model_architecture = get_model_architecture (hparams , model_type )
62706285 logger .info (f"Model architecture: { model_architecture } " )
0 commit comments