@@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            hparams = json.load(f)
+        architectures = hparams.get("architectures")
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
+        if architectures is not None:
+            # preserve "architectures" from root level config
+            hparams["architectures"] = architectures
         return hparams
 
     @classmethod
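
Note (editor): the restore above matters because some multimodal configs nest a second `architectures` list inside `text_config`, which would otherwise overwrite the root value during the dict merge. A minimal sketch of the failure mode, with illustrative values:

```python
root = {
    "architectures": ["SmolVLMForConditionalGeneration"],
    "text_config": {"architectures": ["VLlama3ForCausalLM"], "hidden_size": 2048},
}
merged = {**root, **root["text_config"]}
assert merged["architectures"] == ["VLlama3ForCausalLM"]  # root value clobbered
merged["architectures"] = root["architectures"]           # the patch restores it
```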
@@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
 class VisionModel(ModelBase):
     model_arch = gguf.MODEL_ARCH.CLIP_VISION
     n_text_embd = 0
+    preprocessor_config: dict[str, Any]
+    global_config: dict[str, Any]
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs):
 
         if "vision_config" not in self.hparams:
             raise ValueError("vision_config not found in hparams")
-        # move vision config to the top level
+        # move vision config to the top level, while preserving the original hparams in global_config
+        self.global_config = self.hparams
         self.hparams = self.hparams["vision_config"]
 
+        # load preprocessor config
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
+
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text)
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
+        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+        self.gguf_writer.add_vision_has_vision_encoder(True)
 
         # vision config
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+        # preprocessor config
+        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
 
     def write_vocab(self):
         raise ValueError("VisionModel does not support vocab writing")
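
Note (editor): `image_mean` and `image_std` are read straight from the model's `preprocessor_config.json` (the std line originally reused the `image_mean` key, fixed above). For a SigLIP-style vision tower that file typically looks like this, shown as the parsed Python dict with illustrative values:

```python
preprocessor_config = {
    "image_mean": [0.5, 0.5, 0.5],  # written via add_vision_image_mean
    "image_std":  [0.5, 0.5, 0.5],  # written via add_vision_image_std
}
```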
@@ -1703,11 +1718,23 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@ModelBase.register(
+    "LLaMAForCausalLM",
+    "LlamaForCausalLM",
+    "MistralForCausalLM",
+    "MixtralForCausalLM",
+    "Idefics3ForConditionalGeneration",
+    "SmolVLMForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
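
Note (editor): `Idefics3ForConditionalGeneration` and `SmolVLMForConditionalGeneration` are now registered twice: here on `LlamaModel` for the text weights, and below on `SmolVLMModel` for the vision weights. Presumably the converter disambiguates by the requested output (text GGUF vs. mmproj); a minimal sketch of that registry idea, with assumed names:

```python
_registry: dict[tuple[str, str], type] = {}

def register(*archs: str, kind: str = "text"):
    # one architecture string can map to a different converter per output kind
    def deco(cls):
        for arch in archs:
            _registry[(arch, kind)] = cls
        return cls
    return deco
```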
@@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return []  # skip vision tensors
+        elif name.startswith("model.text_model"):
+            name = name.replace("text_model.", "")  # for SmolVLM
 
         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
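
Note (editor): concretely, the new branch routes checkpoint tensors like this (tensor names illustrative of the SmolVLM layout):

```python
# dropped here, handled by SmolVLMModel instead:
#   "model.vision_model.encoder.layers.0.self_attn.q_proj.weight"
#   "model.connector.modality_projection.proj.weight"
# renamed so the usual LLaMA tensor mapping applies:
name = "model.text_model.layers.0.self_attn.q_proj.weight"
name = name.replace("text_model.", "")
# -> "model.layers.0.self_attn.q_proj.weight"
```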
@@ -1852,6 +1885,41 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
+class SmolVLMModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing some keys in config.json
+        # default values are taken from transformers code
+        if self.hparams["model_type"] == "smolvlm_vision":
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
 @ModelBase.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
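
Note (editor): to my understanding (not stated in the patch), the Idefics3 connector shrinks the image-token count with a pixel shuffle controlled by `scale_factor` before projecting into the text embedding space. Rough token arithmetic for the default of 2 written above, with illustrative image and patch sizes:

```python
image_size, patch_size, scale_factor = 512, 16, 2
patch_tokens = (image_size // patch_size) ** 2   # 1024 tokens out of the vision encoder
llm_tokens = patch_tokens // scale_factor ** 2   # 256 tokens reach the language model
```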
@@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
         # default values below are taken from HF transformers code
-        self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.USE_GELU, True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims  # unused
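
Note (editor): the hardcoded `IMAGE_MEAN`/`IMAGE_STD` arrays are dropped here because the `VisionModel` base class now writes both values from `preprocessor_config.json` (see the earlier hunk), so Gemma 3 no longer needs its own defaults.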
@@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                 or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors
             name = name.replace("_weight", ".weight")
-            if "fc1" in name:
-                name = name.replace("fc1", "fc2")
-            else:
-                name = name.replace("fc2", "fc1")
 
             # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
             # the other norm values are part of the SigLIP model, and they are already correct
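
Note (editor): the `fc1`/`fc2` swap is removed, presumably because the runtime now consumes these tensors in their stored order. The remaining `_weight` rename exists because some Gemma 3 projector tensors are stored as bare parameters rather than `nn.Linear` weights; an illustrative example of its effect (HF tensor name assumed):

```python
name = "multi_modal_projector.mm_input_projection_weight"
name = name.replace("_weight", ".weight")
# -> "multi_modal_projector.mm_input_projection.weight", matching the tensor map
```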