@@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
             hparams = json.load(f)
+            architectures = hparams.get("architectures")
             if "text_config" in hparams:
                 hparams = {**hparams, **hparams["text_config"]}
+            if architectures is not None:
+                # preserve "architectures" from root level config
+                hparams["architectures"] = architectures
             return hparams
 
     @classmethod
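
Note on the `architectures` round-trip above: `{**hparams, **hparams["text_config"]}` lets keys from `text_config` win, so a nested `architectures` entry would shadow the root-level one that model registration keys off. A minimal sketch of the merge semantics this guards against (config values are hypothetical):

```python
# Hypothetical multimodal config.json: the nested text_config carries its
# own "architectures", which would shadow the root entry on merge.
hparams = {
    "architectures": ["SmolVLMForConditionalGeneration"],
    "text_config": {"architectures": ["LlamaForCausalLM"], "hidden_size": 2048},
}
merged = {**hparams, **hparams["text_config"]}
assert merged["architectures"] == ["LlamaForCausalLM"]  # root value lost
merged["architectures"] = hparams["architectures"]      # what the patch restores
```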
@@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
 class VisionModel(ModelBase):
     model_arch = gguf.MODEL_ARCH.CLIP_VISION
     n_text_embd = 0
+    preprocessor_config: dict[str, Any]
+    global_config: dict[str, Any]
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs):
 
         if "vision_config" not in self.hparams:
             raise ValueError("vision_config not found in hparams")
-        # move vision config to the top level
+        # move vision config to the top level, while preserving the original hparams in global_config
+        self.global_config = self.hparams
         self.hparams = self.hparams["vision_config"]
 
+        # load preprocessor config
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
+
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text)
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
+        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+        self.gguf_writer.add_vision_has_vision_encoder(True)
 
         # vision config
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE,           self.find_hparam(["image_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE,           self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH,     self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH,  self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT,          self.find_hparam(["num_hidden_layers"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+        # preprocessor config
+        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
 
     def write_vocab(self):
         raise ValueError("VisionModel does not support vocab writing")
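
For context on the new `preprocessor_config` reads: `preprocessor_config.json` is the HF image-processor config shipped next to `config.json`, and `image_mean`/`image_std` are per-channel RGB normalization constants. A minimal illustration (SigLIP-style values, not taken from any particular checkpoint):

```python
import json

# Illustrative preprocessor_config.json contents; real checkpoints usually
# carry more fields (size policy, rescale_factor, resampling, ...).
cfg = json.loads('{"image_mean": [0.5, 0.5, 0.5], "image_std": [0.5, 0.5, 0.5]}')

def normalize(px: float, channel: int) -> float:
    # the per-channel normalization these constants drive at inference time
    return (px - cfg["image_mean"][channel]) / cfg["image_std"][channel]

print(normalize(0.75, 0))  # -> 0.5
```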
@@ -1703,11 +1718,23 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@ModelBase.register(
+    "LLaMAForCausalLM",
+    "LlamaForCausalLM",
+    "MistralForCausalLM",
+    "MixtralForCausalLM",
+    "Idefics3ForConditionalGeneration",
+    "SmolVLMForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
@@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return []  # skip vision tensors
+        elif name.startswith("model.text_model"):
+            name = name.replace("text_model.", "")  # for SmolVLM
 
         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
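
A quick trace of how this routing treats a few tensor names (names are illustrative, patterned after Idefics3-style checkpoints):

```python
# Hypothetical tensor names: vision/connector tensors are left to the
# SmolVLMModel converter below, text tensors lose the "text_model." infix.
names = [
    "model.vision_model.encoder.layers.0.self_attn.q_proj.weight",
    "model.connector.modality_projection.proj.weight",
    "model.text_model.layers.0.self_attn.q_proj.weight",
]
for name in names:
    if "vision_tower" in name or "vision_model" in name or "model.connector" in name:
        continue  # skipped here, handled by SmolVLMModel
    if name.startswith("model.text_model"):
        name = name.replace("text_model.", "")
    print(name)  # -> model.layers.0.self_attn.q_proj.weight
```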
@@ -1852,6 +1885,41 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
+class SmolVLMModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing some keys in config.json
+        # default values are taken from transformers code
+        if self.hparams["model_type"] == "smolvlm_vision":
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
 @ModelBase.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
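
`scale_factor` feeds Idefics3's pixel-shuffle connector, which trades spatial resolution for channel depth before projecting into the text embedding space; recording it in GGUF lets the runtime reproduce the reshape. A sketch of the operation following the HF transformers connector (shapes illustrative; this is not the llama.cpp implementation):

```python
import torch

def pixel_shuffle(x: torch.Tensor, scale_factor: int = 2) -> torch.Tensor:
    # (batch, h*w, c) -> (batch, h*w / s^2, c * s^2): fewer image tokens,
    # each carrying an s*s block of patches' worth of channels
    bsz, seq, embed_dim = x.shape
    height = width = int(seq ** 0.5)
    x = x.view(bsz, height, width // scale_factor, embed_dim * scale_factor)
    x = x.permute(0, 2, 1, 3)
    x = x.reshape(bsz, width // scale_factor, height // scale_factor,
                  embed_dim * scale_factor**2)
    x = x.permute(0, 2, 1, 3)
    return x.reshape(bsz, seq // scale_factor**2, embed_dim * scale_factor**2)

# e.g. a 32x32 patch grid of 1152-dim embeddings -> 256 tokens of 4608 dims
print(pixel_shuffle(torch.randn(1, 1024, 1152)).shape)  # torch.Size([1, 256, 4608])
```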
@@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
         # default values below are taken from HF transformers code
-        self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD,  [0.5, 0.5, 0.5])
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.USE_GELU, True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims  # unused
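
The hard-coded `IMAGE_MEAN`/`IMAGE_STD` arrays can go because the `VisionModel` base class now reads both values from `preprocessor_config.json` (see the hunk above). A sanity check one could run against a local Gemma 3 checkpoint (the directory name is hypothetical):

```python
import json
from pathlib import Path

# The values the base class now picks up should match the constants this
# hunk removes; the checkpoint directory name is a placeholder.
cfg = json.loads((Path("gemma-3-4b-it") / "preprocessor_config.json").read_text())
assert cfg["image_mean"] == [0.5, 0.5, 0.5]
assert cfg["image_std"] == [0.5, 0.5, 0.5]
```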
@@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors
             name = name.replace("_weight", ".weight")
-            if "fc1" in name:
-                name = name.replace("fc1", "fc2")
-            else:
-                name = name.replace("fc2", "fc1")
 
             # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
             # the other norm values are part of the SigLIP model, and they are already correct
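
With the fc1/fc2 swap dropped (the up/down distinction is presumably handled by the tensor mapping now), the only rename left on this path is `_weight` -> `.weight`, which targets the Gemma projector tensors. A trace on a hypothetical name:

```python
# Hypothetical Gemma 3 projector tensor; SigLIP's fc1/fc2 tensors now pass
# through under their original names instead of being swapped.
name = "multi_modal_projector.mm_input_projection_weight"
print(name.replace("_weight", ".weight"))
# -> multi_modal_projector.mm_input_projection.weight
```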