@@ -292,7 +292,10 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
             self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
-            self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+            try:
+                self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+            except KeyError:
+                self.gguf_writer.add_vision_vit_select_layer(0)
 
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
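
Note on the fallback above: `find_hparam` raises `KeyError` when none of the candidate keys exists, and SigLIP-based configs such as Idefics3/SmolVLM define neither `vision_feature_layer` nor `mm_vision_select_layer`, so the writer now defaults to layer 0. A minimal standalone sketch of the same lookup-with-default pattern (the helper name is illustrative, not part of the converter):

```python
from typing import Any

def select_layer_or_default(hparams: dict[str, Any], default: int = 0) -> int:
    # Mirror find_hparam's candidate order, but return a default instead of raising.
    for key in ("vision_feature_layer", "mm_vision_select_layer"):
        if key in hparams:
            return hparams[key]
    return default  # e.g. configs that define neither key

print(select_layer_or_default({"mm_vision_select_layer": -2}))  # -2
print(select_layer_or_default({}))                              # 0
```
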
@@ -506,8 +509,9 @@ def load_hparams(dir_model: Path):
             hparams = json.load(f)
         if "text_config" in hparams:
             text_config = hparams["text_config"]
+            model_id = text_config.get("_name_or_path", None)
             # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID
-            if "_name_or_path" in text_config:
+            if model_id is not None and model_id != "None" and model_id != "":
                 text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
             hparams = {**text_config, **hparams}
         return hparams
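
The stricter check matters because some exported configs carry `_name_or_path` as an empty string or the literal string `"None"`, which previously sent a bogus model ID to `AutoConfig.from_pretrained`. A hedged sketch of the guard in isolation (the helper name and model ID are placeholders, not part of the converter):

```python
def usable_model_id(text_config: dict) -> str | None:
    # Only a non-empty, non-"None" value should trigger a remote config lookup.
    model_id = text_config.get("_name_or_path", None)
    if model_id is None or model_id in ("", "None"):
        return None
    return model_id

assert usable_model_id({}) is None
assert usable_model_id({"_name_or_path": "None"}) is None
assert usable_model_id({"_name_or_path": "org/placeholder-model"}) == "org/placeholder-model"
```
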
@@ -1616,7 +1620,7 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1640,6 +1644,11 @@ def __init__(self, *args, **kwargs):
             self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict()
             self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM
 
+        if "vision_config" in self.hparams and model_type == "idefics3":
+            self.vparams = self.hparams["vision_config"]
+            self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
+            self.vision_arch = gguf.MODEL_ARCH.VISION_IDEFICS3
+
         if self.vparams is not None and self.vision_arch is not None:
             self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
 
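
For Idefics3/SmolVLM the vision parameters live directly in `config.json` (a top-level `vision_config` with `model_type == "idefics3"`), and the image preprocessor config is read from the local model directory rather than from a separate vision model ID as in the LLaVA/MobileVLM branches above. A rough sketch of the config shape this branch consumes; the values below are illustrative, not read from a real checkpoint:

```python
example_hparams = {
    "model_type": "idefics3",
    "scale_factor": 3,            # consumed later in set_gguf_parameters
    "vision_config": {
        "num_hidden_layers": 27,  # drives the vision tensor name map
        "num_attention_heads": 16,
        "image_size": 384,
        "patch_size": 14,
    },
}

if "vision_config" in example_hparams and example_hparams["model_type"] == "idefics3":
    vparams = example_hparams["vision_config"]
    print(vparams["image_size"] // vparams["patch_size"])  # 27 patches per side
```
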
@@ -1694,14 +1703,20 @@ def set_gguf_parameters(self):
 
         # For vision model
         if self.vparams is not None:
+            max_pos_embd = -1
             self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             # TODO: should not hardcode these, but they are currently missing from config.json
             if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
                 self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
                 self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
+            if self.vision_arch == gguf.MODEL_ARCH.VISION_IDEFICS3:
+                self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                self.gguf_writer.add_vision_vit_scale_factor(self.hparams["scale_factor"])
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
             self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
-            max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
 
     @staticmethod
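
The position-embedding count now depends on the encoder: the CLIP-style towers used by LLaVA/MobileVLM prepend a class token (hence the `+ 1`), while the SigLIP encoder used by Idefics3/SmolVLM has no class token; `scale_factor` presumably records the Idefics3 connector's pixel-shuffle factor. A worked example of the count (typical sizes, not read from a config):

```python
def max_pos_embd(image_size: int, patch_size: int, has_class_token: bool) -> int:
    return (image_size // patch_size) ** 2 + (1 if has_class_token else 0)

print(max_pos_embd(336, 14, True))   # CLIP ViT-L/14 @ 336px -> 577
print(max_pos_embd(384, 14, False))  # SigLIP-SO400M/14 @ 384px -> 729
```
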
@@ -1717,19 +1732,23 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name
 
         # For vision model
         if name.startswith("language_model"):
             name = name.replace("language_model.", "")
+        if name.startswith("model.text_model"):
+            name = name.replace("text_model.", "")  # for SmolVLM
         else:
             name = name.replace("model.vision_tower.", "")
-            if "post_layernorm" in name:
+            if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3:
                 return []  # skip post_layernorm
 
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if not is_vision_tensor:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
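
The `is_vision_tensor` guard keeps the RoPE-oriented Q/K permutation away from the ViT encoder: only LLaMA-side attention weights should be reshuffled, while vision `q_proj`/`k_proj` tensors must keep their native layout. A small sketch of the routing decision only (the permutation itself is unchanged; tensor names are illustrative):

```python
def needs_llama_permute(name: str) -> bool:
    is_vision_tensor = "vision_tower" in name or "vision_model" in name
    return not is_vision_tensor and name.endswith(
        ("q_proj.weight", "q_proj.bias", "k_proj.weight", "k_proj.bias")
    )

print(needs_llama_permute("model.layers.0.self_attn.q_proj.weight"))                        # True
print(needs_llama_permute("model.vision_model.encoder.layers.0.self_attn.q_proj.weight"))   # False
```
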