@@ -1898,6 +1898,55 @@ def prepare_tensors(self):
18981898 raise ValueError (f"Unprocessed experts: { experts } " )
18991899
19001900
@ModelBase.register("LlavaForConditionalGeneration")
class LlavaVisionModel(VisionModel):
    """Converter for the vision tower of LLaVA-style checkpoints.

    Currently only the pixtral variant is supported; any other
    ``model_type`` raises ``ValueError`` at construction time.
    """

    # Token id of the [IMG_BREAK] marker inside the text embedding table.
    # -1 is a sentinel meaning "no such token"; it disables the embedding
    # extraction in modify_tensors().
    img_break_tok_id = -1

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.hparams["model_type"] == "pixtral":
            # Fill in values that some pixtral config.json files omit.
            # Defaults mirror the HF transformers pixtral vision config.
            defaults = {
                "num_attention_heads": 16,
                "num_hidden_layers": 24,
                "intermediate_size": 4096,
                "hidden_size": 1024,
                "layer_norm_eps": 1e-5,
            }
            for key, value in defaults.items():
                self.hparams.setdefault(key, value)
            self.img_break_tok_id = 12  # see tokenizer_config.json
        else:
            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")

    def set_gguf_parameters(self):
        """Write pixtral-specific vision hyperparameters to the GGUF writer."""
        super().set_gguf_parameters()
        hparams = self.hparams
        if hparams["model_type"] == "pixtral":
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
            # default values below are taken from HF transformers code
            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
            self.gguf_writer.add_vision_use_silu(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one HF tensor to its GGUF name(s), permuting attention
        projections and extracting the [IMG_BREAK] token embedding.

        Tensors that belong to neither the vision tower nor the projector
        (and are not the embedding table) are dropped.
        """
        del bid  # unused
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = n_head  # vision attention is not grouped-query

        if name.startswith(("multi_modal_projector.", "vision_tower.")):
            # process vision tensors
            if name.endswith(("q_proj.weight", "q_proj.bias")):
                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
            if name.endswith(("k_proj.weight", "k_proj.bias")):
                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
            return [(self.map_tensor_name(name), data_torch)]

        # NOTE: compare against the -1 sentinel with >=, not >, so that a
        # hypothetical token id of 0 would still be handled correctly.
        if self.img_break_tok_id >= 0 and "embed_tokens.weight" in name:
            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
            img_break_embd = data_torch[self.img_break_tok_id]
            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
            return [(self.map_tensor_name(name), img_break_embd)]

        return []  # skip other tensors
1949+
19011950@ModelBase .register ("Idefics3ForConditionalGeneration" , "SmolVLMForConditionalGeneration" )
19021951class SmolVLMModel (VisionModel ):
19031952 def __init__ (self , * args , ** kwargs ):
0 commit comments