@@ -776,6 +776,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
+        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
+            # ref: https://huggingface.co/mistral-community/pixtral-12b
+            res = "pixtral"
 
         if res is None:
             logger.warning("\n")
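For reference, a minimal sketch of where a `chkhsh` value like the new Pixtral entry comes from: the converter hashes the token IDs the tokenizer produces for a fixed probe string, so any change in BPE pre-tokenization behaviour surfaces as a new hash. The probe text below is a placeholder, not the converter's actual reference string.

```python
# Sketch only: illustrates the hashing convention behind chkhsh values.
# PROBE_TEXT is a stand-in; the real converter uses its own fixed reference text.
from hashlib import sha256
from transformers import AutoTokenizer

PROBE_TEXT = "example probe text with an emoji 🚀 and\twhitespace\n"  # placeholder

def vocab_hash(model_id: str, probe_text: str = PROBE_TEXT) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    token_ids = tokenizer.encode(probe_text)
    return sha256(str(token_ids).encode()).hexdigest()

# With the converter's real probe text, this would be expected to reproduce
# "0e9433cb..." for mistral-community/pixtral-12b.
```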
@@ -1724,7 +1727,8 @@ def prepare_tensors(self):
17241727 "MistralForCausalLM" ,
17251728 "MixtralForCausalLM" ,
17261729 "Idefics3ForConditionalGeneration" ,
1727- "SmolVLMForConditionalGeneration" )
1730+ "SmolVLMForConditionalGeneration" ,
1731+ "LlavaForConditionalGeneration" )
17281732class LlamaModel (TextModel ):
17291733 model_arch = gguf .MODEL_ARCH .LLAMA
17301734 undo_permute = True
@@ -1734,6 +1738,10 @@ def __init__(self, *args, **kwargs):
         # fix for SmolVLM2, missing `num_attention_heads` in config.json
         if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        # fix for Pixtral, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
+                and self.hparams.get("model_type") == "mistral":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def set_vocab(self):
         try:
@@ -1797,12 +1805,17 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+        is_vision_tensor = "vision_tower" in name \
+            or "vision_model" in name \
+            or "model.connector" in name \
+            or "multi_modal_projector" in name
 
         if is_vision_tensor:
             return []  # skip vision tensors
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "")  # for SmolVLM
+        elif name.startswith("language_model."):
+            name = name.replace("language_model.", "")  # for the rest
 
         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
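Illustrative only (not part of the commit): how the name handling above routes a few representative checkpoint tensor names. The example names follow the usual HF multimodal layout and are assumptions.

```python
# Toy walk-through of the routing above (example tensor names are assumptions).
def route(name: str) -> str | None:
    is_vision_tensor = ("vision_tower" in name or "vision_model" in name
                        or "model.connector" in name or "multi_modal_projector" in name)
    if is_vision_tensor:
        return None                                 # skipped by the text model
    elif name.startswith("model.text_model"):
        return name.replace("text_model.", "")      # SmolVLM naming
    elif name.startswith("language_model."):
        return name.replace("language_model.", "")  # Llava/Pixtral naming
    return name

print(route("multi_modal_projector.linear_1.weight"))                  # None
print(route("language_model.model.layers.0.self_attn.q_proj.weight"))  # model.layers.0.self_attn.q_proj.weight
```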
@@ -1885,6 +1898,55 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("LlavaForConditionalGeneration")
+class LlavaVisionModel(VisionModel):
+    img_break_tok_id = -1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams["model_type"] == "pixtral":
+            # fix missing config.json values
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
+            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
+            self.img_break_tok_id = 12  # see tokenizer_config.json
+        else:
+            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if hparams["model_type"] == "pixtral":
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            # default values below are taken from HF transformers code
+            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = n_head
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
+            # process vision tensors
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
+            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
+            # for the Pixtral model, we need to extract the [IMG_BREAK] token embedding
+            img_break_embd = data_torch[self.img_break_tok_id]
+            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
+            return [(self.map_tensor_name(name), img_break_embd)]
+
+        return []  # skip other tensors
+
+
 @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
 class SmolVLMModel(VisionModel):
     def __init__(self, *args, **kwargs):
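A small self-contained sketch of what the `LlamaModel.permute` call used above does to a Q/K projection, assuming the converter's usual reshape/swapaxes implementation: within each head, rows stored as two contiguous halves are reordered into interleaved pairs.

```python
# Illustrative sketch of LlamaModel.permute (assuming the usual reshape/swapaxes
# implementation): rows of each head go from "two halves" order to interleaved order.
import torch

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None):
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# toy example: one head, head_dim = 4 -> rows [0, 1, 2, 3] become [0, 2, 1, 3]
w = torch.arange(4).float().unsqueeze(1)      # shape (4, 1)
print(permute(w, 1, 1).squeeze(1).tolist())   # [0.0, 2.0, 1.0, 3.0]
```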
@@ -5079,10 +5141,25 @@ class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])
+        special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        rope_dim = self.hparams["head_dim"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "yarn":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
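Worked example of the new rope dimension count, using assumed GLM-4-style values (the converter reads the real ones from the model's config.json):

```python
# Assumed example values; the converter reads these from config.json.
head_dim = 128                   # e.g. hidden_size // num_attention_heads
partial_rotary_factor = 0.5      # default used above when the key is absent
rope_dim = int(head_dim * partial_rotary_factor)
print(rope_dim)                  # 64 -> value passed to add_rope_dimension_count()
```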