 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     Qwen2_5_VLProcessor,
-    AutoProcessor,
     Qwen2_5_VLConfig,
     Qwen2VLImageProcessor
 )
@@ -21,14 +20,29 @@ def k(raw_key: str, arch: str) -> str:
 
 def to_gguf_name(name: str) -> str:
     og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
+    # Handle the base case where vision_model is not in the name
+    if not name.startswith("vision_model."):
+        name = "vision_model." + name
+
+    name = name.replace("vision_model", "v")
+    name = name.replace("text_model", "t")
+    name = name.replace("blocks", "blk")
+    name = name.replace("embeddings.", "")
     name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.gate_proj", "ffn_gate").replace("mlp.up_proj", "ffn_up").replace("mlp.down_proj", "ffn_down")
+
+    # Handle MLP components correctly
+    name = name.replace("mlp.gate_proj", "ffn_gate")
+    name = name.replace("mlp.up_proj", "ffn_up")
+    name = name.replace("mlp.down_proj", "ffn_down")
+
+    # Handle projection and norm components
     name = name.replace("proj.", "out.")
-    # Replace norm names so that layernorms become ln1/ln2
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+    name = name.replace("norm1", "ln1")
+    name = name.replace("norm2", "ln2")
+
+    # Handle merger components correctly
     name = name.replace("merger.mlp", "mm")
+
     print(f"[to_gguf_name] {og} --> {name}")
     return name
 
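As a quick sanity check, the renaming above should produce mappings like the following (a minimal sketch; the expected values are inferred from the replacement order in this hunk, not from a separate spec):

    # Hypothetical self-test for to_gguf_name(); not part of the commit.
    for src, expected in [
        ("blocks.0.attn.qkv.weight",      "v.blk.0.attn_qkv.weight"),
        ("blocks.0.attn.proj.weight",     "v.blk.0.attn_out.weight"),
        ("blocks.0.mlp.gate_proj.weight", "v.blk.0.ffn_gate.weight"),
        ("blocks.0.norm1.weight",         "v.blk.0.ln1.weight"),
        ("merger.mlp.0.weight",           "v.mm.0.weight"),
    ]:
        assert to_gguf_name(src) == expected, (src, to_gguf_name(src))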
@@ -37,6 +51,10 @@ def find_vision_tensors(qwen2vl, np_dtype) -> Dict[str, np.ndarray]:
     vision_model = qwen2vl.visual
     tensor_map = {}
 
+    # Debug info
+    print(f"Vision model type: {type(vision_model)}")
+    print(f"Number of blocks: {len(vision_model.blocks)}")
+
     for name, ten in vision_model.state_dict().items():
         ten = ten.numpy()
 
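One caveat with the loop above: `Tensor.numpy()` only works on detached CPU tensors. If the model may be loaded on a GPU, a more defensive form (an assumption, not part of this commit) would be:

    ten = ten.detach().cpu().numpy()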
@@ -51,14 +69,14 @@ def find_vision_tensors(qwen2vl, np_dtype) -> Dict[str, np.ndarray]:
             wq = ten[:c]
             wk = ten[c: c * 2]
             wv = ten[c * 2:]
-            base_name = to_gguf_name(f"vision_model.{name}")
+            base_name = to_gguf_name(name)
             tensor_map[base_name.replace("qkv", "q")] = wq
             tensor_map[base_name.replace("qkv", "k")] = wk
             tensor_map[base_name.replace("qkv", "v")] = wv
 
         elif 'gate_proj' in name or 'up_proj' in name or 'down_proj' in name:
             # Handle the MLP structure with gate/up/down projections
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+            tensor_map[to_gguf_name(name)] = ten
 
         elif 'merger' in name:
             # Map merger layernorm parameters to post_ln keys
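The fused-QKV split above relies on q, k and v being stacked along dim 0 of the fused weight, each block spanning `c = ten.shape[0] // 3` rows. A minimal, self-contained sketch of the same slicing:

    import numpy as np

    hidden = 4
    qkv = np.arange(3 * hidden * hidden, dtype=np.float32).reshape(3 * hidden, hidden)
    c = qkv.shape[0] // 3
    wq, wk, wv = qkv[:c], qkv[c:c * 2], qkv[c * 2:]
    assert wq.shape == wk.shape == wv.shape == (hidden, hidden)
    assert (np.concatenate([wq, wk, wv], axis=0) == qkv).all()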
@@ -85,26 +103,38 @@ def find_vision_tensors(qwen2vl, np_dtype) -> Dict[str, np.ndarray]:
             # For the Conv3d, split the temporal kernel dimension (which is 2)
             c1, c2, kt, kh, kw = ten.shape
             assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+
+            # Properly handle the Conv3d weights for GGUF
+            # Reshape from [output_channels, input_channels, temporal, height, width]
+            # to the format expected by GGUF
+            # For temporal slice 0
+            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, :, :].reshape(c1, c2 * kh * kw)
+            # For temporal slice 1
+            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, :, :].reshape(c1, c2 * kh * kw)
+
+        elif 'norm1' in name or 'norm2' in name:
+            # Handle the RMSNorm correctly
+            tensor_map[to_gguf_name(name)] = ten
 
         else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+            tensor_map[to_gguf_name(name)] = ten
 
     # Ensure biases and layer norm weights remain in fp32
     for new_name, ten in tensor_map.items():
         if (ten.ndim <= 1 or
                 new_name.endswith("ln1.weight") or
                 new_name.endswith("ln1.bias") or
                 new_name.endswith("ln2.weight") or
-                new_name.endswith("ln2.bias")):
+                new_name.endswith("ln2.bias") or
+                new_name.endswith("post_ln.weight") or
+                new_name.endswith("post_ln.bias")):
             tensor_map[new_name] = ten.astype(np.float32)
         else:
             tensor_map[new_name] = ten.astype(np_dtype)
 
-    # Dummy tensor as a placeholder for position embeddings
-    # Required even when using rotary embeddings
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)
+    # Add rotary embeddings info - dummy tensor as a placeholder
+    # This is needed because the model uses rotary position embeddings
+    tensor_map["v.position_embd.weight"] = np.zeros([1, 1], dtype=np.float32)
 
     return tensor_map
 
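The patch-embedding handling above splits a Conv3d kernel with temporal_patch_size == 2 into two temporal slices and flattens each to 2D. A standalone sketch with illustrative shapes (c1/c2 depend on the checkpoint; 14x14 is the Qwen2.5-VL spatial patch size):

    import numpy as np

    c1, c2, kt, kh, kw = 1280, 3, 2, 14, 14
    ten = np.random.rand(c1, c2, kt, kh, kw).astype(np.float32)
    w0 = ten[:, :, 0, :, :].reshape(c1, c2 * kh * kw)  # temporal slice 0
    w1 = ten[:, :, 1, :, :].reshape(c1, c2 * kh * kw)  # temporal slice 1
    assert w0.shape == w1.shape == (c1, c2 * kh * kw)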
@@ -160,36 +190,70 @@ def main(args):
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)
 
+    # Add key vision model parameters
     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 560)
-    fout.add_uint32("clip.vision.projection_dim", 1536)
+    fout.add_uint32("clip.vision.projection_dim", 1536)  # Output of the merger
     fout.add_uint32("clip.vision.embedding_length", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
-    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)  # From the RMSNorm epsilon
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
-    # For Qwen2.5VL the feed forward dim is 0 since we handle the MLP differently
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)
+
+    # For Qwen2.5VL, specify the feed forward dimension from the MLP
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 3420)  # From gate_proj/up_proj dimensions
+
+    # Add additional flags for Qwen2.5-specific features
+    fout.add_bool("clip.vision.use_rms_norm", True)  # Qwen2.5VL uses RMSNorm
+    fout.add_bool("clip.vision.use_rotary_embeddings", True)  # Uses rotary embeddings
+
     fout.add_name(model_name)
 
     fout.add_string("clip.vision.mm_patch_merge_type", "qwen2vl_merger")
     # Set the appropriate crop resolution based on image_size
     fout.add_uint32("clip.vision.image_crop_resolution", 560)
 
+    # Add image grid pinpoints to avoid buffer overflow
+    # This array defines normalized coordinates for grid sampling in the vision model
+    # Using standard grid points for a 560x560 image with patch size 14
+    grid_size = 560 // 14  # Number of patches in each dimension
+    pinpoints = []
+    for y in range(grid_size):
+        for x in range(grid_size):
+            # Normalized coordinates from 0.0 to 1.0
+            # Convert to Python float instead of numpy.float32
+            pinpoints.append(float(x / (grid_size - 1)))
+            pinpoints.append(float(y / (grid_size - 1)))
+
+    # Add pinpoints as a float array
+    fout.add_array("clip.vision.image_grid_pinpoints", pinpoints)
+
+    # Load the processor for image normalization values
     if MODEL_INPUT_DIR is not None:
-        processor: Qwen2_5_VLProcessor = Qwen2VLImageProcessor.from_pretrained(model_path)
+        processor = Qwen2VLImageProcessor.from_pretrained(model_path)
     else:
-        processor: Qwen2_5_VLProcessor = Qwen2_5_VLProcessor.from_pretrained(model_name)
-
-    fout.add_array("clip.vision.image_mean", processor.image_mean)
-    fout.add_array("clip.vision.image_std", processor.image_std)
+        processor = Qwen2_5_VLProcessor.from_pretrained(model_name)
+
+    # Get the image mean and std values and ensure they're in the right format
+    try:
+        # Try accessing through image_processor first (newer versions)
+        image_mean = processor.image_processor.image_mean
+        image_std = processor.image_processor.image_std
+    except AttributeError:
+        # Fall back to direct access (older versions / bare image processors)
+        image_mean = processor.image_mean
+        image_std = processor.image_std
+
+    # Convert numpy values to Python floats
+    image_mean = [float(x) for x in image_mean]
+    image_std = [float(x) for x in image_std]
+
+    # Add arrays with Python float values
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
 
     # Set the activation function flags based on the model config
-    if hasattr(vcfg, 'hidden_act') and 'silu' in vcfg.hidden_act.lower():
-        fout.add_bool("clip.use_silu", True)
-        fout.add_bool("clip.use_gelu", False)
-    else:
-        fout.add_bool("clip.use_silu", False)
-        fout.add_bool("clip.use_gelu", False)  # Use defaults from dump
+    fout.add_bool("clip.use_silu", True)  # Qwen2.5VL uses SiLU activation in the MLP
+    fout.add_bool("clip.use_gelu", False)
 
     fout.write_header_to_file()
     fout.write_kv_data_to_file()
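After conversion, the written file can be spot-checked with gguf-py's reader (a sketch; the output filename here is hypothetical):

    from gguf import GGUFReader

    reader = GGUFReader("qwen2.5-vl-vision.gguf")
    print([t.name for t in reader.tensors][:8])        # first few tensor names
    print("clip.vision.patch_size" in reader.fields)   # metadata key present?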