@@ -39,11 +39,26 @@ def to_gguf_name(name: str) -> str:
3939 if "weight_g" in name :
4040 name = name .replace ("weight_g" , "weight" )
4141
42+ # Special handling for merger tensors to match clip.cpp expectations
43+ if "merger.mlp" in name :
44+ # Extract the layer number
45+ parts = name .split ("." )
46+ for i , part in enumerate (parts ):
47+ if part == "mlp" and i + 1 < len (parts ):
48+ layer_num = parts [i + 1 ]
49+ # Map the merger layers to the expected GGUF tensor names
50+ # Note: clip.cpp looks for mm.0.* and mm.2.* (not mm.1.*)
51+ if layer_num == "0" :
52+ name = name .replace (f"merger.mlp.{ layer_num } " , "mm.0" )
53+ elif layer_num == "1" :
54+ name = name .replace (f"merger.mlp.{ layer_num } " , "mm.2" )
55+ break
56+
4257 print (f"[to_gguf_name] { og } --> { name } " )
4358 return name
4459
4560
46- def find_vision_tensors (model , dtype ) -> Dict [str , np .ndarray ]:
61+ def find_vision_tensors (model , dtype , hidden_size ) -> Dict [str , np .ndarray ]:
4762 visual = model .visual
4863 tensor_map = {}
4964
@@ -68,8 +83,23 @@ def find_vision_tensors(model, dtype) -> Dict[str, np.ndarray]:
6883 elif name .endswith ("ln_q.bias" ) and 'weight_g' not in name :
6984 tensor_map ['v.post_ln.bias' ] = ten
7085 else :
71- # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
72- tensor_map [to_gguf_name (name )] = ten
86+ # Handle merger tensors with special attention to naming
87+ # First, determine if this is a layer 0 or layer 1 tensor
88+ if "merger.mlp.0" in name :
89+ # First layer gets mapped to mm.0.*
90+ if "weight" in name :
91+ tensor_map ["mm.0.weight" ] = ten
92+ elif "bias" in name :
93+ tensor_map ["mm.0.bias" ] = ten
94+ elif "merger.mlp.1" in name :
95+ # Second layer gets mapped to mm.2.* (not mm.1.*)
96+ if "weight" in name :
97+ tensor_map ["mm.2.weight" ] = ten
98+ elif "bias" in name :
99+ tensor_map ["mm.2.bias" ] = ten
100+ else :
101+ # For any other tensors, use the standard naming conversion
102+ tensor_map [to_gguf_name (name )] = ten
73103 elif 'patch_embed.proj.weight' in name :
74104 # NOTE: split Conv3D into Conv2Ds
75105 c1 , c2 , kt , kh , kw = ten .shape
@@ -84,7 +114,10 @@ def find_vision_tensors(model, dtype) -> Dict[str, np.ndarray]:
84114 tensor_map [new_name ] = ten .astype (np .float32 )
85115 else :
86116 tensor_map [new_name ] = ten .astype (dtype )
87- tensor_map ["v.position_embd.weight" ] = np .zeros ([10 , 10 ], dtype = np .float32 ) # dummy tensor, just here as a placeholder
117+ # For Qwen2.5, create a properly sized position embedding tensor
118+ # Size it based on the model's hidden dimension and expected sequence length
119+ seq_length = 40 * 40 # Approximate max sequence length
120+ tensor_map ["v.position_embd.weight" ] = np .zeros ([seq_length , hidden_size ], dtype = np .float32 ) # Properly sized placeholder
88121 return tensor_map
89122
90123
@@ -153,7 +186,7 @@ def main(args):
153186 image_size = 14 * 40 # same as used below
154187 fout .add_uint32 ("clip.vision.image_crop_resolution" , image_size )
155188
156- tensor_map = find_vision_tensors (model , np_dtype )
189+ tensor_map = find_vision_tensors (model , np_dtype , vcfg . hidden_size )
157190 for name , data in tensor_map .items ():
158191 fout .add_tensor (name , data )
159192