Commit 801d64d

Surgery updates.
1 parent a4ca0c3 commit 801d64d

File tree: 1 file changed (+94 −30)

examples/llava/qwen2_5_vl_surgery.py

Lines changed: 94 additions & 30 deletions
@@ -6,7 +6,6 @@
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     Qwen2_5_VLProcessor,
-    AutoProcessor,
     Qwen2_5_VLConfig,
     Qwen2VLImageProcessor
 )
@@ -21,14 +20,29 @@ def k(raw_key: str, arch: str) -> str:
 
 def to_gguf_name(name: str) -> str:
     og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
+    # Handle the base case where vision_model is not in the name
+    if not name.startswith("vision_model."):
+        name = "vision_model." + name
+
+    name = name.replace("vision_model", "v")
+    name = name.replace("text_model", "t")
+    name = name.replace("blocks", "blk")
+    name = name.replace("embeddings.", "")
     name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.gate_proj", "ffn_gate").replace("mlp.up_proj", "ffn_up").replace("mlp.down_proj", "ffn_down")
+
+    # Handle MLP components correctly
+    name = name.replace("mlp.gate_proj", "ffn_gate")
+    name = name.replace("mlp.up_proj", "ffn_up")
+    name = name.replace("mlp.down_proj", "ffn_down")
+
+    # Handle projection and norm components
     name = name.replace("proj.", "out.")
-    # Replace norm names so that layernorms become ln1/ln2
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+    name = name.replace("norm1", "ln1")
+    name = name.replace("norm2", "ln2")
+
+    # Handle merger components correctly
     name = name.replace("merger.mlp", "mm")
+
     print(f"[to_gguf_name] {og} --> {name}")
     return name
 
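As a quick illustration of the renaming this refactor produces (a sketch only, not part of the commit; the input strings are assumed examples of Qwen2.5-VL vision state_dict keys):

    # Assumes to_gguf_name from qwen2_5_vl_surgery.py is in scope.
    assert to_gguf_name("blocks.0.attn.qkv.weight") == "v.blk.0.attn_qkv.weight"
    assert to_gguf_name("blocks.0.mlp.gate_proj.weight") == "v.blk.0.ffn_gate.weight"
    assert to_gguf_name("blocks.0.norm1.weight") == "v.blk.0.ln1.weight"
    assert to_gguf_name("merger.mlp.0.weight") == "v.mm.0.weight"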

@@ -37,6 +51,10 @@ def find_vision_tensors(qwen2vl, np_dtype) -> Dict[str, np.ndarray]:
     vision_model = qwen2vl.visual
     tensor_map = {}
 
+    # Debug info
+    print(f"Vision model type: {type(vision_model)}")
+    print(f"Number of blocks: {len(vision_model.blocks)}")
+
     for name, ten in vision_model.state_dict().items():
         ten = ten.numpy()
 
@@ -51,14 +69,14 @@ def find_vision_tensors(qwen2vl, np_dtype) -> Dict[str, np.ndarray]:
             wq = ten[:c]
             wk = ten[c: c * 2]
             wv = ten[c * 2:]
-            base_name = to_gguf_name(f"vision_model.{name}")
+            base_name = to_gguf_name(name)
             tensor_map[base_name.replace("qkv", "q")] = wq
             tensor_map[base_name.replace("qkv", "k")] = wk
             tensor_map[base_name.replace("qkv", "v")] = wv
 
         elif 'gate_proj' in name or 'up_proj' in name or 'down_proj' in name:
             # Handle the MLP structure with gate/up/down projections
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+            tensor_map[to_gguf_name(name)] = ten
 
         elif 'merger' in name:
             # Map merger layernorm parameters to post_ln keys
@@ -85,26 +103,38 @@ def find_vision_tensors(qwen2vl, np_dtype) -> Dict[str, np.ndarray]:
             # For the Conv3d, split the temporal kernel dimension (which is 2)
             c1, c2, kt, kh, kw = ten.shape
             assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+
+            # Properly handle the Conv3d weights for GGUF
+            # Reshape from [output_channels, input_channels, temporal, height, width]
+            # to the format expected by GGUF
+            # For temporal slice 0
+            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, :, :].reshape(c1, c2 * kh * kw)
+            # For temporal slice 1
+            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, :, :].reshape(c1, c2 * kh * kw)
+
+        elif 'norm1' in name or 'norm2' in name:
+            # Handle the RMSNorm correctly
+            tensor_map[to_gguf_name(name)] = ten
 
         else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+            tensor_map[to_gguf_name(name)] = ten
 
     # Ensure biases and layer norm weights remain in fp32
     for new_name, ten in tensor_map.items():
         if (ten.ndim <= 1 or
                 new_name.endswith("ln1.weight") or
                 new_name.endswith("ln1.bias") or
                 new_name.endswith("ln2.weight") or
-                new_name.endswith("ln2.bias")):
+                new_name.endswith("ln2.bias") or
+                new_name.endswith("post_ln.weight") or
+                new_name.endswith("post_ln.bias")):
             tensor_map[new_name] = ten.astype(np.float32)
         else:
             tensor_map[new_name] = ten.astype(np_dtype)
 
-    # Dummy tensor as a placeholder for position embeddings
-    # Required even when using rotary embeddings
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)
+    # Add rotary embeddings info - dummy tensor as a placeholder
+    # This is needed because the model uses rotary position embeddings
+    tensor_map["v.position_embd.weight"] = np.zeros([1, 1], dtype=np.float32)
 
     return tensor_map
 
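A minimal numpy sketch of the patch-embedding split and reshape above, with hypothetical dimensions (hidden size 1280, 3 input channels, temporal_patch_size 2, patch size 14; the real values come from the checkpoint):

    import numpy as np

    ten = np.zeros((1280, 3, 2, 14, 14), dtype=np.float32)  # hypothetical Conv3d weight
    c1, c2, kt, kh, kw = ten.shape
    w0 = ten[:, :, 0, :, :].reshape(c1, c2 * kh * kw)  # temporal slice 0
    w1 = ten[:, :, 1, :, :].reshape(c1, c2 * kh * kw)  # temporal slice 1
    assert w0.shape == w1.shape == (1280, 3 * 14 * 14)  # (1280, 588)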

@@ -160,36 +190,70 @@ def main(args):
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)
 
+    # Add key vision model parameters
     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 560)
-    fout.add_uint32("clip.vision.projection_dim", 1536)
+    fout.add_uint32("clip.vision.projection_dim", 1536)  # Output of the merger
     fout.add_uint32("clip.vision.embedding_length", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
-    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)  # From the RMSNorm epsilon
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
-    # For Qwen2.5VL the feed forward dim is 0 since we handle the MLP differently
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)
+
+    # For Qwen2.5VL, specify the feed forward dimension from mlp
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 3420)  # From gate_proj/up_proj dimensions
+
+    # Add additional flags for Qwen2.5 specific features
+    fout.add_bool("clip.vision.use_rms_norm", True)  # Qwen2 uses RMSNorm
+    fout.add_bool("clip.vision.use_rotary_embeddings", True)  # Uses rotary embeddings
+
     fout.add_name(model_name)
 
     fout.add_string("clip.vision.mm_patch_merge_type", "qwen2vl_merger")
     # Set the appropriate crop resolution based on image_size
     fout.add_uint32("clip.vision.image_crop_resolution", 560)
 
+    # Add image grid pinpoints to avoid buffer overflow
+    # This array defines normalized coordinates for grid sampling in the vision model
+    # Using standard grid points for 560x560 image with patch size 14
+    grid_size = 560 // 14  # Number of patches in each dimension
+    pinpoints = []
+    for y in range(grid_size):
+        for x in range(grid_size):
+            # Normalized coordinates from 0.0 to 1.0
+            # Convert to Python float instead of numpy.float32
+            pinpoints.append(float(x / (grid_size - 1)))
+            pinpoints.append(float(y / (grid_size - 1)))
+
+    # Add pinpoints as a float array
+    fout.add_array("clip.vision.image_grid_pinpoints", pinpoints)
+
+    # Load processor for image normalization values
     if MODEL_INPUT_DIR is not None:
-        processor: Qwen2_5_VLProcessor = Qwen2VLImageProcessor.from_pretrained(model_path)
+        processor = Qwen2VLImageProcessor.from_pretrained(model_path)
     else:
-        processor: Qwen2_5_VLProcessor = Qwen2_5_VLProcessor.from_pretrained(model_name)
-
-    fout.add_array("clip.vision.image_mean", processor.image_mean)
-    fout.add_array("clip.vision.image_std", processor.image_std)
+        processor = Qwen2_5_VLProcessor.from_pretrained(model_name)
+
+    # Get the image mean and std values and ensure they're in the right format
+    try:
+        # Try accessing through image_processor first (newer versions)
+        image_mean = processor.image_mean
+        image_std = processor.image_std
+    except AttributeError:
+        # Fallback to direct access (older versions)
+        image_mean = processor.image_mean
+        image_std = processor.image_std
+
+    # Convert numpy values to Python floats
+    image_mean = [float(x) for x in image_mean]
+    image_std = [float(x) for x in image_std]
+
+    # Add arrays with Python float values
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
 
     # Set the activation function flags based on the model config
-    if hasattr(vcfg, 'hidden_act') and 'silu' in vcfg.hidden_act.lower():
-        fout.add_bool("clip.use_silu", True)
-        fout.add_bool("clip.use_gelu", False)
-    else:
-        fout.add_bool("clip.use_silu", False)
-        fout.add_bool("clip.use_gelu", False)  # Use defaults from dump
+    fout.add_bool("clip.use_silu", True)  # Qwen2.5VL uses SiLU activation in MLP
+    fout.add_bool("clip.use_gelu", False)
 
     fout.write_header_to_file()
     fout.write_kv_data_to_file()
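For reference, a rough size check of the grid-pinpoints array written above (illustrative arithmetic only):

    grid_size = 560 // 14               # 40 patches per side
    num_points = grid_size * grid_size  # 40 * 40 = 1600 (x, y) grid positions
    assert num_points * 2 == 3200       # floats stored in clip.vision.image_grid_pinpoints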
