 import numpy as np
 from gguf import *
 from transformers import (
+    AutoProcessor,
     Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
     Qwen2VLProcessor,
-    AutoProcessor,
-    Qwen2VLConfig
+    Qwen2VLConfig,
+    Qwen2_5_VLConfig,
 )
 
 
@@ -18,62 +20,80 @@
 def k(raw_key: str, arch: str) -> str:
     return raw_key.format(arch=arch)
 
+class VL2:
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
+        # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[to_gguf_name] {og} --> {name}")
+        return name
+
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
+        vision_model = qwen2vl.visual
+        tensor_map = {}
+        for name, ten in vision_model.state_dict().items():
+            ten = ten.numpy()
+            if 'qkv' in name:
+                if ten.ndim == 2:  # weight
+                    c3, _ = ten.shape
+                else:  # bias
+                    c3 = ten.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = ten[:c]
+                wk = ten[c: c * 2]
+                wv = ten[c * 2:]
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+            elif 'merger' in name:
+                if name.endswith("ln_q.weight"):
+                    tensor_map['v.post_ln.weight'] = ten
+                elif name.endswith("ln_q.bias"):
+                    tensor_map['v.post_ln.bias'] = ten
+                else:
+                    # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
+                    tensor_map[cls.to_gguf_name(name)] = ten
+            elif 'patch_embed.proj.weight' in name:
+                # NOTE: split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = ten.shape
+                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+                tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+                tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+            else:
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
 
-def to_gguf_name(name: str) -> str:
-    og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
-    name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
-    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-    name = name.replace("merger.mlp", 'mm')
-    print(f"[to_gguf_name] {og} --> {name}")
-    return name
-
-
-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
-    vision_model = qwen2vl.visual
-    tensor_map = {}
-    for name, ten in vision_model.state_dict().items():
-        ten = ten.numpy()
-        if 'qkv' in name:
-            if ten.ndim == 2:  # weight
-                c3, _ = ten.shape
-            else:  # bias
-                c3 = ten.shape[0]
-            assert c3 % 3 == 0
-            c = c3 // 3
-            wq = ten[:c]
-            wk = ten[c: c * 2]
-            wv = ten[c * 2:]
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-        elif 'merger' in name:
-            if name.endswith("ln_q.weight"):
-                tensor_map['v.post_ln.weight'] = ten
-            elif name.endswith("ln_q.bias"):
-                tensor_map['v.post_ln.bias'] = ten
+        for new_name, ten in tensor_map.items():
+            if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+                tensor_map[new_name] = ten.astype(np.float32)
             else:
-                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                tensor_map[to_gguf_name(name)] = ten
-        elif 'patch_embed.proj.weight' in name:
-            # NOTE: split Conv3D into Conv2Ds
-            c1, c2, kt, kh, kw = ten.shape
-            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
-        else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
-
-    for new_name, ten in tensor_map.items():
-        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
-            tensor_map[new_name] = ten.astype(np.float32)
-        else:
-            tensor_map[new_name] = ten.astype(dtype)
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
-    return tensor_map
+                tensor_map[new_name] = ten.astype(dtype)
+        tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
+        return tensor_map
+
+
+class VL25(VL2):
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name
 
 
 def main(args):
@@ -92,11 +112,18 @@ def main(args):
     model_path = ""
     model_name = args.model_name
     print("model_name: ", model_name)
-    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
-        model_name, torch_dtype=dtype, device_map="cpu"
-    )
-    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
-    vcfg = cfg.vision_config
+    if args.model_type == "qwen2vl":
+        qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
 
     if os.path.isdir(model_name):
         local_model = True
@@ -125,14 +152,26 @@ def main(args):
     else:
         raise ValueError()
 
-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    if args.model_type == "qwen2.5vl":
+        fout.add_bool("clip.use_glu_mlp", True)  # gate linear unit MLP layer in vision model
+        fout.add_bool("clip.use_rms_norm", True)
+        fout.add_array("clip.vision.fullatt_block_indexes", vcfg.fullatt_block_indexes)
+        fout.add_uint32("clip.vision.window_size", vcfg.window_size)
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+    else:
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)
 
     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
@@ -160,6 +199,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
     parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
     main(args)
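
Note on the tensor surgery above: both VL2.find_vision_tensors and the inherited VL25 path assume the fused qkv projection is a plain row-wise stack of the q, k and v weights, which is why a simple three-way slice along the first axis is enough. A minimal numpy sketch of that assumption, using a toy hidden size rather than real model weights:

import numpy as np

# Toy stand-in for a fused qkv weight; hidden size 8 is arbitrary, real
# Qwen2-VL vision blocks are much larger.
hidden = 8
qkv_w = np.arange(3 * hidden * hidden, dtype=np.float32).reshape(3 * hidden, hidden)

c3 = qkv_w.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq, wk, wv = qkv_w[:c], qkv_w[c:c * 2], qkv_w[c * 2:]

# Re-concatenating the slices reproduces the fused tensor exactly,
# i.e. the split is a pure row-wise partition with no reordering.
assert np.array_equal(np.concatenate([wq, wk, wv], axis=0), qkv_w)
print(wq.shape, wk.shape, wv.shape)  # (8, 8) (8, 8) (8, 8)

Assuming this file is the usual qwen2_vl_surgery.py entry point, a Qwen2.5-VL checkpoint would then be converted with something like "python qwen2_vl_surgery.py Qwen/Qwen2.5-VL-3B-Instruct --model_type qwen2.5vl --data_type fp16" (script name and model id are illustrative, not part of this diff); leaving --model_type at its default keeps the original Qwen2-VL behaviour.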