@@ -1,6 +1,7 @@
 import os
 import time
 import unittest
+import onnx
 import torch
 from onnx_diagnostic.ext_test_case import ExtTestCase, never_test, ignore_warnings
 from onnx_diagnostic.torch_export_patches import torch_export_patches
@@ -13,8 +14,10 @@
 class TestTryExportHuggingFaceHubModel(ExtTestCase):
     @never_test()
     @ignore_warnings(UserWarning)
-    def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
+    def test_qwen25_vli_visual(self):
         """
+        # task: imagetext2text
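+        # QWEN25ATTENTION=PACKED|BIGMASK|LOOPMHA forces one attention implementation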
         clear&&NEVERTEST=1 python _unittests/ut_tasks/try_export.py -k qwen_2_5
 
         possible prefix: ``TESTDEVICE=cuda TESTDTYPE=float16 EXPORTER=onnx-dynamo
@@ -44,7 +46,7 @@ def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
             TESTDEVICE=cuda \\
             TESTDTYPE=float16 \\
             EXPORTER=custom \\
-            python _unittests/ut_tasks/try_export.py -k qwen_2_5_vl_instruct_visual
+            python _unittests/ut_tasks/try_export.py -k qwen25_vli_visual
         """
         begin = time.perf_counter()
         device = os.environ.get("TESTDEVICE", "cpu")
@@ -113,10 +115,9 @@ def _config_reduction(config, task):
         )
         if not self.unit_test_going():
             print("-- save inputs")
-            torch.save(
-                big_inputs, self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.big.pt")
-            )
-            torch.save(inputs, self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.pt"))
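+            # dump both input sets so a failing run can be inspected offline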
+            torch.save(big_inputs, self.get_dump_file("qwen25_vli_visual.inputs.big.pt"))
+            torch.save(inputs, self.get_dump_file("qwen25_vli_visual.inputs.pt"))
 
         print(f"-- inputs: {self.string_type(inputs, with_shape=True)}")
         # this is too long
@@ -126,75 +126,119 @@ def _config_reduction(config, task):
         print(f"-- MODEL RUN IN {time.perf_counter() - begin}")
         print(f"-- expected: {self.string_type(expected, with_shape=True)}")
 
-        filename = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.onnx"
-        )
-        fileep = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.graph"
-        )
         dynamic_shapes = dict(
             hidden_states={0: "hidden_width", 1: "hidden_height"},
             grid_thw={},  # {0: "n_images"}, # TODO: fix
         )
 
-        # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
-        begin = time.perf_counter()
-        export_inputs = inputs
-        print()
-        with torch_export_patches(
-            patch_torch=False,
-            patch_sympy=False,
-            patch_transformers=True,
-            verbose=1,
-            stop_if_static=2,
-        ):
-            to_onnx(
-                model_to_export,
-                kwargs=export_inputs,
-                dynamic_shapes=dynamic_shapes,
-                filename=filename,
-                exporter=exporter,
-                verbose=1,
-                save_ep=None if self.unit_test_going() else (fileep, 2**35),
-                target_opset=22,
-                optimize=True,
-                onnx_plugs=PLUGS,
-            )
-
-        print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
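+        # Pick the attention implementations to exercise: QWEN25ATTENTION forces a
+        # single one; otherwise PACKED is only tried on cuda with float16/bfloat16
+        # (PackedMultiHeadAttention is a CUDA contrib op in onnxruntime).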
+        qwen25_attention = os.environ.get("QWEN25ATTENTION", "")
+        if qwen25_attention:
+            attention_options = [qwen25_attention]
+        elif device == "cuda" and dtype in ("float16", "bfloat16"):
+            attention_options = ["PACKED", "BIGMASK"]
+        else:
+            attention_options = ["LOOPMHA", "BIGMASK"]
 
-        pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
-        pt2_files = [f for f in pt2_files if os.path.exists(f)]
-        assert (
-            self.unit_test_going() or pt2_files
-        ), f"Unable to find an existing file among {pt2_files!r}"
-        pt2_file = (
-            (pt2_files[0] if pt2_files else None) if not self.unit_test_going() else None
-        )
-        # self.assertExists(pt2_file)
-        # ep = torch.export.load(pt2_file)
-        # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
-        # print("----------- diff", diff)
-        begin = time.perf_counter()
-        self.assert_onnx_disc(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}",
-            filename,
-            model_to_export,
-            export_inputs,
-            verbose=1,
-            providers=(
-                ["CUDAExecutionProvider", "CPUExecutionProvider"]
-                if device == "cuda"
-                else ["CPUExecutionProvider"]
-            ),
-            use_ort=True,
-            atol=0.02,
-            rtol=10,
-            ort_optimized_graph=False,
-            ep=pt2_file,
-            expected=expected,
-        )
-        print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")
+        # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
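+        # Export and verify once per attention implementation; subTest reports each
+        # variant separately when one of them fails.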
+        for attention in attention_options:
+            with self.subTest(attention=attention):
+                print()
+                print(f"-- attention={attention!r}")
+                os.environ["QWEN25ATTENTION"] = attention
+                filename = self.get_dump_file(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.onnx"
+                )
+                fileep = self.get_dump_file(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.graph"
+                )
+
+                begin = time.perf_counter()
+                export_inputs = inputs
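+                # torch_export_patches temporarily patches transformers so that
+                # torch.export can trace the model; the patches are undone on exit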
+                with torch_export_patches(
+                    patch_torch=False,
+                    patch_sympy=False,
+                    patch_transformers=True,
+                    verbose=1,
+                    stop_if_static=2,
+                ):
+                    to_onnx(
+                        model_to_export,
+                        kwargs=export_inputs,
+                        dynamic_shapes=dynamic_shapes,
+                        filename=filename,
+                        exporter=exporter,
+                        verbose=1,
+                        save_ep=None if self.unit_test_going() else (fileep, 2**35),
+                        target_opset=22,
+                        optimize=True,
+                        onnx_plugs=PLUGS,
+                    )
+
+                print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
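+                # check the exported graph matches the requested attention variant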
+                model = onnx.load(filename, load_external_data=False)
+                if attention == "PACKED":
+                    self.assertIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                elif attention == "BIGMASK":
+                    self.assertNotIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                elif attention == "LOOPMHA":
+                    self.assertNotIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                    self.assertIn("Loop", {n.op_type for n in model.graph.node})
+                else:
+                    raise AssertionError(f"attention={attention!r} not expected")
+
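+                # pick up whichever ExportedProgram (.pt2) file save_ep produced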
+                pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
+                pt2_files = [f for f in pt2_files if os.path.exists(f)]
+                assert (
+                    self.unit_test_going() or pt2_files
+                ), f"Unable to find an existing file among {pt2_files!r}"
+                pt2_file = (
+                    (pt2_files[0] if pt2_files else None)
+                    if not self.unit_test_going()
+                    else None
+                )
+                # self.assertExists(pt2_file)
+                # ep = torch.export.load(pt2_file)
+                # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
+                # print("----------- diff", diff)
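+                # measure the discrepancy between onnxruntime and the eager model;
+                # tolerances are loose (atol=0.02, rtol=10)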
+                begin = time.perf_counter()
+                self.assert_onnx_disc(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}",
+                    filename,
+                    model_to_export,
+                    export_inputs,
+                    verbose=1,
+                    providers=(
+                        ["CUDAExecutionProvider", "CPUExecutionProvider"]
+                        if device == "cuda"
+                        else ["CPUExecutionProvider"]
+                    ),
+                    use_ort=True,
+                    atol=0.02,
+                    rtol=10,
+                    ort_optimized_graph=False,
+                    ep=pt2_file,
+                    expected=expected,
+                )
+                print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")
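+        # restore the caller's QWEN25ATTENTION value once all variants have run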
+        os.environ["QWEN25ATTENTION"] = qwen25_attention
         if self.unit_test_going():
             self.clean_dump()
 