
Commit 513fc03

shorten names

1 parent cb9d51b commit 513fc03

File tree

5 files changed: +118 −90 lines

.github/workflows/models.yml

Lines changed: 1 addition & 1 deletion

@@ -61,5 +61,5 @@ jobs:
       run: python -m pip freeze

     - name: qwen2.5_vl_instruct
-      run: PYTHONPATH=. UNITTEST_GOING=1 NEVERTEST=1 QWEN25ATTENTION=BIGMASK TESTDTYPE=float16 TESTDEVICE=cpu python _unittests/ut_tasks/try_export.py -f -k test_imagetext2text_qwen_2_5_vl_instruct_visual
+      run: PYTHONPATH=. UNITTEST_GOING=1 NEVERTEST=1 QWEN25ATTENTION=BIGMASK TESTDTYPE=float16 TESTDEVICE=cpu python _unittests/ut_tasks/try_export.py -f -k test_qwen25_vli_visual
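Note: the env-var prefix on this run line is what parameterizes the test below. A minimal sketch of how such a prefix is read inside the test process (the "cpu" default for TESTDEVICE matches try_export.py below; the other defaults shown are assumptions for illustration):

    import os

    # Each variable is read from the environment with a fallback default.
    device = os.environ.get("TESTDEVICE", "cpu")             # "cpu" in this job
    dtype = os.environ.get("TESTDTYPE", "float32")           # "float16" in this job
    attention = os.environ.get("QWEN25ATTENTION", "PACKED")  # "BIGMASK" in this job
    print(device, dtype, attention)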

_doc/cmds/sbs.rst

Lines changed: 3 additions & 3 deletions

@@ -27,9 +27,9 @@ Example
 .. code-block::

     python -m onnx_diagnostic sbs \
-        -i qwen_2_5_vl_instruct_visual.inputs.pt \
-        --ep test_imagetext2text_qwen_2_5_vl_instruct_visual.cuda.float16.custom.graph.ep.pt2 \
-        -m test_imagetext2text_qwen_2_5_vl_instruct_visual.cuda.float16.custom.onnx \
+        -i qwen25_vli_visual.inputs.pt \
+        --ep test_qwen25_vli_visual.cuda.float16.custom.graph.ep.pt2 \
+        -m test_qwen25_vli_visual.cuda.float16.custom.onnx \
         -o results.dynamo.float16.xlsx \
         -v 1 --atol=0.1 --rtol=1 \
         --replay-names conv3d,rsqrt,to_4,mul_48,linear,linear_2,linear_84,linear_89,mul_172,linear_156,linear_159 \

_unittests/ut_tasks/try_export.py

Lines changed: 101 additions & 69 deletions

@@ -1,6 +1,7 @@
 import os
 import time
 import unittest
+import onnx
 import torch
 from onnx_diagnostic.ext_test_case import ExtTestCase, never_test, ignore_warnings
 from onnx_diagnostic.torch_export_patches import torch_export_patches
@@ -13,8 +14,9 @@
 class TestTryExportHuggingFaceHubModel(ExtTestCase):
     @never_test()
     @ignore_warnings(UserWarning)
-    def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
+    def test_qwen25_vli_visual(self):
         """
+        # task: imagetext2text
         clear&&NEVERTEST=1 python _unittests/ut_tasks/try_export.py -k qwen_2_5

         possible prefix: ``TESTDEVICE=cuda TESTDTYPE=float16 EXPORTER=onnx-dynamo
@@ -44,7 +46,7 @@ def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
         TESTDEVICE=cuda \\
         TESTDTYPE=float16 \\
         EXPORTER=custom \\
-        python _unittests/ut_tasks/try_export.py -k qwen_2_5_vl_instruct_visual
+        python _unittests/ut_tasks/try_export.py -k qwen25_vli_visual
         """
         begin = time.perf_counter()
         device = os.environ.get("TESTDEVICE", "cpu")
@@ -113,10 +115,8 @@ def _config_reduction(config, task):
         )
         if not self.unit_test_going():
             print("-- save inputs")
-            torch.save(
-                big_inputs, self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.big.pt")
-            )
-            torch.save(inputs, self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.pt"))
+            torch.save(big_inputs, self.get_dump_file("qwen25_vli_visual.inputs.big.pt"))
+            torch.save(inputs, self.get_dump_file("qwen25_vli_visual.inputs.pt"))

         print(f"-- inputs: {self.string_type(inputs, with_shape=True)}")
         # this is too long
@@ -126,75 +126,107 @@ def _config_reduction(config, task):
         print(f"-- MODEL RUN IN {time.perf_counter() - begin}")
         print(f"-- expected: {self.string_type(expected, with_shape=True)}")

-        filename = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.onnx"
-        )
-        fileep = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.graph"
-        )
         dynamic_shapes = dict(
             hidden_states={0: "hidden_width", 1: "hidden_height"},
             grid_thw={},  # {0: "n_images"},  # TODO: fix
         )

-        # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
-        begin = time.perf_counter()
-        export_inputs = inputs
-        print()
-        with torch_export_patches(
-            patch_torch=False,
-            patch_sympy=False,
-            patch_transformers=True,
-            verbose=1,
-            stop_if_static=2,
-        ):
-            to_onnx(
-                model_to_export,
-                kwargs=export_inputs,
-                dynamic_shapes=dynamic_shapes,
-                filename=filename,
-                exporter=exporter,
-                verbose=1,
-                save_ep=None if self.unit_test_going() else (fileep, 2**35),
-                target_opset=22,
-                optimize=True,
-                onnx_plugs=PLUGS,
-            )
-
-        print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
+        qwen25_attention = os.environ.get("QWEN25ATTENTION", "")
+        if qwen25_attention:
+            attention_options = [qwen25_attention]
+        elif device == "cuda" and dtype in ("float16", "bfloat16"):
+            attention_options = ["PACKED", "BIGMASK"]
+        else:
+            attention_options = ["LOOPMHA", "BIGMASK"]

-        pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
-        pt2_files = [f for f in pt2_files if os.path.exists(f)]
-        assert (
-            self.unit_test_going() or pt2_files
-        ), f"Unable to find an existing file among {pt2_files!r}"
-        pt2_file = (
-            (pt2_files[0] if pt2_files else None) if not self.unit_test_going() else None
-        )
-        # self.assertExists(pt2_file)
-        # ep = torch.export.load(pt2_file)
-        # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
-        # print("----------- diff", diff)
-        begin = time.perf_counter()
-        self.assert_onnx_disc(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}",
-            filename,
-            model_to_export,
-            export_inputs,
-            verbose=1,
-            providers=(
-                ["CUDAExecutionProvider", "CPUExecutionProvider"]
-                if device == "cuda"
-                else ["CPUExecutionProvider"]
-            ),
-            use_ort=True,
-            atol=0.02,
-            rtol=10,
-            ort_optimized_graph=False,
-            ep=pt2_file,
-            expected=expected,
-        )
-        print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")
+        # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
+        for attention in attention_options:
+            with self.subTest(attention=attention):
+                print()
+                print(f"-- attention={attention!r}")
+                os.environ["QWEN25ATTENTION"] = attention
+                filename = self.get_dump_file(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.onnx"
+                )
+                fileep = self.get_dump_file(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.graph"
+                )
+
+                begin = time.perf_counter()
+                export_inputs = inputs
+                with torch_export_patches(
+                    patch_torch=False,
+                    patch_sympy=False,
+                    patch_transformers=True,
+                    verbose=1,
+                    stop_if_static=2,
+                ):
+                    to_onnx(
+                        model_to_export,
+                        kwargs=export_inputs,
+                        dynamic_shapes=dynamic_shapes,
+                        filename=filename,
+                        exporter=exporter,
+                        verbose=1,
+                        save_ep=None if self.unit_test_going() else (fileep, 2**35),
+                        target_opset=22,
+                        optimize=True,
+                        onnx_plugs=PLUGS,
+                    )
+
+                print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
+                model = onnx.load(filename, load_external_data=False)
+                if attention == "PACKED":
+                    self.assertIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                elif attention == "BIGMASK":
+                    self.assertNotIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                elif attention == "LOOPMHA":
+                    self.assertNotIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                    self.assertIn("Loop", {n.op_type for n in model.graph.node})
+                else:
+                    raise AssertionError(f"attention={attention!r} not expected")
+
+                pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
+                pt2_files = [f for f in pt2_files if os.path.exists(f)]
+                assert (
+                    self.unit_test_going() or pt2_files
+                ), f"Unable to find an existing file among {pt2_files!r}"
+                pt2_file = (
+                    (pt2_files[0] if pt2_files else None)
+                    if not self.unit_test_going()
+                    else None
+                )
+                # self.assertExists(pt2_file)
+                # ep = torch.export.load(pt2_file)
+                # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
+                # print("----------- diff", diff)
+                begin = time.perf_counter()
+                self.assert_onnx_disc(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}",
+                    filename,
+                    model_to_export,
+                    export_inputs,
+                    verbose=1,
+                    providers=(
+                        ["CUDAExecutionProvider", "CPUExecutionProvider"]
+                        if device == "cuda"
+                        else ["CPUExecutionProvider"]
+                    ),
+                    use_ort=True,
+                    atol=0.02,
+                    rtol=10,
+                    ort_optimized_graph=False,
+                    ep=pt2_file,
+                    expected=expected,
+                )
+                print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")
+        os.environ["QWEN25ATTENTION"] = qwen25_attention
         if self.unit_test_going():
             self.clean_dump()
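The new per-strategy assertions work by collecting the set of operator types in the exported graph. A minimal, self-contained sketch of that check, demonstrated on a tiny hand-built graph rather than the real Qwen2.5-VL export:

    import onnx
    import onnx.helper as oh

    # Tiny stand-in for an exported model: MatMul followed by Softmax.
    X = oh.make_tensor_value_info("X", onnx.TensorProto.FLOAT, [2, 2])
    Y = oh.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [2, 2])
    graph = oh.make_graph(
        [
            oh.make_node("MatMul", ["X", "X"], ["T"]),
            oh.make_node("Softmax", ["T"], ["Y"]),
        ],
        "demo",
        [X],
        [Y],
    )
    model = oh.make_model(graph)

    # Same membership test the unit test applies per attention strategy.
    ops = {n.op_type for n in model.graph.node}
    assert "Softmax" in ops
    assert "PackedMultiHeadAttention" not in ops  # what BIGMASK/LOOPMHA expect

Loading with load_external_data=False, as the test does, keeps this check cheap: only the graph structure is inspected, not the weights.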

_unittests/ut_tasks/try_tasks.py

Lines changed: 2 additions & 2 deletions

@@ -1009,7 +1009,7 @@ def test_imagetext2text_generation_gemma3_4b_it(self):

     @never_test()
     @ignore_warnings(UserWarning)
-    def test_imagetext2text_qwen_2_5_vl_instruct(self):
+    def test_qwen25_vli(self):
         """
         clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k qwen_2_5

@@ -1106,7 +1106,7 @@ def config_reduction(config, task):
             ),
             steal_forward(
                 [model, model.visual, model.visual.blocks[0].attn],
-                dump_file=self.get_dump_file("test_imagetext2text_qwen_2_5_vl_instruct.onnx"),
+                dump_file=self.get_dump_file("test_qwen25_vli.onnx"),
                 dump_drop={"attention_mask", "past_key_values", "pixel_values"},
                 save_as_external_data=False,
                 with_shapes=True,
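steal_forward records the inputs and outputs of the listed submodules while the model runs. A rough sketch of the general technique with plain PyTorch forward hooks (not onnx_diagnostic's actual implementation):

    import torch

    records = []

    def hook(module, args, output):
        # Record the module class plus the tensors flowing through it.
        records.append((type(module).__name__, args, output))

    model = torch.nn.Linear(4, 2)
    handle = model.register_forward_hook(hook)
    model(torch.randn(3, 4))
    handle.remove()

    print(records[0][0])  # "Linear"
    # torch.save(records, "dump.pt") would persist them, similar in spirit
    # to steal_forward's dump_file argument.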

onnx_diagnostic/torch_export_patches/patches/_patch_transformers_qwen2_5.py

Lines changed: 11 additions & 15 deletions

@@ -16,7 +16,6 @@
 patch_qwen2_5 = False

 PLUGS = []
-strategy_for_attention_in_qwen_2_5 = os.environ.get("QWEN25ATTENTION", "PACKED")

 if patch_qwen2_5:
     import onnxscript
@@ -445,6 +444,9 @@ class patched_Qwen2_5_VLVisionAttention:
     _PATCHED_CLASS_ = (
         transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLVisionAttention
     )
+    STRATEGY_FOR_ATTENTION = lambda: os.environ.get(  # noqa: E731
+        "QWEN25ATTENTION", "PACKED"
+    )

     def forward(
         self,
@@ -488,11 +490,13 @@ def forward(
             self.config._attn_implementation
         ]

-        if (
+        is_sdpa = (
             attention_interface
             is transformers.integrations.sdpa_attention.sdpa_attention_forward
             or attention_interface is patched_sdpa_attention_forward
-        ) and strategy_for_attention_in_qwen_2_5 == "PACKED":
+        )
+        attention_strategy = patched_Qwen2_5_VLVisionAttention.STRATEGY_FOR_ATTENTION()
+        if is_sdpa and attention_strategy == "PACKED":
             attn_output = qwen_sdpa_attention_versatile(
                 query_states,
                 key_states,
@@ -525,11 +529,7 @@ def forward(
                 ),
                 version=1,
             )
-        elif (
-            attention_interface
-            is transformers.integrations.sdpa_attention.sdpa_attention_forward
-            or attention_interface is patched_sdpa_attention_forward
-        ) and strategy_for_attention_in_qwen_2_5 == "LOOPMHA":
+        elif is_sdpa and attention_strategy == "LOOPMHA":

            def _iteration(start_end, query_states, key_states, value_states):
                return patched_Qwen2_5_VLVisionAttentionOneIteration.forward(
@@ -561,11 +561,7 @@ def _iteration(start_end, query_states, key_states, value_states):
            # starts_ends, query_states, key_states, value_states), tuple(),
            # )
            attn_output = torch.cat(attn_outputs, dim=1)
-        elif (
-            attention_interface
-            is transformers.integrations.sdpa_attention.sdpa_attention_forward
-            or attention_interface is patched_sdpa_attention_forward
-        ) and strategy_for_attention_in_qwen_2_5 == "BIGMASK":
+        elif is_sdpa and attention_strategy == "BIGMASK":
            # make square mask
            indices = torch.arange(
                cu_seqlens.max(), dtype=cu_seqlens.dtype, device=cu_seqlens.device
@@ -594,8 +590,8 @@ def _iteration(start_end, query_states, key_states, value_states):
            )
        else:
            raise NotImplementedError(
-                f"Not export strategy for strategy_for_attention_in_qwen_2_5="
-                f"{strategy_for_attention_in_qwen_2_5!r}, "
+                f"No corresponding export strategy for "
+                f"{attention_strategy!r}, "
                f"(use QWEN25ATTENTION to change it), and attention_interface="
                f"{attention_interface!r} (use sdpa)"
            )
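The point of replacing the module-level global with the STRATEGY_FOR_ATTENTION lambda: the global was evaluated once at import time, so the per-strategy environment changes made by the new test loop were invisible to the patch. A minimal sketch of the difference (class and variable names here are illustrative):

    import os

    # Import-time binding (the removed global): frozen when the module loads.
    FROZEN = os.environ.get("QWEN25ATTENTION", "PACKED")

    class Patched:
        # Call-time binding (the new class attribute): re-read on every call.
        STRATEGY_FOR_ATTENTION = lambda: os.environ.get(  # noqa: E731
            "QWEN25ATTENTION", "PACKED"
        )

    os.environ["QWEN25ATTENTION"] = "BIGMASK"
    print(FROZEN)                            # still the import-time value
    print(Patched.STRATEGY_FOR_ATTENTION())  # "BIGMASK"

Note the lambda is called through the class, exactly as forward does, so no self argument is bound.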
