@@ -1,6 +1,7 @@
 import os
 import time
 import unittest
+import onnx
 import torch
 from onnx_diagnostic.ext_test_case import ExtTestCase, never_test, ignore_warnings
 from onnx_diagnostic.torch_export_patches import torch_export_patches
@@ -13,8 +14,10 @@
 class TestTryExportHuggingFaceHubModel(ExtTestCase):
     @never_test()
     @ignore_warnings(UserWarning)
-    def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
+    def test_qwen25_vli_visual(self):
         """
+        # task: imagetext2text
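+        # QWEN25ATTENTION=PACKED|BIGMASK|LOOPMHA forces one attention implementation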
         clear&&NEVERTEST=1 python _unittests/ut_tasks/try_export.py -k qwen_2_5
 
         possible prefix: ``TESTDEVICE=cuda TESTDTYPE=float16 EXPORTER=onnx-dynamo
@@ -44,7 +46,7 @@ def test_imagetext2text_qwen_2_5_vl_instruct_visual(self):
             TESTDEVICE=cuda \\
             TESTDTYPE=float16 \\
             EXPORTER=custom \\
-            python _unittests/ut_tasks/try_export.py -k qwen_2_5_vl_instruct_visual
+            python _unittests/ut_tasks/try_export.py -k qwen25_vli_visual
         """
         begin = time.perf_counter()
         device = os.environ.get("TESTDEVICE", "cpu")
@@ -113,10 +115,9 @@ def _config_reduction(config, task):
         )
         if not self.unit_test_going():
             print("-- save inputs")
-            torch.save(
-                big_inputs, self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.big.pt")
-            )
-            torch.save(inputs, self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.pt"))
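+            # dump both input sets so a failing run can be inspected offline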
+            torch.save(big_inputs, self.get_dump_file("qwen25_vli_visual.inputs.big.pt"))
+            torch.save(inputs, self.get_dump_file("qwen25_vli_visual.inputs.pt"))
 
         print(f"-- inputs: {self.string_type(inputs, with_shape=True)}")
         # this is too long
@@ -126,75 +126,119 @@ def _config_reduction(config, task):
         print(f"-- MODEL RUN IN {time.perf_counter() - begin}")
         print(f"-- expected: {self.string_type(expected, with_shape=True)}")
 
-        filename = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.onnx"
-        )
-        fileep = self.get_dump_file(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}.graph"
-        )
         dynamic_shapes = dict(
             hidden_states={0: "hidden_width", 1: "hidden_height"},
             grid_thw={},  # {0: "n_images"}, # TODO: fix
         )
 
-        # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
-        begin = time.perf_counter()
-        export_inputs = inputs
-        print()
-        with torch_export_patches(
-            patch_torch=False,
-            patch_sympy=False,
-            patch_transformers=True,
-            verbose=1,
-            stop_if_static=2,
-        ):
-            to_onnx(
-                model_to_export,
-                kwargs=export_inputs,
-                dynamic_shapes=dynamic_shapes,
-                filename=filename,
-                exporter=exporter,
-                verbose=1,
-                save_ep=None if self.unit_test_going() else (fileep, 2**35),
-                target_opset=22,
-                optimize=True,
-                onnx_plugs=PLUGS,
-            )
-
-        print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
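+        # Pick the attention implementations to exercise: QWEN25ATTENTION forces a
+        # single one; otherwise PACKED is only tried on cuda with float16/bfloat16
+        # (PackedMultiHeadAttention is a CUDA contrib op in onnxruntime).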
+        qwen25_attention = os.environ.get("QWEN25ATTENTION", "")
+        if qwen25_attention:
+            attention_options = [qwen25_attention]
+        elif device == "cuda" and dtype in ("float16", "bfloat16"):
+            attention_options = ["PACKED", "BIGMASK"]
+        else:
+            attention_options = ["LOOPMHA", "BIGMASK"]
 
-        pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
-        pt2_files = [f for f in pt2_files if os.path.exists(f)]
-        assert (
-            self.unit_test_going() or pt2_files
-        ), f"Unable to find an existing file among {pt2_files!r}"
-        pt2_file = (
-            (pt2_files[0] if pt2_files else None) if not self.unit_test_going() else None
-        )
-        # self.assertExists(pt2_file)
-        # ep = torch.export.load(pt2_file)
-        # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
-        # print("----------- diff", diff)
-        begin = time.perf_counter()
-        self.assert_onnx_disc(
-            f"test_imagetext2text_qwen_2_5_vl_instruct_visual.{device}.{dtype}.{exporter}",
-            filename,
-            model_to_export,
-            export_inputs,
-            verbose=1,
-            providers=(
-                ["CUDAExecutionProvider", "CPUExecutionProvider"]
-                if device == "cuda"
-                else ["CPUExecutionProvider"]
-            ),
-            use_ort=True,
-            atol=0.02,
-            rtol=10,
-            ort_optimized_graph=False,
-            ep=pt2_file,
-            expected=expected,
-        )
-        print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")
+        # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
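+        # Export and verify once per attention implementation; subTest reports each
+        # variant separately when one of them fails.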
+        for attention in attention_options:
+            with self.subTest(attention=attention):
+                print()
+                print(f"-- attention={attention!r}")
+                os.environ["QWEN25ATTENTION"] = attention
+                filename = self.get_dump_file(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.onnx"
+                )
+                fileep = self.get_dump_file(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}.graph"
+                )
+
+                begin = time.perf_counter()
+                export_inputs = inputs
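+                # torch_export_patches temporarily patches transformers so that
+                # torch.export can trace the model; the patches are undone on exit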
+                with torch_export_patches(
+                    patch_torch=False,
+                    patch_sympy=False,
+                    patch_transformers=True,
+                    verbose=1,
+                    stop_if_static=2,
+                ):
+                    to_onnx(
+                        model_to_export,
+                        kwargs=export_inputs,
+                        dynamic_shapes=dynamic_shapes,
+                        filename=filename,
+                        exporter=exporter,
+                        verbose=1,
+                        save_ep=None if self.unit_test_going() else (fileep, 2**35),
+                        target_opset=22,
+                        optimize=True,
+                        onnx_plugs=PLUGS,
+                    )
+
+                print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
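+                # check the exported graph matches the requested attention variant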
+                model = onnx.load(filename, load_external_data=False)
+                if attention == "PACKED":
+                    self.assertIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                elif attention == "BIGMASK":
+                    self.assertNotIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                elif attention == "LOOPMHA":
+                    self.assertNotIn(
+                        "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
+                    )
+                    self.assertIn("Loop", {n.op_type for n in model.graph.node})
+                else:
+                    raise AssertionError(f"attention={attention!r} not expected")
+
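+                # pick up whichever ExportedProgram (.pt2) file save_ep produced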
+                pt2_files = [f"{fileep}.backup.pt2", f"{fileep}.ep.pt2", f"{fileep}.pt2"]
+                pt2_files = [f for f in pt2_files if os.path.exists(f)]
+                assert (
+                    self.unit_test_going() or pt2_files
+                ), f"Unable to find an existing file among {pt2_files!r}"
+                pt2_file = (
+                    (pt2_files[0] if pt2_files else None)
+                    if not self.unit_test_going()
+                    else None
+                )
+                # self.assertExists(pt2_file)
+                # ep = torch.export.load(pt2_file)
+                # diff = self.max_diff(ep.module()(**export_inputs), model.visual(**export_inputs))
+                # print("----------- diff", diff)
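+                # measure the discrepancy between onnxruntime and the eager model;
+                # tolerances are loose (atol=0.02, rtol=10)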
+                begin = time.perf_counter()
+                self.assert_onnx_disc(
+                    f"test_qwen25_vli_visual.{device}.{dtype}.{attention}.{exporter}",
+                    filename,
+                    model_to_export,
+                    export_inputs,
+                    verbose=1,
+                    providers=(
+                        ["CUDAExecutionProvider", "CPUExecutionProvider"]
+                        if device == "cuda"
+                        else ["CPUExecutionProvider"]
+                    ),
+                    use_ort=True,
+                    atol=0.02,
+                    rtol=10,
+                    ort_optimized_graph=False,
+                    ep=pt2_file,
+                    expected=expected,
+                )
+                print(f"-- MODEL VERIFIED IN {time.perf_counter() - begin}")
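+        # restore the caller's QWEN25ATTENTION value once all variants have run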
+        os.environ["QWEN25ATTENTION"] = qwen25_attention
         if self.unit_test_going():
             self.clean_dump()
 