Commit d5798cc

committed: ut
1 parent 2a3a2d4 commit d5798cc

File tree

3 files changed: +54 -5 lines changed

.github/workflows/models.yml
_unittests/ut_torch_export_patches/test_patch_transformers.py
onnx_diagnostic/export/onnx_plug.py

.github/workflows/models.yml

Lines changed: 1 addition & 1 deletion
@@ -61,5 +61,5 @@ jobs:
         run: python -m pip freeze

       - name: qwen2.5_vl_instruct
-        run: PYTHONPATH=. UNITTEST_GOING=1 NEVERTEST=1 QWEN25ATTENTION=BIGMASK TESTDTYPE=float16 TESTDEVICE=cpu python _unittests/ut_tasks/try_export.py -f -k test_qwen_2_5_vli_visual
+        run: PYTHONPATH=. UNITTEST_GOING=1 NEVERTEST=1 QWEN25ATTENTION=BIGMASK TESTDTYPE=float16 TESTDEVICE=cpu python _unittests/ut_tasks/try_export.py -f -k test_qwen25_vli_visual
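
For reference, a rough local equivalent of this renamed CI step (a sketch only; it assumes nothing beyond the command shown in the workflow and may need adjusting to the local environment):

# Sketch: reproduce the qwen2.5_vl_instruct CI step from the repository root.
import os
import subprocess

env = dict(
    os.environ,
    PYTHONPATH=".",
    UNITTEST_GOING="1",
    NEVERTEST="1",
    QWEN25ATTENTION="BIGMASK",
    TESTDTYPE="float16",
    TESTDEVICE="cpu",
)
subprocess.run(
    ["python", "_unittests/ut_tasks/try_export.py", "-f", "-k", "test_qwen25_vli_visual"],
    env=env,
    check=True,
)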

_unittests/ut_torch_export_patches/test_patch_transformers.py

Lines changed: 48 additions & 0 deletions
@@ -6,6 +6,7 @@
 import onnx_diagnostic.torch_export_patches.patches.patch_transformers as patch_transformers
 from onnx_diagnostic.ext_test_case import (
     ExtTestCase,
+    requires_cuda,
     requires_transformers,
     requires_torch,
     ignore_warnings,
@@ -518,6 +519,53 @@ def test_qwen2_5_vl_vision_attention_iteration(self):
         )
         self.clean_dump()

+    @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
+    @requires_cuda()
+    def test_plug_packed_multi_head_attention_qwen25(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
+            qwen_sdpa_attention_versatile,
+        )
+
+        inputs = (
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
+            torch.tensor(
+                [
+                    0,
+                    64,
+                    128,
+                    192,
+                    256,
+                    304,
+                    368,
+                    432,
+                    496,
+                    560,
+                    608,
+                    672,
+                    736,
+                    800,
+                    864,
+                    912,
+                    976,
+                    1040,
+                    1104,
+                    1168,
+                    1216,
+                    1232,
+                    1248,
+                    1264,
+                    1280,
+                    1292,
+                ],
+                dtype=torch.int64,
+            ).to("cuda"),
+        )
+        qwen_sdpa_attention_versatile.verify(
+            *inputs, scaling=0.11180339887498948, num_heads=16
+        )
+


 if __name__ == "__main__":
     unittest.main(verbosity=2)
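
A side note on the constants used by the new test (an observation, not part of the commit): the query/key/value tensors have shape (1, 16, 1292, 80), i.e. 16 heads of dimension 80 over 1292 packed tokens, the last tensor looks like cumulative sequence boundaries for the packed batch, and the scaling literal matches the usual SDPA scale 1/sqrt(head_dim):

# Sketch: where 0.11180339887498948 comes from, assuming the standard SDPA convention.
import math

head_dim = 80                        # last dimension of the (1, 16, 1292, 80) tensors
scaling = 1.0 / math.sqrt(head_dim)
print(scaling)                       # 0.11180339887498948, the value passed to verify()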

onnx_diagnostic/export/onnx_plug.py

Lines changed: 5 additions & 4 deletions
@@ -238,20 +238,21 @@ def _register(self):
         custom_def.register_kernel(None)(self.eager_fn)
         custom_def._abstract_fn = self.shape_fn

-    def verify(self, *args, engine: Optional[Callable] = None) -> VerifyResult:
+    def verify(self, *args, engine: Optional[Callable] = None, **kwargs) -> VerifyResult:
         """
         Verifies that the eager mode is equivalent to the onnx function given
         as a replacements. This function evaluates `eager_fn`, checks that the shapes
         are equivalent to the ones given by `shape_fn`, and finally evaluates the
         onnx translation if the previous did not fail.

         :param args: function inputs
+        :param kwargs: arguments for eager_fn
         :param engine: by default an instance of
             :class:`onnx_diagnostic.reference.OnnxruntimeEvaluator`.
         :return: outputs of :func:`onnx_diagnostic.helpers.max_diff`
         """
-        expected = self.eager_fn(*args)
-        shapes = self.shape_fn(*args)
+        expected = self.eager_fn(*args, **kwargs)
+        shapes = self.shape_fn(*args, **kwargs)
         if isinstance(expected, torch.Tensor):
             expected = (expected,)
         assert isinstance(shapes, torch.Tensor), (
@@ -279,7 +280,7 @@ def verify(self, *args, engine: Optional[Callable] = None) -> VerifyResult:

         # Now the ONNX execution.
         assert engine is None, f"Not implemented yet with engine={engine!r}"
-        sess = OnnxruntimeEvaluator(self.function_proto)
+        sess = OnnxruntimeEvaluator(self.function_proto, whole=True)
         feeds = dict(zip(sess.input_names, args))
         got = sess.run(None, feeds)
         diffs = tuple(max_diff(e, g) for e, g in zip(expected, got))
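
To summarize the behavioural change (a simplified sketch using only the names visible in the diff, not the real implementation): keyword arguments passed to verify are now forwarded to eager_fn and shape_fn, while only the positional tensors are bound to the ONNX function inputs.

# Simplified sketch of the forwarding logic after this commit.
def verify(self, *args, engine=None, **kwargs):
    expected = self.eager_fn(*args, **kwargs)    # kwargs now reach the eager function
    shapes = self.shape_fn(*args, **kwargs)      # ... and the shape function
    sess = OnnxruntimeEvaluator(self.function_proto, whole=True)
    feeds = dict(zip(sess.input_names, args))    # only positional args feed the ONNX graph
    got = sess.run(None, feeds)
    return tuple(max_diff(e, g) for e, g in zip(expected, got))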
