Commit cd01099

add attention 24
1 parent 03a2587 commit cd01099

3 files changed: +210 −97 lines

3 files changed

+210
-97
lines changed

_unittests/ut_tasks/try_export.py

Lines changed: 12 additions & 11 deletions
@@ -148,7 +148,7 @@ def _config_reduction(config, task):
         elif device == "cuda" and dtype in ("float16", "bfloat16"):
             attention_options = ["PACKED", "BIGMASK"]
         else:
-            attention_options = ["LOOPMHA", "BIGMASK"]
+            attention_options = ["LOOPMHA", "LOOPA24", "BIGMASK"]
 
         # fake_inputs = make_fake_with_dynamic_dimensions(inputs, dynamic_shapes)[0]
         for attention in attention_options:
@@ -180,7 +180,7 @@ def _config_reduction(config, task):
                 exporter=exporter,
                 verbose=1,
                 save_ep=None if self.unit_test_going() else (fileep, 2**35),
-                target_opset=22,
+                target_opset=24 if attention == "LOOPMHA" else 22,
                 optimize=True,
                 onnx_plugs=PLUGS,
             )
@@ -207,17 +207,18 @@ def _config_reduction(config, task):
             print(f"-- MODEL CONVERTED IN {time.perf_counter() - begin}")
             model = onnx.load(filename, load_external_data=False)
             if attention == "PACKED":
-                self.assertIn(
-                    "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
-                )
+                self.assertIn("PackedMultiHeadAttention", str(model))
             elif attention == "BIGMASK":
-                self.assertNotIn(
-                    "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
-                )
+                self.assertNotIn("PackedMultiHeadAttention", str(model))
+                self.assertNotIn("MultiHeadAttention", str(model))
+                self.assertNotIn("Loop", {n.op_type for n in model.graph.node})
             elif attention == "LOOPMHA":
-                self.assertNotIn(
-                    "PackedMultiHeadAttention", {n.op_type for n in model.graph.node}
-                )
+                self.assertNotIn("PackedMultiHeadAttention", str(model))
+                self.assertIn("MultiHeadAttention", str(model))
+                self.assertIn("Loop", {n.op_type for n in model.graph.node})
+            elif attention == "LOOPA24":
+                self.assertNotIn("PackedMultiHeadAttention", str(model))
+                self.assertNotIn("MultiHeadAttention", str(model))
                 self.assertIn("Loop", {n.op_type for n in model.graph.node})
             else:
                 raise AssertionError(f"attention={attention!r} not expected")

_unittests/ut_torch_export_patches/test_patch_transformers.py

Lines changed: 128 additions & 67 deletions
@@ -519,9 +519,43 @@ def test_qwen2_5_vl_vision_attention_iteration(self):
         )
         self.clean_dump()
 
+    @classmethod
+    def _get_seqlen(cls) -> torch.Tensor:
+        return torch.tensor(
+            [
+                0,
+                64,
+                128,
+                192,
+                256,
+                304,
+                368,
+                432,
+                496,
+                560,
+                608,
+                672,
+                736,
+                800,
+                864,
+                912,
+                976,
+                1040,
+                1104,
+                1168,
+                1216,
+                1232,
+                1248,
+                1264,
+                1280,
+                1292,
+            ],
+            dtype=torch.int64,
+        )
+
     @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
     @requires_cuda()
-    def test_plug_packed_multi_head_attention_qwen25_packed(self):
+    def test_plug_packed_multi_head_attention_qwen25_packed_float16(self):
         from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
             qwen_sdpa_attention_packed_versatile,
         )
@@ -530,37 +564,7 @@ def test_plug_packed_multi_head_attention_qwen25_packed(self):
             torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
             torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
             torch.rand((1, 16, 1292, 80), dtype=torch.float16).to("cuda"),
-            torch.tensor(
-                [
-                    0,
-                    64,
-                    128,
-                    192,
-                    256,
-                    304,
-                    368,
-                    432,
-                    496,
-                    560,
-                    608,
-                    672,
-                    736,
-                    800,
-                    864,
-                    912,
-                    976,
-                    1040,
-                    1104,
-                    1168,
-                    1216,
-                    1232,
-                    1248,
-                    1264,
-                    1280,
-                    1292,
-                ],
-                dtype=torch.int64,
-            ).to("cuda"),
+            self._get_seqlen().to("cuda"),
         )
 
         results = qwen_sdpa_attention_packed_versatile.verify(
@@ -580,7 +584,7 @@ def test_plug_packed_multi_head_attention_qwen25_packed(self):
         self.assertLess(results.diffs[0]["abs"], 0.01)
 
     @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
-    def test_plug_packed_multi_head_attention_qwen25_loopmha(self):
+    def test_plug_packed_multi_head_attention_qwen25_loopmha_float16(self):
         from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
             qwen_sdpa_attention_loopmha_versatile,
         )
@@ -589,46 +593,15 @@ def test_plug_packed_multi_head_attention_qwen25_loopmha(self):
             torch.rand((1, 16, 1292, 80), dtype=torch.float16),
             torch.rand((1, 16, 1292, 80), dtype=torch.float16),
             torch.rand((1, 16, 1292, 80), dtype=torch.float16),
-            torch.tensor(
-                [
-                    0,
-                    64,
-                    128,
-                    192,
-                    256,
-                    304,
-                    368,
-                    432,
-                    496,
-                    560,
-                    608,
-                    672,
-                    736,
-                    800,
-                    864,
-                    912,
-                    976,
-                    1040,
-                    1104,
-                    1168,
-                    1216,
-                    1232,
-                    1248,
-                    1264,
-                    1280,
-                    1292,
-                ],
-                dtype=torch.int64,
-            ),
+            self._get_seqlen(),
         )
 
         results = qwen_sdpa_attention_loopmha_versatile.verify(
             *inputs,
             scaling=0.5,
             num_heads=16,
-            itype=onnx.TensorProto.FLOAT16,
             dump_onnx_model=self.get_dump_file(
-                "test_plug_packed_multi_head_attention_qwen25_loopmha.onnx"
+                "test_plug_packed_multi_head_attention_qwen25_loopmha_float16.onnx"
             ),
         )
         self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
@@ -637,13 +610,101 @@ def test_plug_packed_multi_head_attention_qwen25_loopmha(self):
         self.assertLess(results.diffs[0]["abs"], 0.01)
 
         results = qwen_sdpa_attention_loopmha_versatile.verify(
-            *inputs, scaling=0.11180339887498948, num_heads=16, itype=onnx.TensorProto.FLOAT16
+            *inputs, scaling=0.11180339887498948, num_heads=16
         )
         self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
         self.assertEqual(len(results.eager_outputs), len(results.diffs))
         self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=0.01)
         self.assertLess(results.diffs[0]["abs"], 0.01)
 
+    @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
+    def test_plug_packed_multi_head_attention_qwen25_loopmha_float32(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
+            qwen_sdpa_attention_loopmha_versatile,
+        )
+
+        inputs = (
+            torch.rand((1, 16, 1292, 80), dtype=torch.float32),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float32),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float32),
+            self._get_seqlen(),
+        )
+
+        results = qwen_sdpa_attention_loopmha_versatile.verify(
+            *inputs,
+            scaling=0.5,
+            num_heads=16,
+            dump_onnx_model=self.get_dump_file(
+                "test_plug_packed_multi_head_attention_qwen25_loopmha_float16.onnx"
+            ),
+        )
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
+        self.assertLess(results.diffs[0]["abs"], 1e-5)
+
+        results = qwen_sdpa_attention_loopmha_versatile.verify(
+            *inputs, scaling=0.11180339887498948, num_heads=16
+        )
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
+        self.assertLess(results.diffs[0]["abs"], 1e-5)
+
+    @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
+    def test_plug_packed_multi_head_attention_qwen25_loopa24_float16(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
+            qwen_sdpa_attention_loopa24_versatile,
+        )
+
+        inputs = (
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float16),
+            self._get_seqlen(),
+        )
+
+        results = qwen_sdpa_attention_loopa24_versatile.verify(*inputs, scaling=0.5)
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
+        self.assertLess(results.diffs[0]["abs"], 1e-5)
+
+        results = qwen_sdpa_attention_loopa24_versatile.verify(
+            *inputs, scaling=0.11180339887498948
+        )
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
+        self.assertLess(results.diffs[0]["abs"], 1e-5)
+
+    @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
+    def test_plug_packed_multi_head_attention_qwen25_loopa24_float32(self):
+        from onnx_diagnostic.torch_export_patches.patches._patch_transformers_qwen2_5 import (
+            qwen_sdpa_attention_loopa24_versatile,
+        )
+
+        inputs = (
+            torch.rand((1, 16, 1292, 80), dtype=torch.float32),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float32),
+            torch.rand((1, 16, 1292, 80), dtype=torch.float32),
+            self._get_seqlen(),
+        )
+
+        results = qwen_sdpa_attention_loopa24_versatile.verify(*inputs, scaling=0.5)
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
+        self.assertLess(results.diffs[0]["abs"], 1e-5)
+
+        results = qwen_sdpa_attention_loopa24_versatile.verify(
+            *inputs, scaling=0.11180339887498948
+        )
+        self.assertEqual(len(results.eager_outputs), len(results.onnx_outputs))
+        self.assertEqual(len(results.eager_outputs), len(results.diffs))
+        self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
+        self.assertLess(results.diffs[0]["abs"], 1e-5)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
