
Commit 0d3bd28

Diagonal Mask (#295)
* diagonal mask for attention
* doc
* fix
* fix
1 parent 8215a72 commit 0d3bd28

File tree

4 files changed: +125 -28 lines changed


CHANGELOGS.rst

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Change Logs
 0.8.2
 +++++
 
-* :pr:`292`, :pr:`293`, :pr:`294`: new patches for Qwen models
+* :pr:`292`, :pr:`293`, :pr:`294`, :pr:`295`: new patches for Qwen models
 
 0.8.1
 +++++

_unittests/ut_torch_export_patches/test_patch_transformers.py

Lines changed: 69 additions & 3 deletions
@@ -4,7 +4,13 @@
 import transformers.integrations.sdpa_attention as sdpa_attention
 import onnx
 import onnx_diagnostic.torch_export_patches.patches.patch_transformers as patch_transformers
-from onnx_diagnostic.ext_test_case import ExtTestCase, requires_transformers, ignore_warnings
+from onnx_diagnostic.ext_test_case import (
+    ExtTestCase,
+    requires_transformers,
+    requires_torch,
+    ignore_warnings,
+    has_onnxscript,
+)
 from onnx_diagnostic.helpers.torch_helper import torch_deepcopy, fake_torchdynamo_exporting
 from onnx_diagnostic.export.shape_helper import make_fake_with_dynamic_dimensions
 from onnx_diagnostic.torch_models.hghub.hub_api import get_cached_configuration
@@ -398,9 +404,69 @@ def test_patched_qwen2_5_vl_vision_attention_forward(self):
             _is_torchdynamo_exporting()
         ), f"exporting is not set to true? {torch.compiler.is_exporting_flag}"
         got = patched_Qwen2_5_VLVisionAttention.forward(instance, **inputs)
-        self.assertEqualArray(expected, got)
+        self.assertEqualArray(expected, got, atol=1e-5)
+
+        class Model(patched_class):
+            def forward(
+                self,
+                hidden_states: torch.Tensor,
+                cu_seqlens: torch.Tensor,
+                rotary_pos_emb: torch.Tensor | None = None,
+                position_embeddings1: torch.Tensor | None = None,
+                position_embeddings2: torch.Tensor | None = None,
+                **kwargs,
+            ) -> torch.Tensor:
+                return patched_Qwen2_5_VLVisionAttention.forward(
+                    self,
+                    hidden_states,
+                    cu_seqlens,
+                    rotary_pos_emb=rotary_pos_emb,
+                    position_embeddings=(position_embeddings1, position_embeddings2),
+                    **kwargs,
+                )
+
+        instance = Model(config.vision_config)
+        instance.eval()
+
+        ds = dict(
+            hidden_states={0: "d1"},
+            cu_seqlens={0: "d3"},
+            position_embeddings1={0: "d1"},
+            position_embeddings2={0: "d1"},
+        )
+        inputs.update(
+            dict(
+                position_embeddings1=inputs["position_embeddings"][0],
+                position_embeddings2=inputs["position_embeddings"][1],
+            )
+        )
+        del inputs["position_embeddings"]
+        for exporter in ("custom", "onnx-dynamo"):
+            # onnx-dynamo needs OpOverload(op='aten.sym_storage_offset') (transformers>=5.0?)
+            if exporter == "onnx-dynamo" and not has_onnxscript("0.5.7"):
+                raise unittest.SkipTest("needs onnxscript>=0.5.7")
+            filename = self.get_dump_file(
+                f"test_patched_qwen2_5_vl_vision_attention_forward.{exporter}.onnx"
+            )
+            to_onnx(
+                instance,
+                kwargs=inputs,
+                dynamic_shapes=ds,
+                exporter=exporter,
+                filename=filename,
+            )
+            # exporter_kwargs={"report":True} if exporter != "custom" else {}
+            self.assert_onnx_disc(
+                f"test_patched_qwen2_5_vl_vision_attention_forward-{exporter}",
+                onnx.load(filename),
+                instance,
+                inputs,
+                atol=1e-3,
+                rtol=1,
+            )
 
-    @requires_transformers("5.0")
+    @requires_transformers("4.99")
+    @requires_torch("2.9.99")
     @unittest.skipIf(not patch_qwen2_5, "Qwen25 not part of this transformers")
     def test_qwen2_5_vl_vision_attention_iteration(self):
         from onnx_diagnostic.torch_export_patches.patches.patch_transformers import (

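The test above works around the fact that dynamic shapes are easiest to attach to flat tensor arguments: the `position_embeddings` tuple is split into `position_embeddings1` and `position_embeddings2`, and a thin wrapper rebuilds the tuple before calling the patched forward. Below is a minimal, self-contained sketch of that wrapping pattern with `torch.export`; the `Inner` and `FlatWrapper` modules, the shapes, and the dimension name `d1` are made up for illustration and are not part of the commit.

import torch


class Inner(torch.nn.Module):
    # Stand-in for a forward that expects a tuple argument.
    def forward(self, hidden_states, position_embeddings):
        cos, sin = position_embeddings
        return hidden_states * cos + sin


class FlatWrapper(Inner):
    # Rebuild the tuple from two flat tensors so each one can carry a dynamic dim.
    def forward(self, hidden_states, position_embeddings1, position_embeddings2):
        return Inner.forward(
            self, hidden_states, (position_embeddings1, position_embeddings2)
        )


x = torch.randn(6, 4)
cos, sin = torch.randn(6, 4), torch.randn(6, 4)
d1 = torch.export.Dim("d1")
ds = {
    "hidden_states": {0: d1},
    "position_embeddings1": {0: d1},
    "position_embeddings2": {0: d1},
}
ep = torch.export.export(FlatWrapper().eval(), (x, cos, sin), dynamic_shapes=ds)
print(ep)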
onnx_diagnostic/export/api.py

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ def to_onnx(
         )
         ort_fusions.optimize_for_ort(epo.model)
         if filename:
-            epo.save(filename)
+            epo.save(filename, external_data=True)
         return epo
 
     if exporter == "modelbuilder":

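The `external_data=True` change presumably makes the saved ONNX file store its initializers in a side file, which is how models larger than the 2 GB protobuf limit are written. For reference, a minimal sketch of the same effect with the plain `onnx` API; the file names are placeholders, not from the commit.

import onnx

# Rewrite an existing model so its weights live in a separate data file.
model = onnx.load("model.onnx")  # placeholder path
onnx.save_model(
    model,
    "model.external.onnx",
    save_as_external_data=True,    # move initializers out of the protobuf
    all_tensors_to_one_file=True,  # one side file instead of one file per tensor
    location="model.external.onnx.data",
    size_threshold=1024,           # only tensors of at least 1 KiB are externalized
)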
onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 54 additions & 23 deletions
@@ -2012,6 +2012,8 @@ def forward(
 if patch_qwen2_5:
     import torch.nn.functional as F
 
+    use_loop_for_attention_in_qwen_2_5 = False
+
     class patched_Qwen2_5_VLForConditionalGeneration:
         _PATCHES_ = ["prepare_inputs_for_generation"]
         _PATCHED_CLASS_ = (
@@ -2392,36 +2394,65 @@ def forward(
             ):
                 attention_interface = patched_sdpa_attention_forward
 
-                def _iteration(start_end, query_states, key_states, value_states):
-                    return patched_Qwen2_5_VLVisionAttentionOneIteration.forward(
+                if use_loop_for_attention_in_qwen_2_5:
+
+                    def _iteration(start_end, query_states, key_states, value_states):
+                        return patched_Qwen2_5_VLVisionAttentionOneIteration.forward(
+                            self,
+                            start_end,
+                            query_states,
+                            key_states,
+                            value_states,
+                            scaling=self.scaling,
+                            dropout=0.0 if not self.training else self.attention_dropout,
+                        )
+
+                    starts = cu_seqlens[:-1]
+                    ends = cu_seqlens[1:]
+                    # cu_seqlens = [0, 10, 14, 27]
+                    # starts: [0, 10, 14]
+                    # ends: [10, 14, 27]
+                    # starts_ends: [[0, 10], [10, 14], [14, 27]]
+                    starts_ends = torch.cat([starts.unsqueeze(1), ends.unsqueeze(1)], dim=1)
+                    attn_outputs = [
+                        _iteration(start_end, query_states, key_states, value_states)
+                        for start_end in starts_ends
+                    ]
+                    # attn_outputs = torch._higher_order_ops.while_loop(
+                    # attn_outputs = torch.ops.higher_order.while_loop(
+                    #     (lambda it, starts_ends, *_args: it < starts_ends.shape[0]),
+                    #     _iteration,
+                    #     (torch.tensor(0),
+                    #      starts_ends, query_states, key_states, value_states), tuple(),
+                    # )
+                    attn_output = torch.cat(attn_outputs, dim=1)
+                else:
+                    # make square mask
+                    indices = torch.arange(
+                        cu_seqlens.max(), dtype=cu_seqlens.dtype, device=cu_seqlens.device
+                    )
+                    dot = (cu_seqlens.unsqueeze(1) <= indices.unsqueeze(0)).to(
+                        cu_seqlens.dtype
+                    )
+                    dot = dot.sum(dim=0)
+                    mask = dot.unsqueeze(1) - dot.unsqueeze(0)
+                    bool_mask = mask == 0
+                    bool_mask = bool_mask.unsqueeze(0).unsqueeze(0)
+
+                    torch._check(bool_mask.shape[2] == key_states.shape[2])
+                    torch._check(bool_mask.shape[3] == key_states.shape[2])
+
+                    attn_output, _ = attention_interface(
                         self,
-                        start_end,
                         query_states,
                         key_states,
                         value_states,
+                        attention_mask=bool_mask,
                         scaling=self.scaling,
                         dropout=0.0 if not self.training else self.attention_dropout,
+                        is_causal=False,
+                        **kwargs,
                     )
-
-                starts = cu_seqlens[:-1]
-                ends = cu_seqlens[1:]
-                # cu_seqlens = [0, 10, 14, 27]
-                # starts: [0, 10, 14]
-                # ends: [10, 14, 17]
-                # starts_ends: [[0, 10], [10, 14], [14, 27]]
-                starts_ends = torch.cat([starts.unsqueeze(1), ends.unsqueeze(1)], dim=1)
-                attn_outputs = [
-                    _iteration(start_end, query_states, key_states, value_states)
-                    for start_end in starts_ends
-                ]
-                # attn_outputs = torch._higher_order_ops.while_loop(
-                # attn_outputs = torch.ops.higher_order.while_loop(
-                #     (lambda it, starts_ends, *_args: it < starts_ends.shape[0]),
-                #     _iteration,
-                #     (torch.tensor(0),
-                #      starts_ends, query_states, key_states, value_states), tuple(),
-                # )
-                attn_output = torch.cat(attn_outputs, dim=1)
             else:
                 # Other implementations: Process each chunk separately
                 lengths = cu_seqlens[1:] - cu_seqlens[:-1]

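The new branch replaces the per-segment Python loop with a single attention call over a block-diagonal ("diagonal") boolean mask derived from `cu_seqlens`: two positions may attend to each other exactly when they fall between the same pair of cumulative boundaries. Below is a minimal, self-contained sketch of that equivalence; the `(batch, heads, seq, head_dim)` layout, the sizes, and the variable names are illustrative only, not the model's actual shapes.

import torch
import torch.nn.functional as F

cu_seqlens = torch.tensor([0, 10, 14, 27])  # cumulative segment boundaries
q = torch.randn(1, 4, 27, 8)                # (batch, heads, seq, head_dim)
k = torch.randn(1, 4, 27, 8)
v = torch.randn(1, 4, 27, 8)

# Block-diagonal mask, built the same way as in the patched forward:
# dot[j] counts how many boundaries are <= j, i.e. the segment id of position j,
# and positions i and j may attend to each other iff their segment ids are equal.
indices = torch.arange(cu_seqlens.max(), dtype=cu_seqlens.dtype)
dot = (cu_seqlens.unsqueeze(1) <= indices.unsqueeze(0)).to(cu_seqlens.dtype).sum(dim=0)
bool_mask = ((dot.unsqueeze(1) - dot.unsqueeze(0)) == 0).unsqueeze(0).unsqueeze(0)

masked = F.scaled_dot_product_attention(q, k, v, attn_mask=bool_mask, is_causal=False)

# Reference: the loop branch, one attention call per [start, end) segment.
chunks = [
    F.scaled_dot_product_attention(q[:, :, s:e], k[:, :, s:e], v[:, :, s:e])
    for s, e in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist())
]
looped = torch.cat(chunks, dim=2)

assert torch.allclose(masked, looped, atol=1e-5)

Because a query in one segment only sees keys of its own segment under the mask, the softmax runs over exactly the same keys as in the per-chunk loop, which is why the two branches agree up to floating-point error while avoiding a data-dependent loop at export time.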