Update Qwen2.5-VL vision attention rewriting: replace the custom op with torch.onnx.ops.symbolic (#293)

xadupre · web-flow · commit c371e381efd7 · 2025-11-09T19:00:47.000+01:00
* update rewriting

* fix

* changes
diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.8.2
 +++++
 
+* :pr:`293`: second series of patches for Qwen models
 * :pr:`292`: new patches for Qwen models
 
 0.8.1
diff --git a/_unittests/ut_tasks/try_export.py b/_unittests/ut_tasks/try_export.py
@@ -64,8 +64,8 @@ def _config_reduction(config, task):
         print(f"-- processor={type(processor)}")
 
         inputs = dict(
-            hidden_states=torch.rand((14308, 1176), dtype=torch.float32),
-            grid_thw=torch.tensor([[1, 98, 146]], dtype=torch.int64),
+            hidden_states=torch.rand((1292, 1176), dtype=torch.float32),
+            grid_thw=torch.tensor([[1, 34, 38]], dtype=torch.int64),
         )
 
         print(f"-- inputs: {self.string_type(inputs, with_shape=True)}")
@@ -89,7 +89,7 @@ def _config_reduction(config, task):
         export_inputs = inputs
         print()
         with torch_export_patches(
-            patch_torch=True,
+            patch_torch=False,
             patch_sympy=False,
             patch_transformers=True,
             verbose=1,
diff --git a/_unittests/ut_tasks/try_tasks.py b/_unittests/ut_tasks/try_tasks.py
@@ -1044,7 +1044,8 @@ def config_reduction(config, task):
                 "content": [
                     {
                         "type": "image",
-                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                        # "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                        "image": "https://github.com/sdpython/teachpyx/blob/main/_doc/practice/tds-base/int.png?raw=true",
                     },
                     {"type": "text", "text": "Describe this image."},
                 ],
diff --git a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
@@ -2255,52 +2255,6 @@ def forward(
             hidden_states = hidden_states[reverse_indices, :]
             return hidden_states
 
-    @torch.library.custom_op("custom::qwen25_attention", mutates_args={})
-    def qwen25_attention(
-        query_states: torch.Tensor,
-        key_states: torch.Tensor,
-        value_states: torch.Tensor,
-        cu_seqlens: torch.Tensor,
-        _cu_seqlens: torch.Tensor,
-        max_seqlen: torch.Tensor,
-        _max_seqlen: torch.Tensor,
-        scale: torch.Tensor,
-    ) -> torch.Tensor:
-        return torch.empty(
-            key_states.shape[0],
-            value_states.shape[1],
-            max_seqlen,
-            value_states.shape[-1],
-            dtype=query_states.dtype,
-            device=query_states.device,
-        )
-
-    def make_undefined_dimension(i: int) -> torch.SymInt:
-        t = torch.ones((i * 2,))
-        t[:i] = 0
-        res = torch.nonzero(t).shape[0]
-        return res
-
-    @qwen25_attention.register_fake
-    def qwen25_attention_shape(
-        query_states,
-        key_states,
-        value_states,
-        cu_seqlens,
-        _cu_seqlens,
-        max_seqlen,
-        _max_seqlen,
-        scale,
-    ):
-        return torch.empty(
-            key_states.shape[0],
-            value_states.shape[1],
-            max_seqlen,  # make_undefined_dimension(max_seqlen), new dimension does not work
-            value_states.shape[-1],
-            dtype=query_states.dtype,
-            device=query_states.device,
-        )
-
     class patched_Qwen2_5_VLVisionAttention:
         _PATCHES_ = ["forward"]
         _PATCHED_CLASS_ = (
@@ -2350,15 +2304,26 @@ def forward(
                 or torch.compiler.is_exporting()
             ):
                 max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
-                attn_output = torch.ops.custom.qwen25_attention(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens,
-                    cu_seqlens,
-                    max_seqlen,
-                    max_seqlen,
-                    torch.tensor(self.scaling, dtype=torch.float32),
+                attn_output = torch.onnx.ops.symbolic(
+                    "custom::qwen25_attention",
+                    (
+                        query_states,
+                        key_states,
+                        value_states,
+                        cu_seqlens,
+                        cu_seqlens,
+                        max_seqlen,
+                        max_seqlen,
+                        torch.tensor(self.scaling, dtype=torch.float32),
+                    ),
+                    dtype=query_states.dtype,
+                    shape=(
+                        key_states.shape[0],
+                        value_states.shape[1],
+                        max_seqlen,
+                        value_states.shape[-1],
+                    ),
+                    version=1,
                 )
             elif self.config._attn_implementation == "flash_attention_2":
                 # Flash Attention 2: Use cu_seqlens for variable length attention