Commit cf7dc28

fix patch for sdpa mask
1 parent d6cbc6f commit cf7dc28

4 files changed, +162 -14 lines changed

_unittests/ut_helpers/test_cache_helper.py

Lines changed: 26 additions & 8 deletions
@@ -258,9 +258,7 @@ def test_unflatten_flatten_hybrid_cache(self):
             self.string_type(unflat, with_shape=True),
         )
 
-    def test_cache_update_padding_mask_function(self):
-        from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
-
+    def test_cache_update_padding_mask_function_vmap(self):
         def causal_mask_function(
             batch_idx: int, head_idx: int, q_idx: int, kv_idx: int
         ) -> bool:
@@ -303,11 +301,9 @@ def forward(self, x, mask):
                 head_arange = torch.arange(x.shape[3])
                 kv_arange = torch.arange(x.shape[1])
                 cache_position = torch.arange(x.shape[2])
-                with TransformGetItemToIndex():
-                    causal_mask = patched__vmap_for_bhqkv(mask_function)(
-                        batch_arange, head_arange, cache_position, kv_arange
-                    )
-                    return x + causal_mask.to(x.dtype)
+                f = patched__vmap_for_bhqkv(mask_function)
+                causal_mask = f(batch_arange, head_arange, cache_position, kv_arange)
+                return x + causal_mask.to(x.dtype)
 
         inputs = {
             "x": torch.rand((4, 4, 4, 4), dtype=torch.float32),
@@ -325,6 +321,28 @@ def forward(self, x, mask):
         )
         self.assertNotEmpty(ep)
 
+    def test_simple_indices(self):
+        class Model(torch.nn.Module):
+            def forward(self, x, i, j):
+                return x[i, j]
+
+        inputs = (
+            torch.rand((4, 4), dtype=torch.float32),
+            torch.randint(0, 4, (4, 4, 4, 4), dtype=torch.int64),
+            torch.randint(0, 4, (4, 4, 4, 4), dtype=torch.int64),
+        )
+        model = Model()
+        expected = model(*inputs)
+        self.assertEqual(expected.shape, (4, 4, 4, 4))
+        DYN = torch.export.Dim.DYNAMIC
+        sh = {0: DYN, 1: DYN, 2: DYN, 3: DYN}
+        ep = torch.export.export(
+            model,
+            inputs,
+            dynamic_shapes=({0: DYN, 1: DYN}, sh, sh),
+        )
+        self.assertNotEmpty(ep)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
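
The new test_simple_indices exercises tensor-valued (advanced) indexing under torch.export with fully dynamic shapes. As a reminder of the semantics it relies on, here is a standalone sketch (not part of the commit): for x of shape (4, 4) and integer index tensors i, j of shape (4, 4, 4, 4), x[i, j] gathers element-wise, so the result takes the shape of the index tensors rather than the shape of x.

import torch

# Standalone sketch of the indexing rule checked by test_simple_indices:
# out[idx] = x[i[idx], j[idx]] for every multi-index idx, hence the output
# has the (broadcast) shape of the index tensors, not the shape of x.
x = torch.rand((4, 4))
i = torch.randint(0, 4, (4, 4, 4, 4))
j = torch.randint(0, 4, (4, 4, 4, 4))

out = x[i, j]
assert out.shape == (4, 4, 4, 4)
# Equivalent flat gather, element by element:
assert torch.equal(out, x[i.reshape(-1), j.reshape(-1)].reshape(i.shape))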

_unittests/ut_torch_models/test_tiny_llms.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def test_tiny_llm_export_dynamic(self):
         self.assertEqual(
             {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)
         )
-        with torch_export_patches(patch_transformers=True):
+        with torch_export_patches(patch_transformers=True, verbose=1):
             ep = torch.export.export(
                 model,
                 (),
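
The only change here is verbose=1, which makes torch_export_patches print the "[torch_export_patches] patches ... / restored ..." messages added in onnx_export_errors.py below. A minimal usage sketch follows; the import path is assumed to be the package-level re-export, and the model is a placeholder rather than the tiny-LLM fixture used by this test.

import torch
from onnx_diagnostic.torch_export_patches import torch_export_patches  # assumed re-export

class Placeholder(torch.nn.Module):  # stand-in model, not the test fixture
    def forward(self, x):
        return x.sin() + 1

# verbose=1 logs which transformers.masking_utils functions get patched on
# entry and restored on exit; the export call itself is unchanged.
with torch_export_patches(patch_transformers=True, verbose=1):
    ep = torch.export.export(Placeholder(), (torch.rand(2, 4),))
print(type(ep))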

onnx_diagnostic/torch_export_patches/onnx_export_errors.py

Lines changed: 98 additions & 4 deletions
@@ -439,6 +439,28 @@ def torch_export_patches(
         f_transformers__vmap_for_bhqkv = masking_utils._vmap_for_bhqkv
         masking_utils._vmap_for_bhqkv = patch_transformers_list.patched__vmap_for_bhqkv
 
+        if verbose:
+            print(
+                "[torch_export_patches] patches "
+                "transformers.masking_utils.sdpa_mask_recent_torch"
+            )
+        f_transformers_sdpa_mask_recent_torch = masking_utils.sdpa_mask_recent_torch
+        masking_utils.sdpa_mask_recent_torch = (
+            patch_transformers_list.patched_sdpa_mask_recent_torch
+        )
+        if masking_utils.sdpa_mask == f_transformers_sdpa_mask_recent_torch:
+            if verbose:
+                print(
+                    "[torch_export_patches] patches "
+                    "transformers.masking_utils.sdpa_mask"
+                )
+            f_transformers_sdpa_mask = masking_utils.sdpa_mask
+            masking_utils.sdpa_mask = (
+                patch_transformers_list.patched_sdpa_mask_recent_torch
+            )
+        else:
+            f_transformers_sdpa_mask = None
+
         if (
             masking_utils
             and patch_transformers_list.patch_masking_utils
@@ -456,10 +478,37 @@ def torch_export_patches(
             and masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["eager"]
             == f_transformers_eager_mask
         ):
+            if verbose:
+                print(
+                    "[torch_export_patches] patches "
+                    "transformers.masking_utils.eager_mask "
+                    "in ALL_MASK_ATTENTION_FUNCTIONS"
+                )
             masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["eager"] = (
                 patch_transformers_list.patched_eager_mask
             )
 
+        if (
+            masking_utils
+            and patch_transformers_list.patch_masking_utils
+            and hasattr(masking_utils, "sdpa_mask")
+            and f_transformers_sdpa_mask is not None
+        ):
+            if verbose:
+                print(
+                    "[torch_export_patches] patches "
+                    "transformers.masking_utils.sdpa_mask "
+                    "in ALL_MASK_ATTENTION_FUNCTIONS"
+                )
+            if (
+                "sdpa" in masking_utils.ALL_MASK_ATTENTION_FUNCTIONS
+                and masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["sdpa"]
+                == f_transformers_sdpa_mask
+            ):
+                masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["sdpa"] = (
+                    patch_transformers_list.patched_sdpa_mask_recent_torch
+                )
+
         if custom_patches:
             if verbose:
                 print("[torch_export_patches] applies custom patches")
@@ -568,19 +617,43 @@ def torch_export_patches(
             and hasattr(masking_utils, "_vmap_for_bhqkv")
         ):
             masking_utils._vmap_for_bhqkv = f_transformers__vmap_for_bhqkv
+
             if verbose:
                 print(
                     "[torch_export_patches] restored "
                     "transformers.masking_utils._vmap_for_bhqkv"
                 )
 
+            masking_utils.sdpa_mask_recent_torch = (
+                f_transformers_sdpa_mask_recent_torch
+            )
+
+            if verbose:
+                print(
+                    "[torch_export_patches] restored "
+                    "transformers.masking_utils.sdpa_mask_recent_torch"
+                )
+
+            if f_transformers_sdpa_mask is not None:
+                masking_utils.sdpa_mask = f_transformers_sdpa_mask
+                if verbose:
+                    print(
+                        "[torch_export_patches] restored "
+                        "transformers.masking_utils.sdpa_mask"
+                    )
+
         if (
             masking_utils
             and patch_transformers_list.patch_masking_utils
             and hasattr(masking_utils, "eager_mask")
         ):
             f_transformers_eager_mask = masking_utils.eager_mask
             masking_utils.eager_mask = f_transformers_eager_mask
+            if verbose:
+                print(
+                    "[torch_export_patches] restored "
+                    "transformers.masking_utils.eager_mask"
+                )
             if (
                 "eager" in masking_utils.ALL_MASK_ATTENTION_FUNCTIONS
                 and masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["eager"]
@@ -589,11 +662,32 @@ def torch_export_patches(
                 masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["eager"] = (
                     f_transformers_eager_mask
                 )
-            if verbose:
-                print(
-                    "[torch_export_patches] restored "
-                    "transformers.masking_utils.eager_mask"
+                if verbose:
+                    print(
+                        "[torch_export_patches] restored "
+                        "transformers.masking_utils.eager_mask "
+                        "in ALL_MASK_ATTENTION_FUNCTIONS"
+                    )
+
+        if (
+            masking_utils
+            and patch_transformers_list.patch_masking_utils
+            and hasattr(masking_utils, "sdpa_mask")
+        ):
+            if (
+                "sdpa" in masking_utils.ALL_MASK_ATTENTION_FUNCTIONS
+                and masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["sdpa"]
+                == patch_transformers_list.patched_sdpa_mask_recent_torch
+            ):
+                masking_utils.ALL_MASK_ATTENTION_FUNCTIONS["sdpa"] = (
+                    f_transformers_sdpa_mask
                 )
+                if verbose:
+                    print(
+                        "[torch_export_patches] restored "
+                        "transformers.masking_utils.sdpa_mask "
+                        "in ALL_MASK_ATTENTION_FUNCTIONS"
+                    )
 
         ########
         # caches
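
All of the blocks above follow the same save/patch/restore discipline: keep a reference to the original attribute, install the patched callable, and put the original back when the context manager unwinds. sdpa_mask is only swapped when it is the very object sdpa_mask_recent_torch (the equality check in the first hunk); otherwise f_transformers_sdpa_mask stays None and the restore path skips it. A stripped-down sketch of that pattern, with generic names rather than the actual onnx_diagnostic code:

import contextlib
import types

@contextlib.contextmanager
def swap_attribute(module: types.ModuleType, name: str, replacement, verbose: int = 0):
    """Install ``replacement`` as ``module.<name>`` and restore the original on exit."""
    original = getattr(module, name)
    if verbose:
        print(f"[swap_attribute] patches {module.__name__}.{name}")
    setattr(module, name, replacement)
    try:
        yield original
    finally:
        setattr(module, name, original)
        if verbose:
            print(f"[swap_attribute] restored {module.__name__}.{name}")

# Usage sketch: patch math.cos only for the duration of the block.
import math
with swap_attribute(math, "cos", lambda x: 0.0, verbose=1):
    assert math.cos(0.0) == 0.0
assert math.cos(0.0) == 1.0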

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 37 additions & 1 deletion
@@ -37,7 +37,14 @@
 
 if patch_masking_utils:
     # Introduced in 4.52
-    from transformers.masking_utils import causal_mask_function, sdpa_mask
+    from transformers.masking_utils import (
+        causal_mask_function,
+        sdpa_mask,
+        padding_mask_function,
+        and_masks,
+        _ignore_causal_mask_sdpa,
+        prepare_padding_mask,
+    )
 
     def patched__vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
         """manual patch for function ``transformers.masking_utils._vmap_for_bhqkv``."""
@@ -125,6 +132,35 @@ def patched_eager_mask(
         mask = (~mask).to(dtype) * min_dtype
         return mask
 
+    def patched_sdpa_mask_recent_torch(
+        batch_size: int,
+        cache_position: torch.Tensor,
+        kv_length: int,
+        kv_offset: int = 0,
+        mask_function: Callable = causal_mask_function,
+        attention_mask: Optional[torch.Tensor] = None,
+        local_size: Optional[int] = None,
+        allow_is_causal_skip: bool = True,
+        **kwargs,
+    ) -> Optional[torch.Tensor]:
+        """manual patch for function ``transformers.masking_utils.sdpa_mask_recent_torch``."""
+        q_length = cache_position.shape[0]
+        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False)
+        if allow_is_causal_skip and _ignore_causal_mask_sdpa(
+            padding_mask, q_length, kv_length, kv_offset, local_size
+        ):
+            return None
+        kv_arange = torch.arange(kv_length, device=cache_position.device)
+        kv_arange += kv_offset
+        if padding_mask is not None:
+            mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+        batch_arange = torch.arange(batch_size, device=cache_position.device)
+        head_arange = torch.arange(1, device=cache_position.device)
+        causal_mask = patched__vmap_for_bhqkv(mask_function)(
+            batch_arange, head_arange, cache_position, kv_arange
+        )
+        return causal_mask
+
 
 if patch_parse_processor_args:
 
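patched_sdpa_mask_recent_torch keeps the contract of the original sdpa_mask_recent_torch: it either returns None (when the causal mask can be skipped) or a boolean mask indexed by (batch, head, query position, key position), built from the batch/head/cache_position/kv aranges above. A standalone sketch of that output for a tiny case, assuming the default causal_mask_function reduces to kv_idx <= q_idx (the usual causal rule, not shown in this diff):

import torch

# Hypothetical sizes, chosen only for illustration.
batch_size, kv_length, kv_offset = 2, 6, 2
cache_position = torch.arange(2, 4)              # q_length = 2, query positions 2 and 3
kv_arange = torch.arange(kv_length) + kv_offset  # key positions 2..7

# Broadcast the query/key index grids to (batch, head=1, q_length, kv_length),
# the layout produced by mapping mask_function over (b, h, q_idx, kv_idx).
causal_mask = kv_arange[None, None, None, :] <= cache_position[None, None, :, None]
causal_mask = causal_mask.expand(batch_size, 1, -1, -1)
print(causal_mask.shape)        # torch.Size([2, 1, 2, 6])
print(causal_mask[0, 0].int())  # causal pattern shifted by kv_offset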
