Commit c2e3cc2
Fix chunked attention mask with left-padding (#40324)
* add fix
* add test
* raise proper warning for older versions
* fix
* fix and add 2nd test
* fix for flex and torch 2.5
1 parent 242bb2c commit c2e3cc2
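In short, the fix makes the chunk index of each token count from the first real (non-padded) token of its row instead of from absolute position 0. Below is a minimal, self-contained sketch of that arithmetic in plain PyTorch (not the library code itself); `pad_tokens` and the printed values are illustrative.

```python
# Minimal sketch of the idea behind this fix: with left padding, chunk boundaries
# must be counted from the first real token, not from absolute position 0.
import torch

chunk_size = 3
pad_tokens = 4                     # left-padding tokens in this (hypothetical) row
positions = torch.arange(8)

legacy_chunks = positions // chunk_size                   # pre-fix behavior: ignores padding
shifted_chunks = (positions - pad_tokens) // chunk_size   # post-fix behavior

print(legacy_chunks)   # tensor([0, 0, 0, 1, 1, 1, 2, 2])        -> first "chunk" is mostly padding
print(shifted_chunks)  # tensor([-2, -1, -1, -1,  0,  0,  0,  1]) -> real tokens 4-6 share chunk 0
# Negative chunk ids only ever land on padded positions, which the padding mask removes anyway.
```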

File tree

2 files changed: +154 -10 lines changed

* src/transformers/masking_utils.py
* tests/utils/test_masking_utils.py

src/transformers/masking_utils.py

Lines changed: 43 additions & 8 deletions
@@ -20,7 +20,7 @@
 
 from .cache_utils import Cache
 from .configuration_utils import PretrainedConfig
-from .utils import is_torch_xpu_available
+from .utils import is_torch_xpu_available, logging
 from .utils.generic import GeneralInterface
 from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_torchdynamo_compiling

@@ -40,6 +40,9 @@
 from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
 
 
+logger = logging.get_logger(__name__)
+
+
 def and_masks(*mask_functions: list[Callable]) -> Callable:
     """Returns a mask function that is the intersection of provided mask functions"""
     if not all(callable(arg) for arg in mask_functions):
@@ -87,12 +90,24 @@ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
     return inner_mask
 
 
-def chunked_overlay(chunk_size: int) -> Callable:
+def chunked_overlay(chunk_size: int, left_padding: torch.Tensor) -> Callable:
     """
     This is an overlay depicting a chunked attention pattern. Add it on top of a causal mask for a proper chunked
     attention mask.
     """
 
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        return (kv_idx - left_padding[batch_idx]) // chunk_size == (q_idx - left_padding[batch_idx]) // chunk_size
+
+    return inner_mask
+
+
+def _legacy_chunked_overlay(chunk_size: int) -> Callable:
+    """
+    Same as the above function, but does not correctly account for left-padding tokens.
+    Only kept for compatibility with older torch versions (< 2.6).
+    """
+
     def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
         return kv_idx // chunk_size == q_idx // chunk_size
 
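To see the difference between the two overlays on concrete indices, here is a self-contained sketch that re-declares both closures as they appear in the hunk above, so it runs without importing transformers; the index values are illustrative.

```python
# Re-declaration of the two overlays from the hunk above, for a standalone comparison.
import torch

def chunked_overlay(chunk_size, left_padding):
    def inner_mask(batch_idx, head_idx, q_idx, kv_idx):
        return (kv_idx - left_padding[batch_idx]) // chunk_size == (q_idx - left_padding[batch_idx]) // chunk_size
    return inner_mask

def _legacy_chunked_overlay(chunk_size):
    def inner_mask(batch_idx, head_idx, q_idx, kv_idx):
        return kv_idx // chunk_size == q_idx // chunk_size
    return inner_mask

left_padding = torch.tensor([4])             # 4 left-padding tokens in batch row 0
new_mask = chunked_overlay(3, left_padding)
old_mask = _legacy_chunked_overlay(3)

# q_idx=4 is the first real token, kv_idx=6 the third one: they belong to the same chunk of 3.
print(bool(new_mask(0, 0, 4, 6)))  # True  -> chunk counted from index 4
print(bool(old_mask(0, 0, 4, 6)))  # False -> 4 // 3 == 1 but 6 // 3 == 2
```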

@@ -106,11 +121,13 @@ def sliding_window_causal_mask_function(sliding_window: int) -> Callable:
     return and_masks(sliding_window_overlay(sliding_window), causal_mask_function)
 
 
-def chunked_causal_mask_function(chunk_size: int) -> Callable:
+def chunked_causal_mask_function(chunk_size: int, left_padding: torch.Tensor) -> Callable:
     """
     This return the mask_function function to create a chunked attention mask.
     """
-    return and_masks(chunked_overlay(chunk_size), causal_mask_function)
+    if not _is_torch_greater_or_equal_than_2_6:
+        return and_masks(_legacy_chunked_overlay(chunk_size), causal_mask_function)
+    return and_masks(chunked_overlay(chunk_size, left_padding), causal_mask_function)
 
 
 def padding_mask_function(padding_mask: torch.Tensor) -> Callable:
@@ -298,7 +315,7 @@ def sdpa_mask_recent_torch(
     You can do
 
     ```python
-    >>> create_4d_causal_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5)
+    >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5)
     >>> tensor([[[[ True, False, False, False, False],
                   [ True,  True, False, False, False],
                   [ True,  True,  True, False, False],
@@ -319,7 +336,7 @@ def sdpa_mask_recent_torch(
     You can do
 
     ```python
-    >>> create_4d_causal_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=sliding_window_causal_mask_function(3))
+    >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=sliding_window_causal_mask_function(3))
     >>> tensor([[[[ True, False, False, False, False],
                   [ True,  True, False, False, False],
                   [ True,  True,  True, False, False],
@@ -340,7 +357,7 @@ def sdpa_mask_recent_torch(
     You can do
 
    ```python
-    >>> create_4d_causal_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=chunked_causal_mask_function(3))
+    >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=chunked_causal_mask_function(3, torch.zeros(1, dtype=int)))
     >>> tensor([[[[ True, False, False, False, False],
                   [ True,  True, False, False, False],
                   [ True,  True,  True, False, False],
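The `torch.zeros(1, dtype=int)` argument in the updated docstring is simply the "no left padding" case for a batch of one. A small illustration of how a non-zero value would shift the documented pattern, computed locally rather than via `sdpa_mask` (the value 2 is an arbitrary example):

```python
# The second argument encodes per-row left padding; zeros reproduce the documented pattern.
import torch

cache_position = torch.arange(5)
print(cache_position // 3)        # tensor([0, 0, 0, 1, 1])   -> chunks [0-2] and [3-4], as in the docstring
print((cache_position - 2) // 3)  # tensor([-1, -1, 0, 0, 0]) -> with 2 left-pad tokens, the three
                                  #    real tokens 2-4 form a single chunk instead of being split
```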
@@ -973,7 +990,25 @@ def create_chunked_causal_mask(
     )
 
     batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
-    mask_factory_function = chunked_causal_mask_function(chunk_size)
+    # For chunked attention and batched inputs, we need to take the number of left padding tokens into account
+    # to start the chunk from the actual start of the sequence for the padded sequence
+    if attention_mask is not None:
+        # Only count the left padding tokens, not all of them
+        left_padding_tokens = (attention_mask.cumsum(dim=-1) == torch.zeros_like(attention_mask)).sum(dim=-1)
+    else:
+        left_padding_tokens = torch.zeros(batch_size, device=cache_position.device, dtype=int)
+    # Raise a warning for older versions if the problematic left-padding situation arises
+    if (
+        not _is_torch_greater_or_equal_than_2_6
+        and kv_length + kv_offset > chunk_size
+        and (left_padding_tokens > 0).any()
+    ):
+        logger.warning_once(
+            "Due to limitations of your current torch version, we cannot correctly account for the left-padding "
+            "when computing the chunked attention pattern. This will lead to a wrong attention mask for the padded "
+            "sequences. Behavior will be undefined. Please upgrade to `torch>=2.6` to solve this issue."
+        )
+    mask_factory_function = chunked_causal_mask_function(chunk_size, left_padding_tokens)
     mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
 
     # Do not allow skip if we are compiling (this is to match BC)
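The cumulative-sum trick above counts only the leading run of padding tokens: the cumsum stays at zero exactly while we are still inside the left padding, so zeros that appear later (e.g. right padding) do not contribute. A quick, self-contained check with made-up masks:

```python
# Standalone check of the left-padding count used in the hunk above.
import torch

attention_mask = torch.tensor([
    [0, 0, 1, 1, 0],   # 2 left-padding tokens; the trailing 0 is NOT left padding
    [1, 1, 1, 1, 1],   # no padding at all
])
left_padding_tokens = (attention_mask.cumsum(dim=-1) == torch.zeros_like(attention_mask)).sum(dim=-1)
print(left_padding_tokens)  # tensor([2, 0])
```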

tests/utils/test_masking_utils.py

Lines changed: 111 additions & 2 deletions
@@ -21,8 +21,9 @@
 import torch
 from torch.nn.attention.flex_attention import create_block_mask
 
-from transformers import LlamaConfig
-from transformers.masking_utils import create_causal_mask, find_packed_sequence_indices
+from transformers import DynamicCache, LlamaConfig
+from transformers.cache_utils import DynamicSlidingWindowLayer
+from transformers.masking_utils import create_causal_mask, create_chunked_causal_mask, find_packed_sequence_indices
 
 
 # fmt: off
@@ -135,3 +136,111 @@ def test_find_packed_sequence_indices(self):
         position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 0, 1, 2, 3]])
         EXPECTED_SEQUENCE_INDICES = torch.tensor([[0, 0, 0, 0, 1, 1, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])
         self.assertTrue((find_packed_sequence_indices(position_ids) == EXPECTED_SEQUENCE_INDICES).all())
+
+    def test_chunked_mask_with_left_padding_and_large_prefill(self):
+        # Make sure we have an attention_chunk_size in the config
+        config = LlamaConfig(attention_chunk_size=3, attn_implementation="sdpa")
+
+        batch_size = 2
+        sequence_length = 8
+        pad_tokens = 4
+
+        input_ids = torch.randint(100, 200, (batch_size, sequence_length))
+        attention_mask = torch.tensor(
+            [[0 if i < pad_tokens else 1 for i in range(sequence_length)], [1] * sequence_length]
+        )
+        inputs_embeds = torch.empty_like(input_ids, dtype=torch.float16)
+        cache_position = torch.arange(sequence_length)
+        position_ids = torch.empty(batch_size, sequence_length, dtype=cache_position.dtype)
+        position_ids[0, :pad_tokens] = 1
+        position_ids[0, pad_tokens:] = torch.arange(sequence_length - pad_tokens)
+        position_ids[1, :] = cache_position
+
+        chunked_attention_mask = create_chunked_causal_mask(
+            config=config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=None,
+            position_ids=position_ids,
+        )
+
+        # fmt: off
+        EXPECTED_CHUNKED_MASK = torch.tensor(
+            # Here, for the padded sequence, the chunks should correctly start at index 4 (otherwise, with 4 padding
+            # tokens and chunk_size=3, the chunks would wrongly span indices 0-2, 3-5, and so on)
+            [[[[False, False, False, False, False, False, False, False],
+               [False, False, False, False, False, False, False, False],
+               [False, False, False, False, False, False, False, False],
+               [False, False, False, False, False, False, False, False],
+               [False, False, False, False,  True, False, False, False],
+               [False, False, False, False,  True,  True, False, False],
+               [False, False, False, False,  True,  True,  True, False],
+               [False, False, False, False, False, False, False,  True]]],
+
+
+            [[[ True, False, False, False, False, False, False, False],
+              [ True,  True, False, False, False, False, False, False],
+              [ True,  True,  True, False, False, False, False, False],
+              [False, False, False,  True, False, False, False, False],
+              [False, False, False,  True,  True, False, False, False],
+              [False, False, False,  True,  True,  True, False, False],
+              [False, False, False, False, False, False,  True, False],
+              [False, False, False, False, False, False,  True,  True]]]],
+            dtype=torch.bool)
+        # fmt: on
+
+        self.assertTrue((chunked_attention_mask == EXPECTED_CHUNKED_MASK).all())
+
+    def test_chunked_mask_with_left_padding_decoding(self):
+        # Make sure we have an attention_chunk_size in the config
+        config = LlamaConfig(attention_chunk_size=4, attn_implementation="sdpa", num_hidden_layers=1)
+
+        cache = DynamicCache(config=config)
+        # Sanity check
+        self.assertEqual(len(cache), 1)
+        self.assertTrue(isinstance(cache.layers[0], DynamicSlidingWindowLayer))
+
+        # Fill in the Cache (sequence length is bigger than the chunk size here)
+        batch_size = 2
+        prefill_size = 8
+        pad_tokens = 7
+        fake_kv = torch.rand(batch_size, 32, prefill_size, 32)
+        cache.update(fake_kv, fake_kv, 0, torch.arange(prefill_size))
+
+        # Create a new input after the prefill
+        input_ids = torch.randint(100, 200, (batch_size, 1))
+        attention_mask = torch.tensor(
+            [[0 if i < pad_tokens else 1 for i in range(prefill_size + 1)], [1] * (prefill_size + 1)]
+        )
+        inputs_embeds = torch.empty_like(input_ids, dtype=torch.float16)
+        cache_position = torch.tensor([prefill_size], dtype=int)
+        position_ids = torch.tensor([[prefill_size - pad_tokens], [prefill_size]])
+
+        chunked_attention_mask = create_chunked_causal_mask(
+            config=config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=cache,
+            position_ids=position_ids,
+        )
+
+        # To understand the expected mask below, here is the full 2d mask, where the "|" characters are the chunk
+        # separators (where the tokens should stop seeing each other)
+        # [0, 0, 0, 0, 0, 0, 0, | 1, 1], -> due to left padding, the first chunk only starts after the padding tokens
+        # [| 1, 1, 1, 1, | 1, 1, 1, 1, | 1]]) -> easy case, each 4 tokens is a new chunk
+
+        # fmt: off
+        EXPECTED_CHUNKED_MASK = torch.tensor(
+            # Here, for the padded sequence, the chunks should correctly start at index 7 (the first unpadded
+            # index), so only indices 7 and 8 should be True
+            [[[[False, False, True, True]]],
+
+            # Here, for the unpadded sequence, the chunks start at index 0. Since we have 9 tokens in total, the last
+            # token (index 8) will only see itself (we have 2 full chunks before)
+            [[[False, False, False, True]]]],
+            dtype=torch.bool)
+        # fmt: on
+
+        self.assertTrue((chunked_attention_mask == EXPECTED_CHUNKED_MASK).all())
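The expected tensors in the first test can be rebuilt by hand from the rule the commit implements: a query attends to a key iff the pair is causal, the two positions fall in the same padding-shifted chunk, and the key is a real token. A hedged broadcasting sketch for the padded row (chunk_size=3, 4 padding tokens, 8 positions), independent of `create_chunked_causal_mask`:

```python
# Rebuild the padded row of EXPECTED_CHUNKED_MASK from first principles.
import torch

chunk_size, pad_tokens, seq_len = 3, 4, 8
pos = torch.arange(seq_len)

chunk_idx = (pos - pad_tokens) // chunk_size            # chunk id counted from the first real token
same_chunk = chunk_idx[:, None] == chunk_idx[None, :]   # query and key share a chunk
causal = pos[:, None] >= pos[None, :]                   # queries only look backwards
kv_is_real = (pos >= pad_tokens)[None, :]               # padded keys are never attended to

mask = causal & same_chunk & kv_is_real
print(mask.int())
# Rows 0-3 are all zeros (padded queries see no real keys), row 4 sees only itself,
# rows 5-6 see back to index 4, and row 7 opens a new chunk and sees only itself.
```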

0 commit comments