
Commit 64d6d9b

Merge pull request #2557 from AI-Hypercomputer:mohit/fix_tokamax_2
PiperOrigin-RevId: 825333068
2 parents df45a16 + 287bc39

2 files changed: 7 additions, 8 deletions

src/MaxText/layers/attention_op.py

Lines changed: 7 additions & 6 deletions
@@ -144,8 +144,8 @@ def validate_flash_attention_with_sinks_on_gpu(sinks: Array | None) -> None:
     raise ValueError("The flash attention with sinks is not supported on GPU yet.")


-# TODO(agagik): change tokamax_splash_mask._ComputableMask to be non protected
-class ChunkedCausalMask(tokamax_splash_mask._ComputableMask):  # pylint: disable=protected-access
+# TODO(agagik): change splash_attention_mask._ComputableMask to be non protected
+class ChunkedCausalMask(splash_attention_mask._ComputableMask):  # pylint: disable=protected-access
   """Lazy chunked causal mask.

   Attention is causal within each chunk (0, K), (K, 2K), (2K, 3K), ... tokens attend to each other but not across chunks.
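
For reference, the chunk-local causal pattern described in the docstring above can be illustrated with a small dense sketch. This is illustrative only, not the lazy _ComputableMask implementation in the file; the helper name and its chunk_size argument are hypothetical.

# Illustrative only: a dense rendering of the chunk-local causal pattern the
# ChunkedCausalMask docstring describes. The real class is a lazy
# _ComputableMask; this helper and its `chunk_size` argument are hypothetical.
import numpy as np

def dense_chunked_causal_mask(q_seq_len, kv_seq_len, chunk_size):
  """Returns a (q_seq_len, kv_seq_len) boolean mask where position i attends
  to position j only if j <= i and both fall in the same chunk of size chunk_size."""
  q_idx = np.arange(q_seq_len)[:, None]
  kv_idx = np.arange(kv_seq_len)[None, :]
  causal = kv_idx <= q_idx
  same_chunk = (q_idx // chunk_size) == (kv_idx // chunk_size)
  return causal & same_chunk

# With chunk_size=2, token 2 starts a new chunk: it sees itself but not tokens 0 or 1.
print(dense_chunked_causal_mask(4, 4, 2).astype(int))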
@@ -1138,10 +1138,11 @@ def create_sa_config(config, query, key, attn_logits_soft_cap):

     sa_config = create_sa_config(self.config, query, key, attn_logits_soft_cap)
     mask_shape = (query.shape[2], key.shape[2])  # (q_seq_len, kv_seq_len)
+    mask_module = tokamax_splash_mask if self.config.use_tokamax_splash else splash_attention_mask
     if self.attention_type == AttentionType.FULL:
-      mask = splash_attention_mask.FullMask(mask_shape)
+      mask = mask_module.FullMask(mask_shape)
     else:
-      mask = splash_attention_mask.CausalMask(shape=mask_shape)
+      mask = mask_module.CausalMask(shape=mask_shape)

     # Create LoadBalancedCausalMask if cp and load_balancing
     if cp_size > 1 and load_balanced_context_parallel:
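
The added mask_module line picks between the Tokamax and JAX splash-attention mask libraries once, so the FullMask/CausalMask construction below it is written only one way. A minimal, self-contained sketch of that selection pattern follows; the two SimpleNamespace stand-ins are assumptions used only so the snippet runs without either library.

# Sketch of the module-selection pattern added above. The stand-in namespaces
# below are hypothetical; in attention_op.py the real `tokamax_splash_mask`
# and `splash_attention_mask` modules expose mask classes with matching names.
from types import SimpleNamespace

tokamax_splash_mask = SimpleNamespace(FullMask=lambda shape: ("tokamax FullMask", shape))
splash_attention_mask = SimpleNamespace(FullMask=lambda shape: ("jax FullMask", shape))

def build_full_mask(use_tokamax_splash, mask_shape):
  # Choose the module once; every mask built afterwards comes from it.
  mask_module = tokamax_splash_mask if use_tokamax_splash else splash_attention_mask
  return mask_module.FullMask(mask_shape)

print(build_full_mask(True, (8, 8)))   # ('tokamax FullMask', (8, 8))
print(build_full_mask(False, (8, 8)))  # ('jax FullMask', (8, 8))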
@@ -1152,7 +1153,7 @@ def create_sa_config(config, query, key, attn_logits_soft_cap):
     if self.attention_type == AttentionType.LOCAL_SLIDING:
       if self.sliding_window_size is None:
         raise ValueError("Sliding_window_size must be set if Local Sliding attention type")
-      mask &= splash_attention_mask.LocalMask(
+      mask &= mask_module.LocalMask(
           shape=(query.shape[2], key.shape[2]),
           window_size=(self.sliding_window_size, self.sliding_window_size),
           offset=0,
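
The `&=` in this hunk intersects the base causal mask with a LocalMask so that local sliding attention only sees a window of nearby past tokens. A dense analogue of that intersection is sketched below; the window rule used here (|i - j| <= window) is a simplifying assumption, not the kernel's exact LocalMask definition.

# Dense analogue of `mask &= mask_module.LocalMask(...)` above: intersect a
# causal mask with a sliding-window constraint. The window rule used here
# (|i - j| <= window) is a simplifying assumption for illustration.
import numpy as np

def dense_sliding_window_causal(seq_len, window):
  q_idx = np.arange(seq_len)[:, None]
  kv_idx = np.arange(seq_len)[None, :]
  causal = kv_idx <= q_idx
  local = np.abs(q_idx - kv_idx) <= window
  return causal & local  # token i attends to tokens in [i - window, i]

print(dense_sliding_window_causal(6, window=2).astype(int))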
@@ -1775,7 +1776,7 @@ def __call__(


 # pylint: disable=protected-access
-class LoadBalancedCausalMask(tokamax_splash_mask._ComputableMask):
+class LoadBalancedCausalMask(splash_attention_mask._ComputableMask):
   """Lazy causal mask, prevents the model from attending to future tokens.

   Attributes:
     offset: Offset of q start wrt kv. A positive offset shifts the bottom

tests/attention_test.py

Lines changed: 0 additions & 2 deletions
@@ -576,7 +576,6 @@ def tpu_kernel_attention_helper(self, num_kv_heads):
       },
   )
   # TODO (b/454764135.) : This tests fails with new tokamax kernel
-  @pytest.mark.skip(reason="Issue w/ tokamax kernel CP->EP sharding correctness. ")
   @pytest.mark.tpu_only
   def test_tpu_flash_attention_context_parallel(
       self, ici_context_parallelism, context_parallel_load_balance, ici_expert_parallelism, expert_shard_attention_option
@@ -1289,7 +1288,6 @@ def test_projection_initialization(self):
       },
   )
   # TODO (b/454764135.) : This tests fails with new tokamax kernel
-  @pytest.mark.skip(reason="Issue w/ tokamax kernel CP->EP sharding correctness. ")
   @pytest.mark.tpu_only
   def test_tpu_flash_attention_context_parallel(
       self, ici_context_parallelism, context_parallel_load_balance, ici_expert_parallelism, expert_shard_attention_option
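
With the two @pytest.mark.skip decorators removed, test_tpu_flash_attention_context_parallel runs again wherever the tpu_only marker is selected. As a usage note (this invocation is an assumption, not part of the commit), the re-enabled tests could be targeted on a TPU host roughly like this:

# Hypothetical invocation, not from the commit: select only the re-enabled
# context-parallel tests by name and keep the repo's tpu_only marker filter.
import pytest

if __name__ == "__main__":
  raise SystemExit(pytest.main([
      "tests/attention_test.py",
      "-k", "test_tpu_flash_attention_context_parallel",
      "-m", "tpu_only",
  ]))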
