Use boolean values for partial mask blocks in the splash attention kernel.

Rifur13 · Google-ML-Automation · commit e92ca9bbaee1 · 2024-12-11T14:59:30.000-08:00
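The entries of partial_mask_blocks are guaranteed to be 0 or 1 because we create this array ourselves when processing the masks into a MaskInfo object, so it can be stored as np.bool_ and combined directly in the kernel without first converting from int32.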

PiperOrigin-RevId: 705252534
diff --git a/jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_kernel.py b/jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_kernel.py
@@ -603,9 +603,9 @@ def _apply_mask_and_soft_cap(
     else:
       mask = pl.load(mask_ref, (k_slice, slice(None)))
 
-    snm = jnp.where(should_not_mask, 1, 0)
-    masks.append(jnp.bitwise_or(mask, jnp.broadcast_to(snm, mask.shape)) != 0)
-
+    masks.append(
+        jnp.bitwise_or(mask, jnp.broadcast_to(should_not_mask, mask.shape))
+    )
   if mask_function is not None:
     # Compute the mask using the given q_sequence indices.
     # KV indices are computed on the fly. This works because we only support Q
@@ -900,6 +900,16 @@ def _splash_attention_forward(
     kv_seq_len_dimension = 1
     num_kv_heads = k.shape[0]
 
+  partial_mask_blocks = fwd_mask_info.partial_mask_blocks
+  if (
+      partial_mask_blocks is not None
+      and jnp.dtype(partial_mask_blocks.dtype) != np.bool_
+  ):
+    raise ValueError(
+        "partial_mask_blocks must be of type np.bool_ but got"
+        f" {partial_mask_blocks.dtype}"
+    )
+
   if len(k.shape) != expected_kv_rank:
     raise ValueError(
         f"Expected {expected_kv_rank}-dim 'key' tensor for MQA. Instead got a"
diff --git a/jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_mask_info.py b/jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_mask_info.py
@@ -56,7 +56,7 @@ class MaskInfo(NamedTuple):
       indicates that the corresponding block in the full mask contained both
       zeros and ones. An entry of 2 indicates the corresponding block was
       entirely ones.
-    partial_mask_blocks: A i32[num_partial_blocks, block_q, block_kv] NumPy
+    partial_mask_blocks: A bool[num_partial_blocks, block_q, block_kv] NumPy
       array that contains the blocks of the original mask that contained both
       zeros and ones. The entries in `mask_next` point to indices in the first
       axis of this array.
@@ -305,7 +305,7 @@ def _get_mask_info_for_shard(
 
 
 # When used in a transformer network with multiple layers, the SplashAttention
-# kernel is created serveral times with the same mask. Cache MaskInfo to avoid
+# kernel is created several times with the same mask. Cache MaskInfo to avoid
 # blowing up compile times. Ideally the size of the cache should be determined
 # by the client.
 @functools.lru_cache(maxsize=12)
@@ -376,14 +376,6 @@ def _process_mask(
   if mod != 0:
     raise ValueError(f'{head_shards=} should divide {head_count=}.')
 
-  first_mask_size = mask.masks[0].shape
-  for h in range(head_count):
-    if mask.masks[h].shape != first_mask_size:
-      raise ValueError(
-          f'First head mask has shape {first_mask_size}, but head mask {h} has'
-          f' shape {mask.masks[h].shape}. All head masks must have the same'
-          ' shape.'
-      )
 
   # Uniquify the masks.
   # Create a collection of the unique head masks in the input multi-head mask.
@@ -526,13 +518,9 @@ def set_block_mask(mask_id: int, q_index: int, kv_index: int, value: int):
 
   partial_mask_blocks = None
   has_mask_next = False
-  if len(unique_partial_mask_blocks) == 1:
-    partial_mask_blocks = [x.array for x in unique_partial_mask_blocks]
-    partial_mask_blocks = partial_mask_blocks[0][None].astype(np.int32)
-    has_mask_next = True
-  elif len(unique_partial_mask_blocks) > 1:
+  if len(unique_partial_mask_blocks) >= 1:
     partial_mask_blocks = [x.array for x in unique_partial_mask_blocks]
-    partial_mask_blocks = np.stack(partial_mask_blocks, axis=0).astype(np.int32)
+    partial_mask_blocks = np.stack(partial_mask_blocks, axis=0).astype(np.bool_)
     has_mask_next = True
   if is_dkv and partial_mask_blocks is not None:
     partial_mask_blocks = np.swapaxes(partial_mask_blocks, -1, -2)
diff --git a/tests/pallas/tpu_splash_attention_mask_test.py b/tests/pallas/tpu_splash_attention_mask_test.py
@@ -18,9 +18,9 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 import jax
+from jax._src import test_util as jtu
 from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_mask as mask_lib
 from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_mask_info as mask_info_lib
-from jax._src import test_util as jtu
 import numpy as np
 
 jax.config.parse_flags_with_absl()
@@ -798,7 +798,7 @@ def test_two_causal_masks(self, is_lazy_mask: bool):
         self._expected_causal_data_next[None],
         self._expected_causal_mask_next(0)[None] if not is_lazy_mask else None,
         self._expected_causal_block_mask[None],
-        np.expand_dims(np.tril(np.ones(block_shape, dtype=np.int32)), 0)
+        np.expand_dims(np.tril(np.ones(block_shape, dtype=np.bool_)), 0)
         if not is_lazy_mask
         else None,
         np.arange(sequence_lengths[0], dtype=np.int32)
@@ -813,7 +813,7 @@ def test_two_causal_masks(self, is_lazy_mask: bool):
         else None,
         self._expected_causal_block_mask_dkv[None],
         np.expand_dims(
-            np.tril(np.ones(block_shape, dtype=np.int32)), 0
+            np.tril(np.ones(block_shape, dtype=np.bool_)), 0
         ).swapaxes(-1, -2)
         if not is_lazy_mask
         else None,
@@ -851,7 +851,7 @@ def test_rectangular_wide_causal_mask(self, is_lazy_mask: bool):
         self._expected_causal_data_next[None],
         self._expected_causal_mask_next(0)[None] if not is_lazy_mask else None,
         self._expected_causal_block_mask[None],
-        np.expand_dims(np.tril(np.ones(block_shape, dtype=np.int32)), 0)
+        np.expand_dims(np.tril(np.ones(block_shape, dtype=np.bool_)), 0)
         if not is_lazy_mask
         else None,
         np.arange(sequence_lengths[0], dtype=np.int32)
@@ -894,7 +894,7 @@ def test_rectangular_wide_causal_mask(self, is_lazy_mask: bool):
         expected_causal_mask_next_dkv if not is_lazy_mask else None,
         expected_causal_block_mask_dkv,
         np.expand_dims(
-            np.tril(np.ones(block_shape, dtype=np.int32)), 0
+            np.tril(np.ones(block_shape, dtype=np.bool_)), 0
         ).swapaxes(-1, -2)
         if not is_lazy_mask
         else None,
@@ -974,7 +974,7 @@ def test_rectangular_tall_causal_mask(self, is_lazy_mask: bool):
         expected_causal_data_next,
         expected_causal_mask_next if not is_lazy_mask else None,
         expected_causal_block_mask,
-        np.expand_dims(np.tril(np.ones(block_shape, dtype=np.int32)), 0)
+        np.expand_dims(np.tril(np.ones(block_shape, dtype=np.bool_)), 0)
         if not is_lazy_mask
         else None,
         np.arange(sequence_lengths[0], dtype=np.int32)
@@ -1029,7 +1029,7 @@ def test_rectangular_tall_causal_mask(self, is_lazy_mask: bool):
         expected_causal_mask_next_dkv if not is_lazy_mask else None,
         expected_causal_block_mask_dkv,
         np.expand_dims(
-            np.tril(np.ones(block_shape, dtype=np.int32)), 0
+            np.tril(np.ones(block_shape, dtype=np.bool_)), 0
         ).swapaxes(-1, -2)
         if not is_lazy_mask
         else None,
@@ -1069,10 +1069,10 @@ def test_local_mask(self, is_lazy_mask: bool):
     expected_partial_mask_blocks = self._stack(
         [
             np.triu(
-                np.tri(*block_shape, window_size, dtype=np.int32), -window_size
+                np.tri(*block_shape, window_size, dtype=np.bool_), -window_size
             ),
-            np.tri(*block_shape, -window_size, dtype=np.int32),
-            np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+            np.tri(*block_shape, -window_size, dtype=np.bool_),
+            np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
         ],
     )
 
@@ -1179,8 +1179,8 @@ def test_local_mask_narrow(self, is_lazy_mask: bool):
 
     expected_partial_mask_blocks = self._stack(
         [
-            np.triu(np.tri(*block_shape, 0, dtype=np.int32), -window_size),
-            np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+            np.triu(np.tri(*block_shape, 0, dtype=np.bool_), -window_size),
+            np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
         ],
     )
 
@@ -1298,13 +1298,13 @@ def test_two_head_shards_one_causal_one_local(self, is_lazy_mask: bool):
     )
 
     expected_partial_mask_blocks = self._stack([
-        np.tril(np.ones(block_shape, dtype=np.int32)),
+        np.tril(np.ones(block_shape, dtype=np.bool_)),
         np.triu(
-            np.tri(*block_shape, window_size, dtype=np.int32),
+            np.tri(*block_shape, window_size, dtype=np.bool_),
             -window_size,
         ),
-        np.tri(*block_shape, -window_size, dtype=np.int32),
-        np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+        np.tri(*block_shape, -window_size, dtype=np.bool_),
+        np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
     ])
 
     expected_block_mask_dkv = self._stack(
@@ -1384,7 +1384,7 @@ def test_two_head_shards_causal_full(self, is_lazy_mask: bool):
     ])
 
     expected_partial_mask_blocks = np.expand_dims(
-        np.tril(np.ones(block_shape, dtype=np.int32)), 0
+        np.tril(np.ones(block_shape, dtype=np.bool_)), 0
     )
 
     expected_mask_info = mask_info_lib.MaskInfo(
@@ -1460,13 +1460,13 @@ def test_two_qseq_shards_causal_local(self, is_lazy_mask: bool):
     )
 
     expected_partial_mask_blocks = self._stack([
-        np.tril(np.ones(block_shape, dtype=np.int32)),
+        np.tril(np.ones(block_shape, dtype=np.bool_)),
         np.triu(
-            np.tri(*block_shape, window_size, dtype=np.int32),
+            np.tri(*block_shape, window_size, dtype=np.bool_),
             -window_size,
         ),
-        np.tri(*block_shape, -window_size, dtype=np.int32),
-        np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+        np.tri(*block_shape, -window_size, dtype=np.bool_),
+        np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
     ])
 
     expected_mask_info = mask_info_lib.MaskInfo(
@@ -1577,13 +1577,13 @@ def test_two_qseq_shards_causal_local_stacked(self):
     )
 
     expected_partial_mask_blocks = self._stack([
-        np.tril(np.ones(block_shape, dtype=np.int32)),
+        np.tril(np.ones(block_shape, dtype=np.bool_)),
         np.triu(
-            np.tri(*block_shape, window_size, dtype=np.int32),
+            np.tri(*block_shape, window_size, dtype=np.bool_),
             -window_size,
         ),
-        np.tri(*block_shape, -window_size, dtype=np.int32),
-        np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+        np.tri(*block_shape, -window_size, dtype=np.bool_),
+        np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
     ])
 
     expected_mask_info = mask_info_lib.MaskInfo(
@@ -1749,13 +1749,13 @@ def test_two_qseq_shards_local_wide_local_narrow_stacked(self):
     expected_partial_mask_blocks = self._stack([
         # Wide
         np.triu(
-            np.tri(*block_shape, window_size, dtype=np.int32),
+            np.tri(*block_shape, window_size, dtype=np.bool_),
             -window_size,
         ),
-        np.tri(*block_shape, -window_size, dtype=np.int32),
-        np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+        np.tri(*block_shape, -window_size, dtype=np.bool_),
+        np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
         # Narrow
-        np.triu(np.tri(*block_shape, 0, dtype=np.int32), -window_size),
+        np.triu(np.tri(*block_shape, 0, dtype=np.bool_), -window_size),
     ])
 
     expected_mask_info = mask_info_lib.MaskInfo(
@@ -1890,7 +1890,7 @@ def test_two_head_shards_causal_mask(self, is_lazy_mask: bool):
     )
 
     expected_partial_mask_blocks = np.expand_dims(
-        np.tril(np.ones(block_shape, dtype=np.int32)), 0
+        np.tril(np.ones(block_shape, dtype=np.bool_)), 0
     )
 
     expected_mask_info = mask_info_lib.MaskInfo(
@@ -1979,13 +1979,13 @@ def test_two_head_shards_two_causal_two_local(self, is_lazy_mask: bool):
 
     expected_partial_mask_blocks = self._stack(
         [
-            np.tril(np.ones(block_shape, dtype=np.int32)),
+            np.tril(np.ones(block_shape, dtype=np.bool_)),
             np.triu(
-                np.tri(*block_shape, window_size, dtype=np.int32),
+                np.tri(*block_shape, window_size, dtype=np.bool_),
                 -window_size,
             ),
-            np.tri(*block_shape, -window_size, dtype=np.int32),
-            np.triu(np.ones(block_shape, dtype=np.int32), window_size),
+            np.tri(*block_shape, -window_size, dtype=np.bool_),
+            np.triu(np.ones(block_shape, dtype=np.bool_), window_size),
         ],
     )