Commit 80784a5

Merge pull request jax-ml#26387 from Rifur13:sharding
PiperOrigin-RevId: 738919611
2 parents: ea7fa29 + 412f1d3

6 files changed (+351, -40 lines)

jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_kernel.py

Lines changed: 24 additions & 1 deletion
@@ -2293,6 +2293,26 @@ def _splash_attention(
     mask_function: MaskFunctionType | None,
     interpret: bool,
 ) -> SplashCustomReturnType:
+  """
+  For dynamic masks, `partial_mask_blocks` has shape (head_count, q_blocks, kv_blocks, block_q, block_kv).
+  This shape allows sharding across both head count and query sequence dimensions.
+
+  Note: The leading dimensions (head_count, q_blocks, kv_blocks) must be
+  collapsed into a single dimension before being passed to the kernel.
+  """
+  def _collapse_partial_mask_blocks(mask_info: mask_info_lib.MaskInfo | None):
+    if mask_info is None or mask_info.partial_mask_blocks is None:
+      return mask_info
+
+    return mask_info._replace(
+        partial_mask_blocks=mask_info.partial_mask_blocks.reshape(
+            -1, *mask_info.partial_mask_blocks.shape[-2:]
+        )
+    )
+
+  fwd_mask_info = _collapse_partial_mask_blocks(fwd_mask_info)
+  dq_mask_info = _collapse_partial_mask_blocks(dq_mask_info)
+  dkv_mask_info = _collapse_partial_mask_blocks(dkv_mask_info)
   return _splash_attention_custom(
       fwd_mask_info,
       dq_mask_info,
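The collapse in the hunk above only folds the three leading block-index dimensions together; the per-block tile shape is untouched. A minimal sketch of the same reshape, with hypothetical dimensions chosen purely for illustration:

import jax.numpy as jnp

# Hypothetical dynamic-mask dimensions, for illustration only.
head_count, q_blocks, kv_blocks, block_q, block_kv = 2, 4, 4, 128, 128
partial_mask_blocks = jnp.zeros(
    (head_count, q_blocks, kv_blocks, block_q, block_kv), dtype=jnp.bool_
)

# Same reshape as _collapse_partial_mask_blocks: flatten all leading
# dimensions, keep the trailing (block_q, block_kv) tile shape.
collapsed = partial_mask_blocks.reshape(-1, *partial_mask_blocks.shape[-2:])
assert collapsed.shape == (head_count * q_blocks * kv_blocks, block_q, block_kv)

Keeping the uncollapsed layout until this point is what lets the mask info be sharded over heads and query blocks before the kernel sees a flat list of blocks.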
@@ -2352,13 +2372,16 @@ def manual_sharding_spec(self, sharding: jax.sharding.NamedSharding):
     spec = sharding.spec
     assert len(spec) == 2
     replicated = jax.sharding.PartitionSpec()
+    partial_mask_blocks_spec = (
+        spec if self.fwd_mask_info.is_dynamic_mask else replicated
+    )
     # Shard q_sequence over the sequence dimension only.
     q_sequence_spec = jax.sharding.PartitionSpec(spec[1])
     mask_info_specs = mask_info_lib.MaskInfo(  # pytype: disable=wrong-arg-types
         data_next=spec if self.fwd_mask_info.data_next is not None else None,
         mask_next=spec if self.fwd_mask_info.mask_next is not None else None,
         block_mask=spec if self.fwd_mask_info.block_mask is not None else None,
-        partial_mask_blocks=replicated
+        partial_mask_blocks=partial_mask_blocks_spec
         if self.fwd_mask_info.partial_mask_blocks is not None
         else None,
         q_sequence=q_sequence_spec
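Here `spec` is asserted to have two entries (a head axis and a sequence axis). Applied to the rank-5 uncollapsed `partial_mask_blocks`, a 2-element PartitionSpec shards only the two leading dimensions (head_count and q_blocks) and leaves the rest replicated. A rough sketch of that behaviour, assuming a hypothetical 2x2 mesh with axes ('heads', 'q_seq') and at least four local devices (everything below is illustrative, not code from this change):

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Hypothetical mesh; requires >= 4 devices (e.g. 4 TPU chips).
mesh = Mesh(np.array(jax.devices()[:4]).reshape(2, 2), ('heads', 'q_seq'))

spec = PartitionSpec('heads', 'q_seq')
replicated = PartitionSpec()
is_dynamic_mask = True  # stand-in for self.fwd_mask_info.is_dynamic_mask
partial_mask_blocks_spec = spec if is_dynamic_mask else replicated

# Rank-5 dynamic-mask blocks: the 2-element spec shards axes 0 and 1 and
# replicates the trailing (kv_blocks, block_q, block_kv) axes.
blocks = jnp.zeros((8, 4, 4, 128, 128), dtype=jnp.bool_)
blocks = jax.device_put(blocks, NamedSharding(mesh, partial_mask_blocks_spec))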

jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_mask_info.py

Lines changed: 83 additions & 27 deletions
@@ -67,13 +67,18 @@ class MaskInfo(NamedTuple):
     q_sequence: A i32[q_sequence_length] NumPy array. When using causal masking,
       this contains the list of indices that correspond to q tokens. For plain
       causal this is just np.arange(q_sequence_length).
+    is_dynamic_mask: A bool indicating whether the mask is dynamic or static.
+      When True, the leading dimensions of `partial_mask_blocks` (num_heads,
+      q_blocks, kv_blocks) are not collapsed, allowing us to shard it along
+      those dimensions.
   """

   data_next: np.ndarray | jax.Array | None
   mask_next: np.ndarray | jax.Array | None
   block_mask: np.ndarray | jax.Array | None
   partial_mask_blocks: np.ndarray | jax.Array | None
   q_sequence: np.ndarray | None
+  is_dynamic_mask: bool = None


 def _downcast_to_small_type(array: np.ndarray) -> np.ndarray:
@@ -168,7 +173,7 @@ def __eq__(self, other: object) -> bool:
 def _get_mask_info_for_shard(
     output_shape: tuple[int, int, int],
     has_mask_next: bool,
-    mask: mask_lib.MultiHeadMask,
+    mask: mask_lib.MultiHeadMask | jax.Array,
     block_shape: tuple[int, int],
     coords_to_partial_mask_block_index: dict[tuple[int, int, int], int],
     masks_per_head_shard: int,
@@ -338,7 +343,8 @@ def _process_dynamic_mask(
       launched.
     q_seq_shards: Number of Q sequence shards of the mesh in which the kernel is
       launched.
-    shrink_grid: Whether or not we should apply the grid shrinking optimization. This is currently ignored.
+    shrink_grid: Whether or not we should apply the grid shrinking optimization.
+      This is currently ignored.

   Returns:
     `MaskInfo`, a sparse representation of the dense mask.
@@ -349,11 +355,6 @@ def _process_dynamic_mask(
   """

   del shrink_grid
-
-  # TODO(pobudzey): Properly support sharding.
-  if head_shards != 1 or q_seq_shards != 1:
-    raise ValueError('Dynamic mask processing does not support sharding.')
-
   if len(mask.shape) != 3:
     raise ValueError(f'Expected a 3-dim mask, instead got: {mask.shape}.')

@@ -370,6 +371,18 @@ def _process_dynamic_mask(
   if kv_mod != 0:
     raise ValueError(f'{kv_block_size=} should divide {kv_seq_len=}.')

+  q_seq_len_per_shard, mod = divmod(q_seq_len, q_seq_shards)
+  if mod != 0:
+    raise ValueError(f'{q_seq_shards=} should divide {q_seq_len=}.')
+
+  q_blocks_per_shard, mod = divmod(q_seq_len_per_shard, q_block_size)
+  if mod != 0:
+    raise ValueError(f'{q_block_size=} should divide {q_seq_len_per_shard=}.')
+
+  heads_per_shard, mod = divmod(head_count, head_shards)
+  if mod != 0:
+    raise ValueError(f'{head_shards=} should divide {head_count=}.')
+
   block_mask_shape = (
       head_count,
       q_blocks_count,
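The three checks added above just pin down a consistent shard geometry. A tiny worked example with hypothetical sizes (not taken from the change itself):

# Hypothetical shard geometry, chosen only to illustrate the checks.
q_seq_len, q_seq_shards, q_block_size = 4096, 2, 512
head_count, head_shards = 8, 2

q_seq_len_per_shard, mod = divmod(q_seq_len, q_seq_shards)           # 2048, 0
assert mod == 0
q_blocks_per_shard, mod = divmod(q_seq_len_per_shard, q_block_size)  # 4, 0
assert mod == 0
heads_per_shard, mod = divmod(head_count, head_shards)               # 4, 0
assert mod == 0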
@@ -398,26 +411,66 @@ def _process_dynamic_mask(
   block_mask = jnp.where(is_full_mask, 2, block_mask)
   block_mask = jnp.where(is_empty_mask, 0, block_mask)

-  # TODO(pobudzey): Return the next valid mask index instead of 0 for a more efficient pipeline.
-  mask_next = jnp.where(
-      jnp.logical_or(is_empty_mask, is_full_mask),
-      0,
-      jnp.arange(math.prod(block_mask_shape), dtype=np.int32).reshape(
-          block_mask_shape
-      ),
-  )
+  q_sequence_axis = 1
+  head_axis = 0

-  # data_next stores the index of the next non-empty data block in the sequence.
-  # The indices of empty blocks are set to 0 to avoid copying extra data when
-  # pipeling.
-  if is_dkv:
-    data_next = jnp.arange(q_blocks_count, dtype=np.int32)[None, :, None]
-  else:
-    data_next = jnp.arange(kv_blocks_count, dtype=np.int32)[None, None, :]
-  data_next = jnp.broadcast_to(data_next, block_mask_shape)
-  data_next = jnp.where(is_empty_mask, 0, data_next)
+  # Each iteration of the loop processes a slice of the mask info tensors of
+  # this shape:
+  mask_info_slice_shape = (heads_per_shard, q_blocks_per_shard, kv_blocks_count)
+
+  # Collect mask_info shards along the head dimension, concatenate (or
+  # broadcast) them after the loop.
+  data_next_per_head_list, mask_next_per_head_list = [], []
+  for head_shard in range(head_shards):
+    head_start = head_shard * heads_per_shard
+    mask_head_slice = slice(head_start, head_start + heads_per_shard)
+
+    # Collect mask_info shards along the q_sequence dimension, concatenate
+    # them after the loop.
+    data_next_sequence_slices, mask_next_sequence_slices = [], []
+    for q_seq_len_shard in range(q_seq_shards):
+      q_seq_len_start = q_seq_len_shard * q_blocks_per_shard
+      blocked_q_seq_len_slice = slice(
+          q_seq_len_start, q_seq_len_start + q_blocks_per_shard
+      )
+      local_block_mask = block_mask[mask_head_slice, blocked_q_seq_len_slice]
+
+      mask_next_slice = jnp.arange(
+          math.prod(mask_info_slice_shape), dtype=np.int32
+      ).reshape(mask_info_slice_shape)
+      mask_next_slice = jnp.where(local_block_mask == 1, mask_next_slice, 0)
+
+      # data_next stores the index of the next non-empty data block in the
+      # sequence. The indices of empty blocks are set to 0 to avoid copying
+      # extra data when pipelining.
+      if is_dkv:
+        data_next_slice = jnp.arange(q_blocks_per_shard, dtype=np.int32)[
+            None, :, None
+        ]
+      else:
+        data_next_slice = jnp.arange(kv_blocks_count, dtype=np.int32)[
+            None, None, :
+        ]
+      data_next_slice = jnp.broadcast_to(data_next_slice, mask_info_slice_shape)
+      data_next_slice = jnp.where(local_block_mask == 0, 0, data_next_slice)
+
+      data_next_sequence_slices.append(data_next_slice)
+      mask_next_sequence_slices.append(mask_next_slice)
+
+    # Concatenate the sequence shards.
+    data_next_per_head = jnp.concatenate(
+        data_next_sequence_slices, axis=q_sequence_axis
+    )
+    data_next_per_head_list.append(data_next_per_head)
+    mask_next_per_head = jnp.concatenate(
+        mask_next_sequence_slices, axis=q_sequence_axis
+    )
+    mask_next_per_head_list.append(mask_next_per_head)
+
+  # Concatenate (or broadcast) the head shards.
+  data_next = jnp.concatenate(data_next_per_head_list, axis=head_axis)
+  mask_next = jnp.concatenate(mask_next_per_head_list, axis=head_axis)

-  partial_mask_blocks = partial_mask_blocks.reshape(-1, *block_shape)
   if is_dkv:
     partial_mask_blocks = partial_mask_blocks.swapaxes(-1, -2)

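To make the per-shard bookkeeping above concrete, here is the same jnp.where logic applied to one small, hypothetical slice (heads_per_shard=1, q_blocks_per_shard=2, kv_blocks_count=3); block-mask values 0/1/2 mean empty/partial/full:

import math
import numpy as np
import jax.numpy as jnp

# (heads_per_shard, q_blocks_per_shard, kv_blocks_count), hypothetical.
mask_info_slice_shape = (1, 2, 3)
local_block_mask = jnp.array([[[2, 1, 0],
                               [1, 2, 1]]], dtype=jnp.int32)

# mask_next: flat index of the partial mask block; 0 for full/empty blocks.
mask_next_slice = jnp.arange(
    math.prod(mask_info_slice_shape), dtype=np.int32
).reshape(mask_info_slice_shape)
mask_next_slice = jnp.where(local_block_mask == 1, mask_next_slice, 0)
# -> [[[0, 1, 0], [3, 0, 5]]]

# data_next (non-dkv case): index of the kv data block to fetch; 0 for empty blocks.
data_next_slice = jnp.broadcast_to(
    jnp.arange(3, dtype=np.int32)[None, None, :], mask_info_slice_shape
)
data_next_slice = jnp.where(local_block_mask == 0, 0, data_next_slice)
# -> [[[0, 1, 0], [0, 1, 2]]]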
@@ -438,9 +491,11 @@ def _downcast(array: jax.Array, max_value: int) -> jax.Array:
   if downcast_smem_data:
     block_mask = block_mask.astype(np.int8)  # values are in the range [0, 1, 2]
     data_next = _downcast(
-        data_next, q_blocks_count if is_dkv else kv_blocks_count
+        data_next, q_blocks_per_shard if is_dkv else kv_blocks_count
+    )
+    mask_next = _downcast(
+        mask_next, heads_per_shard * q_blocks_per_shard * kv_blocks_count
     )
-    mask_next = _downcast(mask_next, math.prod(block_mask_shape))

   return (
       MaskInfo(
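The mask_next bound passed to _downcast above is now the per-shard slice size (heads_per_shard * q_blocks_per_shard * kv_blocks_count) rather than the global math.prod(block_mask_shape), since the indices built in the loop are local to a slice. A tighter bound can permit a narrower SMEM dtype; a hypothetical example of the two bounds (the body of `_downcast` is not part of this hunk, only its call sites):

import numpy as np

# Hypothetical geometry: 8 heads over 2 head shards, 8 q blocks over 2
# sequence shards, 4 kv blocks.
heads_per_shard, q_blocks_per_shard, kv_blocks_count = 4, 4, 4
head_count, q_blocks_count = 8, 8

global_bound = head_count * q_blocks_count * kv_blocks_count               # 256
per_shard_bound = heads_per_shard * q_blocks_per_shard * kv_blocks_count   # 64

assert global_bound > np.iinfo(np.int8).max      # would need at least int16
assert per_shard_bound <= np.iinfo(np.int8).max  # fits in int8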
@@ -449,6 +504,7 @@ def _downcast(array: jax.Array, max_value: int) -> jax.Array:
           block_mask=block_mask,
           partial_mask_blocks=partial_mask_blocks,
           q_sequence=None,
+          is_dynamic_mask=True,
       ),
       None,
   )

tests/pallas/BUILD

Lines changed: 15 additions & 0 deletions
@@ -540,6 +540,21 @@ jax_multiplatform_test(
     ] + py_deps("absl/testing") + py_deps("numpy") + py_deps("hypothesis"),
 )

+jax_multiplatform_test(
+    name = "tpu_splash_attention_kernel_sharded_test",
+    srcs = ["tpu_splash_attention_kernel_sharded_test.py"],
+    enable_configs = [
+        "tpu_v5e_4x2",
+        "tpu_v5p_2x2",
+    ],
+    shard_count = 5,
+    deps = [
+        "//jax:extend",
+        "//jax:pallas_tpu",
+        "//jax:pallas_tpu_ops",
+    ],
+)
+
 # This test doesn't need a TPU; it only tests numpy-using helpers.
 jax_py_test(
     name = "tpu_splash_attention_mask_test",
