[pallas:mosaic_gpu] emit_pipeline now maintains the grid indices

superbobry · Google-ML-Automation · commit 15f30a9e9c71 · 2024-11-12T04:39:17.000-08:00
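Previously, it recomputed them from the step counter (via `lax.div`/`lax.rem`) at every loop iteration; they are now carried through the `fori_loop` and incremented by one per step.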

PiperOrigin-RevId: 695682116
diff --git a/jax/_src/pallas/mosaic_gpu/pipeline.py b/jax/_src/pallas/mosaic_gpu/pipeline.py
@@ -30,6 +30,7 @@
 from jax._src.pallas.mosaic_gpu import core as gpu_core
 from jax._src.pallas.mosaic_gpu import primitives as gpu_primitives
 from jax.experimental import pallas as pl
+import jax.numpy as jnp
 
 
 map = util.safe_map
@@ -72,15 +73,16 @@ def copy_out(self, slot, grid_indices):
 )
 
 
-def make_grid_indices(
-    step: jax.typing.ArrayLike, grid: Sequence[int]
+def _inc_grid_by_1(
+    indices: tuple[jax.Array, ...], grid: Sequence[int]
 ) -> tuple[jax.Array, ...]:
-  # TODO(slebedev): Maintain the grid index through the fori_loop instead.
-  indices = []
-  for size in reversed(grid):
-    indices.append(lax.rem(step, size))
-    step = lax.div(step, size)
-  return tuple(reversed(indices))
+  next_indices = []
+  carry: bool | jax.Array = True
+  for idx, size in reversed(list(zip(indices, grid))):
+    next_idx = lax.select(carry, idx + 1, idx)
+    carry = next_idx == size
+    next_indices.append(lax.select(carry, 0, next_idx).astype(idx.dtype))
+  return tuple(reversed(next_indices))
 
 
 def emit_pipeline(
@@ -143,15 +145,15 @@ def scoped_pipeline(
     ):
       map(lambda bref: bref.copy_in(step, indices, barrier_ref), in_brefs)
 
-    def loop_body(step, _):
+    def loop_body(step, carry):
       slot = step % max_concurrent_steps
+      indices, fetch_indices = carry
 
       # Wait for the current GMEM->SMEM copy to complete.
       gpu_primitives.barrier_wait(barrier_ref.at[slot])
       # Wait for the previous output SMEM->GMEM copy to complete.
       gpu_primitives.wait_smem_to_gmem(max_concurrent_steps - 1)
 
-      indices = make_grid_indices(step, grid)
       with pallas_core.grid_env(map(pallas_core.GridAxis, indices, grid)):
         body(
             *(bref.smem_ref.at[slot] for bref in it.chain(in_brefs, out_brefs))
@@ -166,17 +168,19 @@ def loop_body(step, _):
       jax.lax.cond(
           fetch_step < num_steps,
           lambda: map(
-              lambda bref: bref.copy_in(
-                  fetch_slot, make_grid_indices(fetch_step, grid), barrier_ref
-              ),
+              lambda bref: bref.copy_in(fetch_slot, fetch_indices, barrier_ref),
               in_brefs,
           ),
           lambda: [None] * len(in_brefs),
       )
 
-      return ()
+      return _inc_grid_by_1(indices, grid), _inc_grid_by_1(fetch_indices, grid)
 
-    lax.fori_loop(0, num_steps, loop_body, ())
+    indices = (jnp.asarray(0, dtype=lax.dtype(0)),) * len(grid)
+    fetch_indices = indices
+    for _ in range(max_concurrent_steps):
+      fetch_indices = _inc_grid_by_1(fetch_indices, grid)
+    lax.fori_loop(0, num_steps, loop_body, (indices, fetch_indices))
 
     # Finalize the pipeline.
     gpu_primitives.wait_smem_to_gmem(0)
diff --git a/jax/experimental/mosaic/gpu/fragmented_array.py b/jax/experimental/mosaic/gpu/fragmented_array.py
@@ -1184,7 +1184,11 @@ def select(self, on_true, on_false):
         or ir.IntegerType(self.mlir_dtype).width != 1
     ):
       raise NotImplementedError
-    return self._pointwise(arith.select, on_true, on_false)
+    # We change the receiver here, because the return type is defined by
+    # `on_true` and `on_false` and not the predicate `self`.
+    return on_true._pointwise(
+        lambda t, p, f: arith.select(p, t, f), self, on_false,
+    )
 
   def foreach(self, fn: Callable[[ir.Value, tuple[ir.Value, ...]], None]):
     """Call a function for each value and index."""