[pallas:mosaic_gpu] wait_smem_to_gmem now supports wait_read_only

superbobry · Google-ML-Automation · commit c76e5fe9a0d1 · 2024-11-20T04:29:33.000-08:00
PiperOrigin-RevId: 698343812
diff --git a/jax/_src/pallas/mosaic_gpu/pipeline.py b/jax/_src/pallas/mosaic_gpu/pipeline.py
@@ -207,7 +207,9 @@ def loop_body(step, carry):
         # Wait for the current GMEM->SMEM copy to complete.
         gpu_primitives.barrier_wait(barrier_ref.at[slot])
       # Wait for the previous output SMEM->GMEM copy to complete.
-      gpu_primitives.wait_smem_to_gmem(max_concurrent_steps - 1)
+      gpu_primitives.wait_smem_to_gmem(
+          max_concurrent_steps - 1, wait_read_only=True
+      )
 
       with pallas_core.grid_env(map(pallas_core.GridAxis, indices, grid)):
         body(
diff --git a/jax/_src/pallas/mosaic_gpu/primitives.py b/jax/_src/pallas/mosaic_gpu/primitives.py
@@ -363,20 +363,30 @@ def barrier_wait(barrier: pallas_core.AbstractMemoryRef) -> None:
 
 
 @wait_smem_to_gmem_p.def_effectful_abstract_eval
-def _wait_smem_to_gmem_abstract_eval(n):
-  del n  # Unused.
+def _wait_smem_to_gmem_abstract_eval(n, *, wait_read_only):
+  del n, wait_read_only  # Unused.
   return (), {gpu_core._memory_effect}
 
 
 @lowering.register_lowering_rule(wait_smem_to_gmem_p)
-def _wait_smem_to_gmem_lowering(ctx: lowering.LoweringRuleContext, n):
-  ctx.launch_ctx.await_async_copy(allow_groups=n)
+def _wait_smem_to_gmem_lowering(
+    ctx: lowering.LoweringRuleContext, n, *, wait_read_only
+):
+  ctx.launch_ctx.await_async_copy(
+      allow_groups=n, await_read_only=wait_read_only
+  )
   return ()
 
 
-def wait_smem_to_gmem(n: int) -> None:
-  """Waits until there are no more than ``n`` SMEM->GMEM copies in flight."""
-  wait_smem_to_gmem_p.bind(n)
+def wait_smem_to_gmem(n: int, wait_read_only: bool = False) -> None:
+  """Waits until there are no more than ``n`` SMEM->GMEM copies in flight.
+
+  Args:
+    n: The maximum number of copies allowed to remain in flight.
+    wait_read_only: If ``True``, wait only for the in-flight copies to finish
+      reading from SMEM; the writes to GMEM are not waited for.
+  """
+  wait_smem_to_gmem_p.bind(n, wait_read_only=wait_read_only)
 
 
 # WGMMA on an accumulator reference