
Commit 8b65620

apaszke authored and Google-ML-Automation committed
[Pallas MGPU] Use multiple k/v_consumed_barriers in the attention kernel
There's nothing technically preventing the compute threads from running ahead and signalling the consumption of k/v twice in case the memory thread ends up being temporarily starved. I don't think this was ever a problem in practice, since the GPU hardware scheduler is surprisingly fair, but it's good not to have races :)

PiperOrigin-RevId: 703520322
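For intuition, the synchronization pattern can be sketched with plain Python threading primitives rather than the Pallas/Mosaic GPU API. This is an illustrative analogy only: NUM_SLOTS, producer_step, and consumer_step are made-up names standing in for max_concurrent_steps, the memory warpgroup, and the compute warpgroups, and the data-ready barriers (k_barriers/v_barriers) are omitted.

import threading

NUM_SLOTS = 2  # stands in for max_concurrent_steps

# Racy variant: a single binary "consumed" flag shared by every buffer slot.
# If the consumer runs ahead and sets it for two different kv steps before
# the producer ever waits, the two signals collapse into one -- the analogue
# of the compute threads arriving twice on one barrier while the memory
# thread is starved.
shared_consumed = threading.Event()

# Per-slot variant, mirroring k/v_consumed_barriers.at[slot] after this
# change: the producer waits on and resets exactly the slot it is about to
# refill, so a consumption signal for one slot cannot be mistaken for the
# consumption of another.
per_slot_consumed = [threading.Event() for _ in range(NUM_SLOTS)]

def consumer_step(step: int) -> None:
    slot = step % NUM_SLOTS
    # ... read the k/v tile held in `slot` ...
    per_slot_consumed[slot].set()   # "arrive" on that slot's consumed barrier

def producer_step(step: int) -> None:
    slot = step % NUM_SLOTS
    per_slot_consumed[slot].wait()  # wait for that slot, and only that slot
    per_slot_consumed[slot].clear()
    # ... refill `slot` with the next k/v tile ...

In the kernel itself the same effect comes from allocating the consumed barriers with num_barriers=max_concurrent_steps and arriving on / waiting for the barrier at the buffer's slot index, as the diff below does.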
1 parent 08d31d0 commit 8b65620

File tree (1 file changed: +7, -13 lines)

jax/experimental/pallas/ops/gpu/attention_mgpu.py

Lines changed: 7 additions & 13 deletions
@@ -74,7 +74,7 @@ def kernel(q_ref, k_ref, v_ref, out_ref, scoped):
     wg_idx = lax.axis_index("wg")
     qo_smem2, k_smem, v_smem = smem_buffers
     k_barriers, v_barriers, q_barriers = buffer_barriers
-    k_consumed_barrier, v_consumed_barrier = consumed_barriers
+    k_consumed_barriers, v_consumed_barriers = consumed_barriers
     def perform_schedule_barrier():
       plgpu.barrier_arrive(schedule_barrier)
       plgpu.barrier_wait(schedule_barrier)
@@ -116,7 +116,7 @@ def compute_qk(acc_ref):
           perform_schedule_barrier()
           return acc_ref[...]
         qk = pl.run_scoped(compute_qk, plgpu.ACC((block_q, block_kv), jnp.float32))
-        plgpu.barrier_arrive(k_consumed_barrier)
+        plgpu.barrier_arrive(k_consumed_barriers.at[slot])

         # Softmax
         # We keep m scaled by log2e to use FMA instructions when computing p.
@@ -153,7 +153,7 @@ def compute_pv(acc_ref):
           def _wait():
             plgpu.barrier_wait(k_barriers.at[wait_slot])
         acc = pl.run_state(compute_pv)(plgpu.ACC.init(acc))
-        plgpu.barrier_arrive(v_consumed_barrier)
+        plgpu.barrier_arrive(v_consumed_barriers.at[slot])
         return acc, m_i, l_i
       if kv_seq_len % block_kv:
         raise ValueError(f"{kv_seq_len=} must be a multiple of {block_kv=}")
@@ -184,17 +184,12 @@ def kv_loop(kv_step, _):
         tma_step = kv_step + max_concurrent_steps
         tma_slot = lax.rem(kv_step, max_concurrent_steps)
         s = (batch, pl.ds(tma_step * block_kv, block_kv), kv_head)
-        plgpu.barrier_wait(k_consumed_barrier)
+        plgpu.barrier_wait(k_consumed_barriers.at[tma_slot])
         plgpu.copy_gmem_to_smem(k_ref.at[s], k_smem.at[tma_slot], k_barriers.at[tma_slot])
-        plgpu.barrier_wait(v_consumed_barrier)
+        plgpu.barrier_wait(v_consumed_barriers.at[tma_slot])
         plgpu.copy_gmem_to_smem(v_ref.at[s], v_smem.at[tma_slot], v_barriers.at[tma_slot])
       lax.fori_loop(0, kv_seq_len // block_kv - max_concurrent_steps, kv_loop, None)

-      def kv_epilogue(i, _):
-        plgpu.barrier_wait(k_consumed_barrier)
-        plgpu.barrier_wait(v_consumed_barrier)
-      lax.fori_loop(0, max_concurrent_steps, kv_epilogue, None)
-
   def run(refs):
     q_ref, k_ref, v_ref, out_ref = refs

@@ -210,7 +205,6 @@ def run(refs):
     @pl.core_map(mesh)
     def _kernel_entry():
       compute_wgs = 2
-      barrier_2wg = plgpu.Barrier(num_arrivals=compute_wgs)
       tiling = plgpu.TilingTransform((64, 64))
       swizzle = plgpu.SwizzleTransform(128)
       qo_scratch = plgpu.SMEM(
@@ -233,8 +227,8 @@ def _kernel_entry():
               plgpu.Barrier(1, num_barriers=max_concurrent_steps),
               plgpu.Barrier(1, num_barriers=compute_wgs),
           ),
-          (barrier_2wg, barrier_2wg),
-          barrier_2wg,
+          (plgpu.Barrier(num_arrivals=compute_wgs, num_barriers=max_concurrent_steps),) * 2,
+          plgpu.Barrier(num_arrivals=compute_wgs),
       )

   _, _, _, out = pl.run_state(run)((q, k, v, jnp.full_like(q, jnp.inf)))
