[Mosaic GPU] Add an optimization barrier

apaszke · Google-ML-Automation · commit 11090be0b37e · 2024-12-04T06:54:48.000-08:00
The barrier is a no-op at runtime, but appears as a side-effecting op to LLVM
which prevents it from moving the (even pure) computations that involve the
supplied arrays past the barrier.

PiperOrigin-RevId: 702709125
diff --git a/jax/_src/pallas/mosaic_gpu/lowering.py b/jax/_src/pallas/mosaic_gpu/lowering.py
@@ -1681,6 +1681,12 @@ def _bitcast_convert_type_lowering_rule(
   )
 
 
+@register_lowering_rule(lax.optimization_barrier_p)
+def _optimization_barrier_lowering(ctx: LoweringRuleContext, *args):
+  """Lowers `lax.optimization_barrier_p` to `mgpu.optimization_barrier`."""
+  # Every operand is passed through `_ensure_fa` together with the dtype of the
+  # matching input aval before it is handed to the Mosaic GPU barrier.
+  args = (_ensure_fa(arg, aval.dtype) for arg, aval in zip(args, ctx.avals_in))
+  # NOTE(review): `mgpu.optimization_barrier` returns a bare array rather than
+  # a list when called with exactly one operand -- confirm that the code
+  # consuming this rule's result accepts that.
+  return mgpu.optimization_barrier(*args)
+
+
 def _bcast(
     x: ir.Value,
     y: ir.Value,
diff --git a/jax/experimental/mosaic/gpu/__init__.py b/jax/experimental/mosaic/gpu/__init__.py
@@ -43,6 +43,7 @@
     WGMMA_ROW_LAYOUT as WGMMA_ROW_LAYOUT,
     WGSplatFragLayout as WGSplatFragLayout,
     WGStridedFragLayout as WGStridedFragLayout,
+    optimization_barrier as optimization_barrier,
 )
 from .utils import (
     BarrierRef as BarrierRef,
diff --git a/jax/experimental/mosaic/gpu/fragmented_array.py b/jax/experimental/mosaic/gpu/fragmented_array.py
@@ -1866,3 +1866,108 @@ def subf(a: ir.Value, b: ir.Value):
 
 def mulf(a: ir.Value, b: ir.Value):
   return arith.mulf(a, b, fastmath=arith.FastMathFlags.contract)
+
+
+def optimization_barrier(*arrays: mgpu.FragmentedArray):
+  """Acts as an optimization barrier for LLVM.
+
+  Passing arrays through this function will make sure that they are computed
+  before any side-effecting operations that follow this barrier.
+
+  Every array is unpacked into a flat list of scalar registers. All of them
+  are routed through a single side-effecting inline-asm op that copies each
+  input register to an output register with `mov.b32`, and the outputs are
+  then repacked into arrays with the original register shapes and layouts.
+
+  Args:
+    *arrays: The arrays to pass through the barrier. Supported are f32 arrays
+      (with scalar or 1D vector registers) and f16/bf16 arrays with 2-element
+      vector registers.
+
+  Returns:
+    A single FragmentedArray if exactly one array was passed in, and a list of
+    FragmentedArrays (in argument order) otherwise.
+
+  Raises:
+    NotImplementedError: If an array has an unsupported dtype or register type.
+  """
+  index = ir.IndexType.get()
+  i32 = ir.IntegerType.get_signless(32)
+
+  regs = []
+  reg_dtypes = []
+  reg_constraints = []  # Bare constraint codes (no "="), one per register.
+  regs_per_array = []
+  repack_fns = []
+  # We unpack each array into a flat list of registers, and prepare the
+  # functions that invert the transform in repack_fns.
+  for array in arrays:
+    reg_ty = array.registers.flat[0].type
+    dtype = array.mlir_dtype
+    if ir.F32Type.isinstance(dtype):
+      if ir.VectorType.isinstance(reg_ty):
+        [vec_len] = ir.VectorType(reg_ty).shape
+        array_regs = [  # pylint: disable=g-complex-comprehension
+            vector.extractelement(reg, position=c(pos, index))
+            for reg in array.registers.flat
+            for pos in range(vec_len)
+        ]
+        # reg_ty is bound as a default argument so that each closure keeps the
+        # type of its own array instead of the last one seen by the loop.
+        def _repack(regs, reg_ty=reg_ty):
+          reg = llvm.mlir_undef(reg_ty)
+          [vec_len] = ir.VectorType(reg_ty).shape
+          for i_elem in range(vec_len):
+            reg = llvm.insertelement(
+                reg, next(regs), arith.constant(i32, i_elem)
+            )
+          return reg
+        repack_fns.append(_repack)
+      else:
+        array_regs = list(array.registers.flat)
+        repack_fns.append(lambda regs: next(regs))
+      reg_constraint = "f"
+    elif ir.BF16Type.isinstance(dtype) or ir.F16Type.isinstance(dtype):
+      if not ir.VectorType.isinstance(reg_ty):
+        raise NotImplementedError(array.mlir_dtype)
+      [vec_len] = ir.VectorType(reg_ty).shape
+      if vec_len != 2:
+        raise NotImplementedError(vec_len)
+      # A pair of 16-bit floats is reinterpreted as a single i32 register.
+      i32_reg_ty = ir.VectorType.get((1,), i32)
+      array_regs = [
+          vector.extractelement(
+              vector.bitcast(i32_reg_ty, reg), position=c(0, index)
+          )
+          for reg in array.registers.flat
+      ]
+      reg_constraint = "r"
+      def _repack(regs, reg_ty=reg_ty, i32_reg_ty=i32_reg_ty):
+        return vector.bitcast(reg_ty, vector.splat(i32_reg_ty, next(regs)))
+      repack_fns.append(_repack)
+    else:
+      raise NotImplementedError(array.mlir_dtype)
+    regs += array_regs
+    reg_dtypes += [array_regs[0].type] * len(array_regs)
+    reg_constraints += [reg_constraint] * len(array_regs)
+    regs_per_array.append(len(array_regs))
+  # LLVM requires all output constraints to precede all input constraints, so
+  # the operands are numbered as [all outputs..., all inputs...]: output i is
+  # $i and the matching input is $(total_regs + i). Interleaving outputs and
+  # inputs per array would produce an invalid constraint string as soon as
+  # more than one array is passed.
+  total_regs = len(regs)
+  ptx_lines = ["// Optimization barrier"]
+  first_reg = 0
+  for num_array_regs in regs_per_array:
+    ptx_lines.append("// Next array")
+    ptx_lines += [
+        f"mov.b32 ${i}, ${total_regs + i}"
+        for i in range(first_reg, first_reg + num_array_regs)
+    ]
+    first_reg += num_array_regs
+  all_reg_constraints = ",".join(
+      [f"={cstr}" for cstr in reg_constraints] + reg_constraints
+  )
+  ptx = ";\n\t".join(ptx_lines) + ";"
+  struct_ty = ir.Type.parse(
+      f"!llvm.struct<({','.join(map(str, reg_dtypes))})>"
+  )
+  # has_side_effects=True is what keeps LLVM from moving or dropping the asm.
+  result_struct = llvm.inline_asm(
+      struct_ty, regs, ptx, all_reg_constraints,
+      asm_dialect=0, has_side_effects=True,
+  )
+  regs = [
+      llvm.extractvalue(dtype, result_struct, [i])
+      for i, dtype in enumerate(reg_dtypes)
+  ]
+  results = []
+  # A single iterator is shared by all repack functions, so every call
+  # consumes exactly the registers that belong to the next array register.
+  regs_it = iter(regs)
+  for array, repack_fn in zip(arrays, repack_fns, strict=True):
+    num_regs = array.registers.size
+    reg_ty = array.registers.flat[0].type
+    if ir.VectorType.isinstance(reg_ty):
+      reg_ty = ir.VectorType(reg_ty)
+    new_registers = np.empty((num_regs,), dtype=object)
+    for i_vreg in range(num_regs):
+      reg = repack_fn(regs_it)
+      assert reg.type == reg_ty, (reg.type, reg_ty)
+      new_registers[i_vreg] = reg
+    results.append(
+        FragmentedArray(
+            _registers=new_registers.reshape(array.registers.shape),
+            _layout=array.layout,
+            _is_signed=array.is_signed,
+        )
+    )
+  return results[0] if len(arrays) == 1 else results
diff --git a/jax/experimental/mosaic/gpu/wgmma.py b/jax/experimental/mosaic/gpu/wgmma.py
@@ -23,6 +23,7 @@
 from jaxlib.mlir.dialects import arith
 from jaxlib.mlir.dialects import llvm
 from jaxlib.mlir.dialects import vector
+from jaxlib.mlir.dialects import nvvm
 import numpy as np
 
 import jax.experimental.mosaic.gpu as mgpu
@@ -445,58 +446,13 @@ def wgmma(
 def wgmma_fence(array: mgpu.FragmentedArray):
   """Fences the array construction from WGMMA instructions.
 
-  This is a little workaround to force LLVM to initialize the PTX registers
-  before the wgmma.fence.sync.aligned instruction. Otherwise, LLVM treats
-  in-register computation as pure and can move it after the fence, which is
-  explicitly disallowed by the PTX programming model.
+  LLVM treats in-register computation as pure and can move it after the fence,
+  which is explicitly disallowed by the PTX programming model. For that reason,
+  we insert an LLVM optimization barrier before the fence.
+
+  Args:
+    array: The array that has to be fully computed before the fence.
+
+  Returns:
+    The result of passing `array` through `mgpu.optimization_barrier`. Callers
+    should use it in place of `array` after the fence.
   """
-  i32 = ir.IntegerType.get_signless(32)
-  index = ir.IndexType.get()
-  dtype = array.mlir_dtype
-  src_vec_ty = ir.VectorType(array.registers.flat[0].type)
-  assert src_vec_ty.shape == [2]
-
-  if dtype == ir.F32Type.get():
-    regs = [  # pylint: disable=g-complex-comprehension
-        vector.extractelement(reg, position=c(pos, index))
-        for reg in array.registers.flat
-        for pos in range(2)
-    ]
-    reg_dtype = dtype
-    reg_constraints_list = ["=f"] * len(regs) + ["f"] * len(regs)
-    ptx_lines = [f"mov.f32 ${i}, ${len(regs)+i}" for i in range(len(regs))]
-  elif dtype == ir.F16Type.get() or dtype == ir.BF16Type.get():
-    regs = [_as_i32_reg(reg) for reg in array.registers.flat]
-    reg_dtype = i32
-    reg_constraints_list = ["=r"] * len(regs) + ["r"] * len(regs)
-    ptx_lines = [f"mov.b32 ${i}, ${len(regs)+i}" for i in range(len(regs))]
-  else:
-    raise NotImplementedError(dtype)
-  reg_constraints = ",".join(reg_constraints_list)
-  # Copy over the registers. ptxas should be able to remove the moves.
-  ptx_lines.append("wgmma.fence.sync.aligned")
-  ptx = ";\n".join(ptx_lines) + ";\n"
-  dtype_str = str(reg_dtype)
-  struct_ty = ir.Type.parse(
-      f"!llvm.struct<({','.join(dtype_str for _ in regs)})>"
-  )
-  acc_struct = llvm.inline_asm(
-      struct_ty, regs, ptx, reg_constraints,
-      asm_dialect=0, has_side_effects=True,
-  )
-  regs = [
-      llvm.extractvalue(reg_dtype, acc_struct, [i]) for i in range(len(regs))
-  ]
-  if dtype == ir.F32Type.get():
-    registers = _as_fragmented_reg_ndarray(
-          regs, array.mlir_dtype, array.registers.shape
-    )
-  elif dtype == ir.F16Type.get() or dtype == ir.BF16Type.get():
-    regs = [_unpack_i32(src_vec_ty, r) for r in regs]
-    registers = np.asarray(regs, dtype=object).reshape(array.registers.shape)
-  else:
-    raise NotImplementedError(dtype)
-  return mgpu.FragmentedArray(_registers=registers, _layout=array.layout, _is_signed=array.is_signed)
+  # The barrier is emitted first and the fence second: the barrier pins the
+  # register computation, and the fence is now a separate NVVM op instead of
+  # being part of the inline assembly.
+  array = mgpu.optimization_barrier(array)
+  nvvm.wgmma_fence_aligned()
+  return array
 
 
 def _as_fragmented_reg_ndarray(flat_regs, dtype: ir.Type, shape: tuple[int, ...]):
diff --git a/jax/experimental/pallas/mosaic_gpu.py b/jax/experimental/pallas/mosaic_gpu.py
@@ -31,12 +31,12 @@
 from jax._src.pallas.mosaic_gpu.pipeline import emit_pipeline as emit_pipeline
 from jax._src.pallas.mosaic_gpu.primitives import barrier_arrive as barrier_arrive
 from jax._src.pallas.mosaic_gpu.primitives import barrier_wait as barrier_wait
+from jax._src.pallas.mosaic_gpu.primitives import broadcasted_iota as broadcasted_iota
 from jax._src.pallas.mosaic_gpu.primitives import commit_smem as commit_smem
 from jax._src.pallas.mosaic_gpu.primitives import copy_gmem_to_smem as copy_gmem_to_smem
 from jax._src.pallas.mosaic_gpu.primitives import copy_smem_to_gmem as copy_smem_to_gmem
 from jax._src.pallas.mosaic_gpu.primitives import Layout as Layout
 from jax._src.pallas.mosaic_gpu.primitives import layout_cast as layout_cast
-from jax._src.pallas.mosaic_gpu.primitives import broadcasted_iota as broadcasted_iota
 from jax._src.pallas.mosaic_gpu.primitives import set_max_registers as set_max_registers
 from jax._src.pallas.mosaic_gpu.primitives import wait_smem_to_gmem as wait_smem_to_gmem
 from jax._src.pallas.mosaic_gpu.primitives import wgmma as wgmma
diff --git a/jax/experimental/pallas/ops/gpu/attention_mgpu.py b/jax/experimental/pallas/ops/gpu/attention_mgpu.py
@@ -129,10 +129,19 @@ def compute_qk(acc_ref):
         l_i *= alpha
         p16 = p.astype(dtype)
 
-        plgpu.barrier_wait(v_barriers.at[slot])
-        perform_schedule_barrier()
-
-        l_i += p.sum(axis=1)
+        def end_softmax_barriers():
+          plgpu.barrier_arrive(schedule_barrier)  # Done with softmax!
+          plgpu.barrier_wait(v_barriers.at[slot])
+          plgpu.barrier_wait(schedule_barrier)  # Wait until TensorCore is free.
+        # Can't fully explain why, but empirically the ordering here influences
+        # the performance of the final kernel quite significantly.
+        if head_dim <= 128:
+          l_i += p.sum(axis=1)
+          acc, l_i, m_i, p16 = lax.optimization_barrier((acc, l_i, m_i, p16))
+          end_softmax_barriers()
+        else:
+          end_softmax_barriers()
+          l_i += p.sum(axis=1)
 
         # PV
         def compute_pv(acc_ref):
diff --git a/tests/mosaic/gpu_test.py b/tests/mosaic/gpu_test.py
@@ -1674,6 +1674,20 @@ def kernel(ctx, inp, out, smem):
     )(x)
     np.testing.assert_array_equal(result, reference)
 
+  @parameterized.parameters(jnp.float32, jnp.float16, jnp.bfloat16)
+  def test_optimization_barrier(self, dtype):
+    """Checks that arrays come out of `optimization_barrier` unchanged."""
+    def kernel(ctx, inp, out, smem):
+      del ctx, smem
+      arr = mgpu.FragmentedArray.load_strided(inp)
+      arr2 = arr * 2
+      # Two arrays are passed at once to exercise the multi-array code path.
+      arr, arr2 = mgpu.optimization_barrier(arr, arr2)
+      (arr + arr2).store_untiled(out)
+
+    x = jnp.arange(256, dtype=dtype)
+
+    f = mgpu.as_gpu_kernel(kernel, (1, 1, 1), (128, 1, 1), x, x, None)
+    # arr + arr2 is x + 2 * x, so the result only matches if the barrier
+    # returned both operands unmodified and in the same order.
+    np.testing.assert_array_equal(f(x), x * 3)
+
 
 class ProfilerTest(TestCase):
 

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@`
`43`	`43`	`WGMMA_ROW_LAYOUT as WGMMA_ROW_LAYOUT,`
`44`	`44`	`WGSplatFragLayout as WGSplatFragLayout,`
`45`	`45`	`WGStridedFragLayout as WGStridedFragLayout,`
	`46`	`+ optimization_barrier as optimization_barrier,`
`46`	`47`	`)`
`47`	`48`	`from .utils import (`
`48`	`49`	`BarrierRef as BarrierRef,`