
Commit 39fb2a0

apaszke authored and Google-ML-Automation committed
[Mosaic GPU] Add support for allocation and lowering of scratch semaphores
The semaphore arrays are allocated in GMEM and zeroed by XLA before the kernel begins.

PiperOrigin-RevId: 741494241
1 parent 1c1e2e6 commit 39fb2a0
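
For context, here is a minimal usage sketch adapted from the test added in tests/pallas/mosaic_gpu_test.py below; it is not part of the commit message. A kernel requests a scratch semaphore array through plgpu.SemaphoreType, and the array arrives in GMEM already zeroed. Per the test's own comment, lowering of semaphore operations themselves is not supported yet, so this only exercises allocation and lowering of the scratch buffer.

import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

def body(i_ref, o_ref, sem_ref):
  # sem_ref is a (4,)-element semaphore array, allocated in GMEM and
  # zero-initialized by XLA before the kernel starts.
  o_ref[...] = i_ref[...]

x = jnp.arange(128, dtype=jnp.float32)
kernel = pl.pallas_call(
    body, out_shape=x, scratch_shapes=[plgpu.SemaphoreType.REGULAR((4,))],
)
# Mirroring the test, we only lower the kernel here (a smoke test for
# allocation); semaphore operations themselves are not lowered yet.
print(jax.jit(kernel).lower(x).as_text())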

7 files changed, +117 -22 lines changed

jax/_src/pallas/mosaic_gpu/BUILD

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ pytype_strict_library(
         "//jax:mlir",
         "//jax:mosaic_gpu",
         "//jax/_src/pallas",
-    ],
+    ] + py_deps("numpy"),
 )
 
 pytype_strict_library(

jax/_src/pallas/mosaic_gpu/core.py

Lines changed: 19 additions & 0 deletions
@@ -120,6 +120,25 @@ def __call__(
     return GPUMemoryRef(shape, dtype, memory_space=self, transforms=transforms)
 
 
+class SemaphoreType(enum.Enum):
+  REGULAR = "regular"
+  BARRIER = "barrier"
+
+  def __call__(self, shape: tuple[int, ...]):
+    dtype: Any
+    if self == SemaphoreType.BARRIER:
+      dtype = pallas_core.BarrierSemaphore()
+    else:
+      dtype = pallas_core.Semaphore()
+    return pallas_core.MemoryRef(shape, dtype, GPUMemorySpace.GMEM)
+
+  def get_array_aval(self) -> jax_core.ShapedArray:
+    return self(()).get_array_aval()
+
+  def get_ref_aval(self) -> pallas_core.TransformedRef | AbstractMemoryRef:
+    return self(()).get_ref_aval()
+
+
 def kernel(
     body: Callable[..., None],
     out_shape: object,

jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 45 additions & 15 deletions
@@ -418,8 +418,9 @@ class LoweringResult:
   module: ir.Module
   grid: tuple[int, ...]
   block: tuple[int, ...]
-  out_structs: tuple[jax.ShapeDtypeStruct, ...]
+  new_out_shapes: tuple[jax.ShapeDtypeStruct, ...]  # Does not include gmem scratch!
   profiler_context: ProfilerContext | None
+  gmem_scratch_shapes: tuple[jax.ShapeDtypeStruct, ...]
 
 
 @dataclasses.dataclass(frozen=True)
@@ -588,16 +589,41 @@ def ref_for_aval(aval: jax_core.AbstractValue):
     else:
       return gpu_core.SMEM(aval.shape, aval.dtype)
 
+  sem_placeholder = None
+  semaphore_ref_avals = []
+  scratch_avals = []
+  # Need to unzip semaphores
+  for v in jaxpr.invars[grid_mapping.slice_scratch_ops]:
+    aval = v.aval
+    if (isinstance(aval, pallas_core.AbstractMemoryRef) and
+        jnp.issubdtype(aval.dtype, pallas_core.semaphore_dtype)):
+      if aval.memory_space != gpu_core.GPUMemorySpace.GMEM:
+        raise ValueError(
+            "Only GMEM memory space is supported for semaphores in Mosaic GPU."
+        )
+      semaphore_ref_avals.append(aval)
+      scratch_avals.append(sem_placeholder)
+    else:
+      scratch_avals.append(aval)
+
   def pipeline_fn(*refs):
-    return primitives.run_scoped(
-        functools.partial(scoped_pipeline_fn, *refs),
+    sem_refs = []
+    if semaphore_ref_avals:
+      refs, sem_refs = util.split_list(refs, [-len(semaphore_ref_avals)])
+    primitives.run_scoped(
+        functools.partial(scoped_pipeline_fn, *refs, sem_refs=sem_refs),
         scratch_refs=[
-            ref_for_aval(v.aval)
-            for v in jaxpr.invars[grid_mapping.slice_scratch_ops]
+            ref_for_aval(aval) if aval is not sem_placeholder else aval
+            for aval in scratch_avals
         ],
     )
+    return ()  # ``wrap_init`` does not support functions returning None.
 
-  def scoped_pipeline_fn(*refs, scratch_refs):
+  def scoped_pipeline_fn(*refs, sem_refs, scratch_refs):
+    sem_refs_it = iter(sem_refs)
+    scratch_refs = [
+        next(sem_refs_it) if r is sem_placeholder else r for r in scratch_refs
+    ]
     def body_fn(*refs):
       grid_env = pallas_core.current_grid_env()
       assert grid_env is not None  # Set by ``emit_pipeline``.
@@ -628,17 +654,13 @@ def body_fn(*refs):
 
   with grid_mapping.trace_env():
     new_jaxpr, _, new_consts, () = pe.trace_to_jaxpr_dynamic(
-        lu.wrap_init(
-            # ``wrap_init`` does not support functions returning None.
-            lambda *args: pipeline_fn(*args) or (),
-            debug_info=jaxpr.debug_info,
-        ),
+        lu.wrap_init(pipeline_fn, debug_info=jaxpr.debug_info),
         [
             gpu_core.GMEM(
                 bm.array_shape_dtype.shape, bm.array_shape_dtype.dtype
             ).get_ref_aval()
             for bm in block_mappings
-        ],
+        ] + semaphore_ref_avals,
     )
   assert not new_consts
 
@@ -655,6 +677,10 @@ def body_fn(*refs):
       mesh.cluster if mesh is not None else (),
       [bm.array_shape_dtype for bm in in_block_mappings],
      [bm.array_shape_dtype for bm in out_block_mappings],
+      [
+          jax.ShapeDtypeStruct(r.shape, np.dtype(np.int32))
+          for r in semaphore_ref_avals
+      ],
       new_jaxpr,
       compiler_params,
       new_consts,
@@ -668,6 +694,7 @@ def lower_jaxpr_to_module(
     cluster: Sequence[int],
     in_shapes: Sequence[jax.ShapeDtypeStruct],
     out_shapes: Sequence[jax.ShapeDtypeStruct],
+    gmem_scratch_shapes: Sequence[jax.ShapeDtypeStruct],
     jaxpr: jax_core.Jaxpr,
     compiler_params: dict[str, Any],
     consts=(),
@@ -754,14 +781,14 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
     # Each range is 2 events, each event is 4 bytes.
     prof_spec = mgpu_profiler.ProfilerSpec(prof_space * 2 * 4)
     prof_ctx = ProfilerContext(params["profile_dir"], prof_spec)
-  module, out_structs_gmem, _, launch_ctx, scratch_arr = (
+  module, new_out_shapes, _, launch_ctx, scratch_arr = (
       mgpu_core._lower_as_gpu_kernel(
           body,
          grid=tuple(map(operator.mul, parallel_grid, cluster)),
          cluster=cluster,
          block=block,
          in_shapes=in_shapes,
-         out_shape=out_shapes,
+         out_shape=(*out_shapes, *gmem_scratch_shapes),
          smem_scratch_shape=scratch_buffers,
          module_name=mlir.sanitize_name(debug_info.func_name),
          prof_spec=prof_spec,
@@ -777,8 +804,11 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
 
   mgpu_core._initialize_scratch(launch_ctx, scratch_arr)
 
+  if gmem_scratch_shapes:
+    new_out_shapes = new_out_shapes[:-len(gmem_scratch_shapes)]
+
   return LoweringResult(
-      module, parallel_grid, block, out_structs_gmem, prof_ctx
+      module, parallel_grid, block, new_out_shapes, prof_ctx, tuple(gmem_scratch_shapes)
   )
 
 
jax/_src/pallas/mosaic_gpu/pallas_call_registration.py

Lines changed: 26 additions & 6 deletions
@@ -23,11 +23,13 @@
 import warnings
 
 import jax
+from jax import lax
 from jax._src import core as jax_core
 from jax._src.interpreters import mlir
 from jax._src.pallas import core as pallas_core
 from jax._src.pallas.mosaic_gpu import lowering
 from jax.experimental.mosaic import gpu as mgpu
+import numpy as np
 
 
 def pallas_call_lowering(
@@ -74,16 +76,30 @@ def pallas_call_lowering(
     print(lowering_result.module.operation)
 
   module = lowering_result.module
-  new_avals_out = [
-      jax_core.ShapedArray(t.shape, t.dtype) for t in lowering_result.out_structs
-  ]
+  new_avals_in = list(ctx.avals_in)
+  new_avals_out = list(map(_as_shaped_array, lowering_result.new_out_shapes))
+  scratch_args = ()
+  if lowering_result.gmem_scratch_shapes:
+    input_output_aliases += tuple(
+        (len(new_avals_in) + i, len(new_avals_out) + i)
+        for i in range(len(lowering_result.gmem_scratch_shapes))
+    )
+    new_avals_in.extend(map(_as_shaped_array, lowering_result.gmem_scratch_shapes))
+    new_avals_out.extend(map(_as_shaped_array, lowering_result.gmem_scratch_shapes))
+    def zero_init_gmem_scratch():
+      return [lax.zeros_like_array(s) for s in lowering_result.gmem_scratch_shapes]
+    scratch_args = mlir.lower_fun(
+        zero_init_gmem_scratch, multiple_results=True
+    )(ctx.replace(avals_in=()))
   outs = mgpu.core._mosaic_gpu_lowering_rule(
-      ctx.replace(avals_out=new_avals_out),
-      *args,
+      ctx.replace(avals_in=new_avals_in, avals_out=new_avals_out),
+      *args, *scratch_args,
       module=module,
-      out_types=lowering_result.out_structs,
+      out_types=(*lowering_result.new_out_shapes, *lowering_result.gmem_scratch_shapes),
      input_output_aliases=input_output_aliases,
   )
+  if lowering_result.gmem_scratch_shapes:  # Drop the GMEM scratch.
+    outs = outs[:-len(lowering_result.gmem_scratch_shapes)]
   if (prof_ctx := lowering_result.profiler_context) is not None:
     *outs, prof_buffer = outs
     if (dump_path := prof_ctx.dump_path) == "sponge":
@@ -112,3 +128,7 @@ def do_callback(prof_buffer):
         ctx.replace(avals_in=(new_avals_out[-1],)), prof_buffer
     )
   return outs
+
+
+def _as_shaped_array(t: jax.ShapeDtypeStruct) -> jax_core.ShapedArray:
+  return jax_core.ShapedArray(t.shape, np.dtype(t.dtype))
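
A small illustration of the aliasing bookkeeping introduced above (the variable names below are mine, not from the diff): each GMEM scratch buffer is appended as an extra zero-filled operand and as an extra output, and the (operand, output) index pair is recorded in input_output_aliases so XLA hands the zeroed buffer to the kernel in place; the extra outputs are then dropped. With the shapes used by the new test (two inputs, one output, one semaphore array):

# Hypothetical counts matching the new test: 2 kernel inputs, 1 output,
# 1 GMEM scratch (semaphore) buffer.
num_inputs, num_outputs, num_scratch = 2, 1, 1
input_output_aliases = tuple(
    (num_inputs + i, num_outputs + i) for i in range(num_scratch)
)
# Operand 2 (the zero-filled scratch) aliases output 1, which is what the
# test's expected "operand_index = 2 ... output_tuple_indices = [1]" checks.
assert input_output_aliases == ((2, 1),)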

jax/experimental/mosaic/gpu/core.py

Lines changed: 1 addition & 0 deletions
@@ -104,6 +104,7 @@ def _mosaic_gpu_lowering_rule(
     out_types,
     input_output_aliases: tuple[tuple[int, int], ...] = (),
 ):
+  assert len(args) == len(ctx.avals_in)
   assert len(out_types) == len(ctx.avals_out)
   module = _run_serde_pass(
       module,

jax/experimental/pallas/mosaic_gpu.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@
 from jax._src.pallas.mosaic_gpu.core import GPUMemorySpace as GPUMemorySpace
 from jax._src.pallas.mosaic_gpu.core import GPUMesh as GPUMesh
 from jax._src.pallas.mosaic_gpu.core import kernel as kernel
+from jax._src.pallas.mosaic_gpu.core import SemaphoreType as SemaphoreType
 from jax._src.pallas.mosaic_gpu.core import SwizzleTransform as SwizzleTransform
 from jax._src.pallas.mosaic_gpu.core import TilingTransform as TilingTransform
 from jax._src.pallas.mosaic_gpu.core import transpose_ref as transpose_ref

tests/pallas/mosaic_gpu_test.py

Lines changed: 24 additions & 0 deletions
@@ -2408,6 +2408,30 @@ def compute(l_smem, r_smem, o_smem):
     out = plgpu.kernel(body, out_shape=x, grid=(2,), axis_names=("rows",))(x, x)
     np.testing.assert_allclose(out, x + x)
 
+  def test_semaphore_lowering(self):
+    # This is a smoke test until we add support for lowering of semaphore ops.
+    def body(i_ref1, i_ref2, o_ref, sem_ref):
+      del i_ref2  # Only here to have a different number of inputs and outputs.
+      assert sem_ref.shape == (4,)
+      assert jnp.issubdtype(sem_ref.dtype, pl.semaphore)
+      o_ref[...] = i_ref1[...]
+    x = jnp.arange(128, dtype=jnp.float32).reshape((128,))
+    kernel = pl.pallas_call(
+        body, out_shape=x, scratch_shapes=[plgpu.SemaphoreType.REGULAR((4,))],
+    )
+    text = jax.jit(kernel).lower(x, x).as_text()
+    self.assertIn(
+        r"output_operand_aliases ="
+        r" [#stablehlo.output_operand_alias<output_tuple_indices = [1],"
+        r" operand_index = 2, operand_tuple_indices = []>]",
+        text,
+    )
+    self.assertIn(
+        r"(tensor<128xf32>, tensor<128xf32>, tensor<4xi32>) ->"
+        r" (tensor<128xf32>, tensor<4xi32>)",
+        text,
+    )
+
 
 class ExamplesSm90ATest(PallasSm90ATest):
 