Commit d0b71fa

justinjfu authored and Google-ML-Automation committed
[Mosaic GPU] Add preliminary TMEM allocation support for Pallas/Mosaic GPU.
PiperOrigin-RevId: 738932990
1 parent 80784a5 commit d0b71fa

File tree

5 files changed: +134 −10 lines

5 files changed

+134
-10
lines changed
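Before the per-file diffs, here is a minimal sketch of the user-facing pattern this commit enables, adapted from the new test_tmem_alloc test at the bottom of the diff. It assumes an sm_100a (Blackwell) GPU; the import aliases are the usual Pallas ones.

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

mesh = plgpu.GPUMesh(num_threads=1, axis_names=("x"))

@pl.run_state
def kernel(y_ref):
  @pl.core_map(mesh)
  def _():
    def scope(tmem_ref, smem_ref):
      # Read the scoped TMEM allocation and copy it out via SMEM.
      smem_ref[...] = tmem_ref[...]
      plgpu.commit_smem()
      plgpu.copy_smem_to_gmem(smem_ref, y_ref)
      plgpu.wait_smem_to_gmem(0)
    # TMEM requests must be 2D and 32-bit, with shape[0] % 128 == 0 and
    # shape[1] % 8 == 0 (enforced by the resource estimator below).
    pl.run_scoped(scope,
                  plgpu.TMEM((128, 128), jnp.float32),
                  plgpu.SMEM((128, 128), jnp.float32))

out = jax.block_until_ready(kernel(jnp.zeros((128, 128), np.float32)))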

jax/_src/pallas/mosaic_gpu/core.py

Lines changed: 3 additions & 0 deletions
@@ -101,6 +101,8 @@ class GPUMemorySpace(enum.Enum):
   GMEM = "gmem"
   #: Shared memory.
   SMEM = "smem"
+  #: Tensor memory.
+  TMEM = "tmem"
   #: Registers.
   REGS = "regs"
 
@@ -452,6 +454,7 @@ def to_block_mapping(
 
 GMEM = GPUMemorySpace.GMEM
 SMEM = GPUMemorySpace.SMEM
+TMEM = GPUMemorySpace.TMEM
 REGS = GPUMemorySpace.REGS
 
jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 98 additions & 10 deletions
@@ -59,6 +59,7 @@
 from jax.experimental.mosaic.gpu import core as mgpu_core
 from jax.experimental.mosaic.gpu import profiler as mgpu_profiler
 from jax.experimental.mosaic.gpu import utils as mgpu_utils
+from jax.experimental.mosaic.gpu import tcgen05
 import jax.numpy as jnp
 import numpy as np
 
@@ -100,6 +101,7 @@ def arrival_multiplier(self) -> int:
 @dataclasses.dataclass(kw_only=True, frozen=True)
 class Resources:
   smem_scratch_bytes: int = 0
+  tmem_scratch_cols: int = 0
   barrier_counts: collections.Counter[mgpu.Barrier] = dataclasses.field(
       default_factory=collections.Counter
   )
@@ -110,6 +112,12 @@ def __post_init__(self):
         "smem_scratch_bytes",
         _align_to(self.smem_scratch_bytes, _SMEM_ALIGNMENT),
     )
+    object.__setattr__(
+        self,
+        "tmem_scratch_cols",
+        # TMEM must be allocated in 128x8 chunks.
+        _align_to(self.tmem_scratch_cols, 8),
+    )
 
   @property
   def barriers(self) -> Sequence[mgpu.Barrier]:
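To make the chunking concrete: the hunk above rounds the requested column count up to a multiple of 8. `_align_to` itself is not shown in this diff; the sketch below is an assumed implementation that matches how it is used here.

def _align_to(x: int, alignment: int) -> int:
  # Round x up to the nearest multiple of `alignment`.
  return ((x + alignment - 1) // alignment) * alignment

assert _align_to(100, 8) == 104  # 100 requested TMEM columns become 104
assert _align_to(128, 8) == 128  # already a multiple of 8; unchanged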
@@ -122,6 +130,7 @@ def __add__(self, other: Resources) -> Resources:
     # we will allocate two barriers, even though one would be enough.
     return Resources(
         smem_scratch_bytes=self.smem_scratch_bytes + other.smem_scratch_bytes,
+        tmem_scratch_cols=self.tmem_scratch_cols + other.tmem_scratch_cols,
         barrier_counts=self.barrier_counts + other.barrier_counts,
     )
 
@@ -130,6 +139,9 @@ def __or__(self, other: Resources) -> Resources:
         smem_scratch_bytes=max(
             self.smem_scratch_bytes, other.smem_scratch_bytes
         ),
+        tmem_scratch_cols=max(
+            self.tmem_scratch_cols, other.tmem_scratch_cols
+        ),
         barrier_counts=self.barrier_counts | other.barrier_counts,
     )
 
@@ -218,10 +230,26 @@ def _run_scoped_resource_estimator(
               )
           ])
       )
-    else:
+    elif aval.memory_space == gpu_core.TMEM:
+      if aval.dtype.itemsize != 4:
+        raise ValueError("TMEM only supports 32-bit types.")
+      if len(aval.shape) != 2:
+        raise ValueError("TMEM allocations must be 2D.")
+      if aval.shape[0] % tcgen05.TMEM_ROWS != 0:
+        raise ValueError("TMEM shape[0] must be a multiple of 128.")
+      if aval.shape[1] % 8 != 0:
+        raise ValueError("TMEM shape[1] must be a multiple of 8.")
+      rs += Resources(tmem_scratch_cols=aval.shape[1])
+    elif aval.memory_space == gpu_core.SMEM:
       rs += Resources(
           smem_scratch_bytes=math.prod(aval.shape) * aval.dtype.itemsize
       )
+    elif aval.memory_space == gpu_core.REGS:
+      # Don't need to allocate anything.
+      pass
+    else:
+      raise NotImplementedError(
+          f"Unsupported memory space: {aval.memory_space}")
   return rs + _estimate_resources(ctx, jaxpr)
 
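As a quick reference, these are allocations the new estimator branch accepts or rejects; a sketch based on the checks above, using the plgpu.TMEM alias that this commit adds in jax/experimental/pallas/mosaic_gpu.py.

# Accepted (2D, 32-bit, rows % 128 == 0, cols % 8 == 0):
plgpu.TMEM((128, 8), jnp.float32)
plgpu.TMEM((256, 64), jnp.int32)
# Rejected with a ValueError when the allocation is lowered:
plgpu.TMEM((64, 8), jnp.float32)     # shape[0] not a multiple of 128
plgpu.TMEM((128, 12), jnp.float32)   # shape[1] not a multiple of 8
plgpu.TMEM((128, 8), jnp.float16)    # not a 32-bit type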
@@ -267,6 +295,9 @@ class ModuleContext:
   single_wg_lane_predicate: ir.Value
   smem_requested_bytes: int
   smem_used_bytes: int
+  tmem_requested_cols: int
+  tmem_used_cols: int
+  tmem_base_ptr: ir.Value
   runtime_barriers: MutableMapping[
       mgpu.Barrier, MutableSequence[mgpu.BarrierRef]
   ]
@@ -286,6 +317,27 @@ def reserve_barrier(self, barrier: mgpu.Barrier) -> mgpu.BarrierRef:
       raise RuntimeError(f"Barrier {barrier} is already reserved")
     return available.pop()
 
+  @contextlib.contextmanager
+  def alloc_tmem(
+      self,
+      struct: jax.ShapeDtypeStruct,
+      layout: tcgen05.TMEMLayout | None = None
+  ) -> ir.Value:
+    if self.tmem_used_cols > 0:
+      raise NotImplementedError(
+          "Multiple TMEM allocations are not implemented.")
+    if layout is None:
+      layout = tcgen05._infer_tmem_layout(struct.shape, collective=False)
+    cols_used = np.prod(struct.shape) // tcgen05.TMEM_ROWS
+    self.tmem_used_cols += cols_used
+    off = self.tmem_base_ptr
+    tmem_ref = tcgen05.TMEMRef(address=off,
+                               shape=struct.shape,
+                               dtype=mgpu_utils.dtype_to_ir_type(struct.dtype),
+                               layout=layout)
+    yield tmem_ref
+    self.tmem_used_cols -= cols_used
+
   # TODO(cperivol): Only return the shapes and figure out the sizes when freeing.
   @contextlib.contextmanager
   def scratch_view(
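The column accounting in alloc_tmem treats tensor memory as a fixed 128-row array, so a buffer's column footprint is its element count divided by tcgen05.TMEM_ROWS. Worked for the shape used in the new test:

TMEM_ROWS = 128                   # tcgen05.TMEM_ROWS: fixed row count of TMEM
shape = (128, 128)                # the (rows, cols) buffer from the test below
cols_used = (shape[0] * shape[1]) // TMEM_ROWS
assert cols_used == 128           # a 128x128 f32 buffer occupies 128 columns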
@@ -642,11 +694,15 @@ def lower_jaxpr_to_module(
   parallel_grid = (math.prod(grid[:-2]), *grid[-2:])
 
   def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
-    *buffers_gmem, (runtime_smem, runtime_barriers) = buffers
+    *buffers_gmem, (runtime_smem, runtime_barriers, runtime_tmem) = buffers
 
     grouped_barriers = collections.defaultdict(list)
     for barrier, barrier_ref in zip(rs.barriers, runtime_barriers):
       grouped_barriers[barrier].append(barrier_ref)
+    if runtime_tmem is not None:
+      tmem_cols = math.prod(runtime_tmem.shape) // tcgen05.TMEM_ROWS
+    else:
+      tmem_cols = 0
     module_ctx = ModuleContext(
         mlir.sanitize_name(debug_info.func_name),
         axis_names,
@@ -655,6 +711,9 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
         mgpu.single_thread_predicate(per_block=False),
         smem_requested_bytes=math.prod(ir.MemRefType(runtime_smem.type).shape),
         smem_used_bytes=0,
+        tmem_requested_cols=tmem_cols,
+        tmem_used_cols=0,
+        tmem_base_ptr=runtime_tmem.address if runtime_tmem else None,
         runtime_barriers=grouped_barriers,
         name_stack=source_info_util.NameStack(),
         traceback_caches=mlir.TracebackCaches(),
@@ -671,6 +730,18 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
   smem_scratch_bytes = params.get("smem_scratch_bytes")
   if smem_scratch_bytes is None:
     smem_scratch_bytes = rs.smem_scratch_bytes
+  tmem_scratch_cols = rs.tmem_scratch_cols
+
+  scratch_buffers = [
+      jax.ShapeDtypeStruct(shape=[smem_scratch_bytes], dtype=np.int8),
+      rs.barriers,
+  ]
+  if tmem_scratch_cols > 0:
+    scratch_buffers.append(
+        mgpu.TMEM(shape=[tcgen05.TMEM_ROWS, tmem_scratch_cols], dtype=np.int32),
+    )
+  else:
+    scratch_buffers.append(None)
 
   prof_ctx = prof_spec = None
   if prof_space := params.get("profile_space", 0):
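Concretely, for a kernel that needs, say, 256 bytes of SMEM scratch and 16 TMEM columns, the scratch tree built above would look like the following sketch (names mirror the code; the numbers are illustrative):

scratch_buffers = [
    jax.ShapeDtypeStruct(shape=[256], dtype=np.int8),  # byte-addressed SMEM scratch
    rs.barriers,                                       # one entry per requested barrier
    mgpu.TMEM(shape=[128, 16], dtype=np.int32),        # 128 rows x 16 cols of TMEM
]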
@@ -685,10 +756,7 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
       block=block,
       in_shapes=in_shapes,
       out_shape=out_shapes,
-      smem_scratch_shape=(
-          jax.ShapeDtypeStruct(shape=[smem_scratch_bytes], dtype=np.int8),
-          rs.barriers,
-      ),
+      smem_scratch_shape=scratch_buffers,
       module_name=mlir.sanitize_name(debug_info.func_name),
       prof_spec=prof_spec,
   )
@@ -990,14 +1058,26 @@ def _ndindexer_indices(indexer: indexing.NDIndexer) -> tuple[gpu_core.Index, ...
 
 
 @register_lowering_rule(sp.get_p, mgpu.ThreadSemantics.Lane)
-def _get_lowering_rule(ctx: LoweringRuleContext, x_smem, *leaves, tree):
-  if not isinstance(x_smem, ir.Value) and ir.MemRefType.isinstance(x_smem):
-    raise TypeError(f"Can only load from references (got {x_smem}).")
+def _get_lowering_rule(ctx: LoweringRuleContext, x_ref, *leaves, tree):
+  if isinstance(x_ref, tcgen05.TMEMRef):
+    transforms = jax.tree.unflatten(tree, leaves)
+    if len(transforms) != 1 or not isinstance(
+        transforms[0], indexing.NDIndexer):
+      raise NotImplementedError(
+          "Only a single indexing transform is supported for TMEM refs.")
+    indexer = cast(indexing.NDIndexer, transforms[0])
+    if not gpu_core.is_trivial_index(indexer.indices, x_ref.shape):
+      raise NotImplementedError(
+          "Only trivial indexing is supported for TMEM refs.")
+    return x_ref[:]
+
+  if not isinstance(x_ref, ir.Value) and ir.MemRefType.isinstance(x_ref):
+    raise TypeError(f"Can only load from references (got {x_ref}).")
 
   x_aval = ctx.avals_in[0]
 
   transforms = jax.tree.unflatten(tree, leaves)
-  x_smem, transforms = _handle_reshaping(x_smem, transforms)
+  x_smem, transforms = _handle_reshaping(x_ref, transforms)
   x_smem, transforms = _handle_indexing(x_smem, transforms)
 
   match transforms:
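In practice the TMEM branch above means a kernel may load the whole ref but not a sub-slice. A sketch of what the rule accepts, assuming `is_trivial_index` treats full slices as trivial (the new test only exercises `tmem_ref[...]`):

# Inside a kernel, given tmem_ref of shape (128, 128):
x = tmem_ref[...]       # ok: trivial index covering the full ref
y = tmem_ref[0:64, :]   # NotImplementedError: only trivial indexing is supported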
@@ -1784,6 +1864,14 @@ def _run_scoped_lowering_rule(
       )
       input_refs.append(input_ref)
       should_discharge.append(False)
+    elif aval.memory_space == gpu_core.TMEM:
+      input_ref = alloc_stack.enter_context(
+          ctx.module_ctx.alloc_tmem(
+              jax.ShapeDtypeStruct(shape=aval.shape, dtype=aval.dtype),
+          )
+      )
+      input_refs.append(input_ref)
+      should_discharge.append(False)
     else:
       raise ValueError(f"Can't convert to ref: {aval}")
 
jax/experimental/mosaic/gpu/core.py

Lines changed: 2 additions & 0 deletions
@@ -307,6 +307,8 @@ def _smem_tree_size(smem_buffers: ShapeTree) -> int:
           raise NotImplementedError("Misaligned barrier allocation")
         size += num_barriers * utils.MBARRIER_BYTES
       case TMEM(_):
+        # TODO(justinfu): This can trigger misaligned barrier allocations
+        # if TMEM is requested before barriers b/c it's not divisible by 8.
        size += 4  # i32 takes up 4 bytes
       case _:
         size += _count_buffer_bytes(l)
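The TODO is an alignment-bookkeeping concern: each TMEM entry adds only 4 bytes (one i32 address slot) to the running SMEM size, while barriers need 8-byte alignment. A sketch of the arithmetic, assuming `utils.MBARRIER_BYTES == 8`:

MBARRIER_BYTES = 8   # assumed value of utils.MBARRIER_BYTES
size = 0
size += 4            # one TMEM ref: an i32 address slot
# If barriers were sized next, the offset would not be 8-byte aligned,
# which is the misalignment the TODO warns about.
assert size % MBARRIER_BYTES != 0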

jax/experimental/pallas/mosaic_gpu.py

Lines changed: 2 additions & 0 deletions
@@ -51,3 +51,5 @@
 GMEM = GPUMemorySpace.GMEM
 #: Alias of :data:`jax.experimental.pallas.mosaic_gpu.GPUMemorySpace.SMEM`.
 SMEM = GPUMemorySpace.SMEM
+#: Alias of :data:`jax.experimental.pallas.mosaic_gpu.GPUMemorySpace.TMEM`.
+TMEM = GPUMemorySpace.TMEM

tests/pallas/mosaic_gpu_test.py

Lines changed: 29 additions & 0 deletions
@@ -83,6 +83,13 @@ def setUp(self):
     super().setUp()
 
 
+class PallasSm100ATest(PallasTest, jtu.CudaArchSpecificTest):
+
+  def setUp(self):
+    self.skip_unless_sm100a()
+    super().setUp()
+
+
 class PallasCallTest(PallasTest):
 
   @parameterized.product(
@@ -1531,6 +1538,28 @@ def scope(acc_ref):
     np.testing.assert_allclose(res, a @ b, rtol=1e-3)
 
 
+class PallasCallSm100ATest(PallasSm100ATest):
+
+  def test_tmem_alloc(self):
+    mesh = plgpu.GPUMesh(num_threads=1, axis_names=("x"))
+    @pl.run_state
+    def inner(y_ref):
+      @pl.core_map(mesh)
+      def _():
+        def scope(tmem_ref, smem_ref):
+          # Issue a write so the TMEM load is not DCE'd.
+          smem_ref[...] = tmem_ref[...]
+          plgpu.commit_smem()
+          plgpu.copy_smem_to_gmem(smem_ref, y_ref)
+          plgpu.wait_smem_to_gmem(0)
+        pl.run_scoped(scope,
+                      plgpu.TMEM((128, 128), jnp.float32),
+                      plgpu.SMEM((128, 128), jnp.float32))
+    y_init = jnp.zeros((128, 128), np.float32)
+    # Test that this runs without errors.
+    jax.block_until_ready(inner(y_init))
+
+
 class PipelineTest(PallasTest):
 
   def test_pipeline_mode(self):
