[mgpu pallas] Add a `broadcasted_iota` operation that takes an explicit layout.

cperivol · Google-ML-Automation · commit 8477580d9513 · 2024-11-27T08:34:10.000-08:00
PiperOrigin-RevId: 700711177
diff --git a/jax/_src/pallas/mosaic_gpu/BUILD b/jax/_src/pallas/mosaic_gpu/BUILD
@@ -91,7 +91,7 @@ pytype_strict_library(
         ":lowering",
         "//jax",
         "//jax:core",
-        "//jax:effects",
+        "//jax:mlir",
         "//jax:mosaic_gpu",
         "//jax:tree_util",
         "//jax:util",
diff --git a/jax/_src/pallas/mosaic_gpu/primitives.py b/jax/_src/pallas/mosaic_gpu/primitives.py
@@ -25,8 +25,10 @@
 from jax._src import state
 from jax._src import tree_util
 from jax._src import util
+from jax._src.interpreters import mlir
 from jax._src.lib.mlir import ir
 from jax._src.lib.mlir.dialects import arith as arith_dialect
+from jax._src.lib.mlir.dialects import llvm as llvm_dialect
 from jax._src.lib.mlir.dialects import nvvm as nvvm_dialect
 from jax._src.pallas import core as pallas_core
 from jax._src.pallas.mosaic_gpu import core as gpu_core
@@ -692,3 +694,70 @@ def _commit_smem_lowering(ctx: lowering.LoweringRuleContext):
 def commit_smem():
   """Commits all writes to SMEM, making them visible to loads, TMA and WGMMA."""
   commit_smem_p.bind()
+
+
+broadcasted_iota_p = jax_core.Primitive("broadcasted_iota")
+
+@broadcasted_iota_p.def_abstract_eval
+def _broadcasted_iota_abstract_eval(dtype, shape, dimension, layout):
+  # The abstract value is determined by ``shape`` and ``dtype`` alone.
+  del layout, dimension
+  return jax_core.ShapedArray(shape, dtype)
+
+@lowering.register_lowering_rule(broadcasted_iota_p)
+def _broadcasted_iota_lowering(
+    ctx: lowering.LoweringRuleContext, dtype, shape, dimension, layout
+):
+  """Lowers ``broadcasted_iota`` to a ``FragmentedArray`` of indices.
+
+  Every element is its own index along ``dimension``, converted to ``dtype``.
+
+  Raises:
+    ValueError: If ``layout`` is ``None``.
+  """
+  del ctx  # Unused.
+  if layout is None:
+    # Fail with a clear message instead of an AttributeError on `.value`.
+    raise ValueError("broadcasted_iota requires an explicit layout")
+  mlir_dtype = mlir.dtype_to_ir_type(dtype)
+  if ir.FloatType.isinstance(mlir_dtype):
+    # arith.index_cast only yields integers, so go index -> i32 -> float.
+    i32 = ir.IntegerType.get_signless(32)
+    cast = lambda x: arith_dialect.uitofp(
+        mlir_dtype, arith_dialect.index_cast(i32, x)
+    )
+  else:
+    cast = lambda x: arith_dialect.index_cast(mlir_dtype, x)
+  # Signedness only applies to integer dtypes; anything else passes None.
+  is_signed = (
+      jnp.issubdtype(dtype, jnp.signedinteger)
+      if jnp.issubdtype(dtype, jnp.integer)
+      else None
+  )
+  # The undef splat only fixes shape and layout; `foreach` is expected to
+  # build the result from the values returned by `cast`.
+  return mgpu.FragmentedArray.splat(
+      llvm_dialect.mlir_undef(mlir_dtype),
+      shape,
+      layout.value,
+      is_signed=is_signed,
+  ).foreach(
+      lambda _, idx: cast(idx[dimension]),
+      create_array=True,
+      is_signed=is_signed,
+  )
+
+
+def broadcasted_iota(dtype, shape, dimension, *, layout: Layout | None = None):
+  """Creates an array whose elements are their index along ``dimension``.
+
+  Args:
+    dtype: Element type of the result.
+    shape: Shape of the result.
+    dimension: Axis of ``shape`` whose index each element holds.
+    layout: Layout of the result. Defaults to ``None``, but lowering needs an
+      actual ``Layout``.
+  """
+  return broadcasted_iota_p.bind(
+      dtype=jnp.dtype(dtype), shape=shape, dimension=dimension, layout=layout
+  )
diff --git a/jax/experimental/pallas/mosaic_gpu.py b/jax/experimental/pallas/mosaic_gpu.py
@@ -36,6 +36,7 @@
 from jax._src.pallas.mosaic_gpu.primitives import copy_smem_to_gmem as copy_smem_to_gmem
 from jax._src.pallas.mosaic_gpu.primitives import Layout as Layout
 from jax._src.pallas.mosaic_gpu.primitives import layout_cast as layout_cast
+from jax._src.pallas.mosaic_gpu.primitives import broadcasted_iota as broadcasted_iota
 from jax._src.pallas.mosaic_gpu.primitives import set_max_registers as set_max_registers
 from jax._src.pallas.mosaic_gpu.primitives import wait_smem_to_gmem as wait_smem_to_gmem
 from jax._src.pallas.mosaic_gpu.primitives import wgmma as wgmma
diff --git a/tests/pallas/mosaic_gpu_test.py b/tests/pallas/mosaic_gpu_test.py
@@ -241,6 +241,23 @@ def kernel(x_ref, o_ref):
     # are never written to.
     np.testing.assert_array_equal(kernel(x)[:, :16], y[:, :16])
 
+  def test_iota(self):
+    # int8 still represents every index (0..127) along a 128-long axis.
+    shape = (128, 128)
+    dtype = jnp.int8
+    dimension = 1
+
+    @functools.partial(
+        pl.pallas_call, out_shape=jax.ShapeDtypeStruct(shape, dtype)
+    )
+    def kernel(o_ref):
+      o_ref[...] = plgpu.broadcasted_iota(
+          dtype, shape, dimension, layout=plgpu.Layout.WGMMA
+      )
+
+    expected = jax.lax.broadcasted_iota(dtype, shape, dimension)
+    np.testing.assert_array_equal(kernel(), expected)
+
   @parameterized.product(indexer=[..., slice(128), slice(None, 128)])
   def test_copy_smem_to_gmem(self, indexer):
     @functools.partial(