
Commit 051687d

superbobry authored and Google-ML-Automation committed
[pallas] pallas_call_p is now parameterized by a mesh
The mesh is necessary to add support for clusters to the Mosaic GPU backend.

PiperOrigin-RevId: 737792129
1 parent b496613 · commit 051687d

10 files changed (+133, -38 lines)

jax/_src/pallas/core.py

Lines changed: 21 additions & 7 deletions

@@ -15,6 +15,7 @@
 """Module for pallas-core functionality."""
 from __future__ import annotations
 
+import collections
 from collections.abc import Callable, Iterable, Iterator, Sequence
 import contextlib
 import copy
@@ -1068,16 +1069,26 @@ def _core_map_abstract_eval(*args, jaxpr, mesh, **_):
   return [], effs
 
 
+class Mesh(Protocol):
+
+  @property
+  def backend(self) -> str:
+    ...
+
+  @property
+  def shape(self) -> collections.OrderedDict[object, int]:
+    ...
+
+
 _core_map_mesh_rules: dict[type[Any], Callable[..., Any]] = {}
 
 
 def default_mesh_discharge_rule(
     in_avals,
     out_avals,
     *args,
-    grid,
+    mesh,
     compiler_params,
-    backend,
     jaxpr,
     debug,
     interpret,
@@ -1100,19 +1111,22 @@ def body(*args):
       if isinstance(eff, state_types.WriteEffect)
   )
   any_spec = BlockSpec(memory_space=MemorySpace.ANY)
+  grid_spec = GridSpec(
+      grid=tuple(mesh.shape.items()),
+      in_specs=[any_spec] * len(in_avals),
+      out_specs=[any_spec] * len(modified_idxs),
+  )
   from jax._src.pallas import pallas_call  # Avoid circular dependency.
-  outs = pallas_call.pallas_call(
+  outs = pallas_call._pallas_call(
       body,
       name=name,
       out_shape=[in_avals[idx] for idx in modified_idxs],
-      in_specs=[any_spec] * len(in_avals),
-      out_specs=[any_spec] * len(modified_idxs),
       input_output_aliases={
           in_idx: out_idx for out_idx, in_idx in enumerate(modified_idxs)
       },
-      grid=grid,
+      grid_spec=grid_spec,
+      mesh=mesh,
       compiler_params=compiler_params,
-      backend=backend,
       interpret=interpret,
       debug=debug,
       cost_estimate=cost_estimate,
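
The new pallas_core.Mesh protocol asks a mesh only for a backend string and an ordered shape, and default_mesh_discharge_rule now derives the launch grid from that shape instead of taking grid and backend explicitly. A minimal sketch of a conforming mesh (the ToyMesh class and its axis sizes are illustrative, not part of this commit):

import collections

class ToyMesh:
  """Hypothetical mesh satisfying the pallas_core.Mesh protocol."""

  @property
  def backend(self) -> str:
    return "mosaic_tpu"  # selects which pallas_call backend handles the map

  @property
  def shape(self) -> collections.OrderedDict[object, int]:
    return collections.OrderedDict([("x", 2), ("y", 4)])

mesh = ToyMesh()
grid = tuple(mesh.shape.items())  # mirrors the GridSpec built in default_mesh_discharge_rule
assert grid == (("x", 2), ("y", 4))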

jax/_src/pallas/hlo_interpreter.py

Lines changed: 2 additions & 1 deletion

@@ -340,11 +340,12 @@ def pallas_call_hlo_interpret(
     debug: bool,
     input_output_aliases: tuple[tuple[int, int], ...],
     grid_mapping: GridMapping,
+    mesh: pallas_core.Mesh | None,
     compiler_params: Any,
     cost_estimate: CostEstimate,
     out_avals: tuple[jax_core.AbstractValue, ...],
 ):
-  del compiler_params, cost_estimate, out_avals
+  del mesh, compiler_params, cost_estimate, out_avals
   debug_info = jaxpr.debug_info
   # If we're in interpret mode, we *scan* over the grid and eval the
   # discharged jaxpr.

jax/_src/pallas/mosaic/core.py

Lines changed: 5 additions & 3 deletions

@@ -211,6 +211,10 @@ class TensorCoreMesh:
   devices: np.ndarray
   axis_names: Sequence[str]
 
+  @property
+  def backend(self) -> str:
+    return "mosaic_tpu"
+
   @property
   def shape(self):
     return collections.OrderedDict(zip(self.axis_names, self.devices.shape))
@@ -259,7 +263,6 @@ def _tensorcore_mesh_discharge_rule(
     compiler_params = TPUCompilerParams()
   if len(mesh.shape) > 1:
     raise NotImplementedError("Mesh must be 1D")
-  core_axis_name, num_cores = list(mesh.shape.items())[0]
   if compiler_params.dimension_semantics is not None:
     raise ValueError(
         "dimension_semantics must be None for TensorCoreMesh"
@@ -269,13 +272,12 @@
       out_avals,
       *args,
       jaxpr=jaxpr,
-      grid=((core_axis_name, num_cores),),
+      mesh=mesh,
      compiler_params=compiler_params.replace(
          dimension_semantics=(PARALLEL,)
      ),
      debug=debug,
      interpret=interpret,
-      backend="mosaic_tpu",
      cost_estimate=cost_estimate,
      name=name,
  )
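
Because TensorCoreMesh now reports its own backend, _tensorcore_mesh_discharge_rule no longer passes backend="mosaic_tpu" or an explicit grid; both are recovered from the mesh. A hedged sketch (assumes the pltpu.create_tensorcore_mesh helper, which needs only an axis name here, and a TPU runtime for the core count):

from jax.experimental.pallas import tpu as pltpu

mesh = pltpu.create_tensorcore_mesh("core")
assert mesh.backend == "mosaic_tpu"
grid = tuple(mesh.shape.items())  # e.g. (("core", num_cores),), what the default rule now builds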

jax/_src/pallas/mosaic/interpret.py

Lines changed: 2 additions & 1 deletion

@@ -1351,12 +1351,13 @@ def interpret_pallas_call(
     debug: bool,
     input_output_aliases: tuple[tuple[int, int], ...],
     grid_mapping: GridMapping,
+    mesh: pallas_core.Mesh | None,
     compiler_params: Any,
     cost_estimate: CostEstimate,
     out_avals: tuple[jax_core.AbstractValue, ...],
     interpret_params: TPUInterpretParams,
 ):
-  del debug, cost_estimate, out_avals
+  del debug, mesh, cost_estimate, out_avals
 
   # args contains: *dynamic_grid_sizes, *index, *inputs. (No consts?)
   dynamic_grid_args, scalars, input_args = split_list(

jax/_src/pallas/mosaic/pallas_call_registration.py

Lines changed: 9 additions & 7 deletions

@@ -108,6 +108,7 @@ def pallas_call_tpu_lowering_rule(
     *in_nodes,
     jaxpr: jax_core.Jaxpr,
     grid_mapping: core.GridMapping,
+    mesh: pallas_core.Mesh | None,
     input_output_aliases: tuple[tuple[int, int], ...],
     debug: bool,
     interpret: bool,
@@ -116,7 +117,8 @@
     out_avals: tuple[jax_core.AbstractValue, ...],
 ):
   """Lowers a pallas_call to a Mosaic TPU custom call."""
-  del interpret
+  del mesh, interpret  # Unused.
+
   debug_info = jaxpr._debug_info
   if debug:
     print(f"\nThe kernel jaxpr for pallas_call {debug_info.func_src_info}:")
@@ -126,11 +128,11 @@
   else:
     mosaic_params = {}
 
-  mesh = None
+  jax_mesh = None
   axis_context = ctx.module_context.axis_context
   if axis_context is not None:
     if isinstance(axis_context, sharding_impls.SPMDAxisContext):
-      mesh = axis_context.mesh
+      jax_mesh = axis_context.mesh
   mlir_ctx = mlir.JaxIrContext()
   mlir_ctx.append_dialect_registry(mlir.upstream_dialects)
   mlir_ctx.load_all_available_dialects()
@@ -147,7 +149,7 @@ def lower_module(for_verification: bool):
         grid_mapping,
         jaxpr,
         dimension_semantics=dimension_semantics,
-        mesh=mesh,
+        mesh=jax_mesh,
         for_verification=for_verification,
         dynamic_shape_replacement_enabled=pallas_core.dynamic_shapes_export_enabled(),
     )
@@ -164,11 +166,11 @@
   )
 
   if promela_dump_path := _DUMP_PROMELA_TO.value:
-    num_devices = 1 if mesh is None else mesh.devices.size
+    num_devices = 1 if jax_mesh is None else jax_mesh.devices.size
     num_cores = (
         jax.devices()[0].num_cores
-        if mesh is None
-        else mesh.devices[0].num_cores
+        if jax_mesh is None
+        else jax_mesh.devices[0].num_cores
     )
     verification_module, _ = lower_module(for_verification=True)
     model = verification.export_promela_model(

jax/_src/pallas/mosaic_gpu/core.py

Lines changed: 11 additions & 5 deletions

@@ -18,7 +18,7 @@
 
 import abc
 import collections
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 import dataclasses
 import enum
 import itertools as it
@@ -519,9 +519,16 @@ def __post_init__(self):
     )
 
   @property
-  def shape(self):
+  def backend(self) -> str:
+    return "mosaic_gpu"
+
+  @property
+  def shape(self) -> collections.OrderedDict[object, int]:
+    pairs: Iterable[tuple[object, int]]
     if self.num_threads is not None:
-      pairs = zip(self.axis_names, (*self.grid, *self.cluster, self.num_threads))
+      pairs = zip(
+          self.axis_names, (*self.grid, *self.cluster, self.num_threads)
+      )
     else:
       pairs = tuple(
           zip(
@@ -563,8 +570,7 @@ def _gpu_mesh_discharge_rule(
       out_avals,
       *args,
       jaxpr=jaxpr,
-      grid=tuple(mesh.shape.items()),
-      backend="mosaic_gpu",
+      mesh=mesh,
      compiler_params=compiler_params,
      debug=debug,
      interpret=interpret,
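
GPUMesh satisfies the same protocol: backend is "mosaic_gpu" and shape lists the grid axes, then the cluster axes, then the thread axis. A hedged sketch (assumes GPUMesh accepts grid/cluster/num_threads/axis_names keyword arguments, as the fields read in shape suggest):

import collections
from jax.experimental.pallas import mosaic_gpu as plgpu

mesh = plgpu.GPUMesh(
    grid=(4,),
    cluster=(2,),
    num_threads=2,
    axis_names=("x", "cluster", "wg"),
)
assert mesh.backend == "mosaic_gpu"
# Axis order: grid, then cluster, then the warpgroup/thread axis.
assert mesh.shape == collections.OrderedDict([("x", 4), ("cluster", 2), ("wg", 2)])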

jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 8 additions & 2 deletions

@@ -450,6 +450,7 @@ def index_map(*indices):
 
 def lower_pipelined_jaxpr_to_module(
     grid_mapping: pallas_core.GridMapping,
+    mesh: pallas_core.Mesh | None,
     jaxpr: jax_core.Jaxpr,
     compiler_params: dict[str, Any],
     cost_estimate: pallas_core.CostEstimate | None,
@@ -473,7 +474,10 @@
       block_mappings, [grid_mapping.num_inputs]
   )
 
-  if grid_mapping.grid_names:  # Last dim corresponds to the warpgroup count
+  if mesh is not None:
+    assert isinstance(mesh, gpu_core.GPUMesh)
+  if mesh and mesh.num_threads is not None:
+    # Last dim corresponds to the warpgroup count.
     block = (128 * grid_mapping.grid[-1], 1, 1)
     grid = grid_mapping.grid[:-1]
   else:
@@ -566,6 +570,7 @@ def body_fn(*refs):
       parallel_grid,
       grid_mapping.grid_names,
       block,
+      mesh.cluster if mesh is not None else (),
      [bm.array_shape_dtype for bm in in_block_mappings],
      [bm.array_shape_dtype for bm in out_block_mappings],
      new_jaxpr,
@@ -578,6 +583,7 @@ def lower_jaxpr_to_module(
     grid: Sequence[int],
     grid_names: Sequence[str],
     block: Sequence[int],
+    cluster: Sequence[int],
     in_shapes: Sequence[jax.ShapeDtypeStruct],
     out_shapes: Sequence[jax.ShapeDtypeStruct],
     jaxpr: jax_core.Jaxpr,
@@ -640,7 +646,7 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
   mgpu_core._lower_as_gpu_kernel(
       body,
       grid=parallel_grid,
-      cluster=(),
+      cluster=cluster,
      block=block,
      in_shapes=in_shapes,
      out_shape=out_shapes,
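
When the mesh declares num_threads, the trailing grid dimension is the warpgroup count and each warpgroup contributes 128 threads to the CUDA block; the mesh's cluster is now forwarded to _lower_as_gpu_kernel instead of the hard-coded (). A worked example with illustrative numbers:

grid = (16, 2)                   # grid_mapping.grid: 16 programs x 2 warpgroups
block = (128 * grid[-1], 1, 1)   # -> (256, 1, 1) threads per block
parallel_grid = grid[:-1]        # -> (16,)
cluster = (2,)                   # mesh.cluster, passed through to the kernel launch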

jax/_src/pallas/mosaic_gpu/pallas_call_registration.py

Lines changed: 2 additions & 0 deletions

@@ -38,6 +38,7 @@ def pallas_call_lowering(
     debug: bool,
     input_output_aliases: tuple[tuple[int, int], ...],
     grid_mapping: pallas_core.GridMapping,
+    mesh: pallas_core.Mesh | None,
     compiler_params: dict[str, Any],
     cost_estimate: pallas_core.CostEstimate | None,
     out_avals: tuple[jax_core.AbstractValue, ...],
@@ -63,6 +64,7 @@ def pallas_call_lowering(
 
   lowering_result = lowering.lower_pipelined_jaxpr_to_module(
       grid_mapping,
+      mesh,
      jaxpr,
      compiler_params,
      cost_estimate,
