
Commit c4ac0dd

bixia1 authored and Google-ML-Automation committed
Implement the extension to the custom_partitioning API.
Add a sharding rule string and trailing factor_sizes to def_partition, to provide a sharding rule specification when Shardy is used. We use this information to construct a SdyShardingRule and invoke SdyShardingRule.build during MLIR lowering.

Extend the custom_partitioner tests in pjit_test.py to cover the Shardy sharding rule.

PiperOrigin-RevId: 713399604

1 parent b3833dc commit c4ac0dd
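
For orientation, a minimal sketch (not part of this commit) of how the new sharding_rule argument to def_partition is used; the function and the two partitioning callbacks below are simplified stand-ins, and the rule string is only consulted when Shardy is enabled:

    import jax
    from jax.experimental.custom_partitioning import custom_partitioning

    @custom_partitioning
    def double(x):
      return 2 * x

    # Simplified GSPMD callbacks; see the custom_partitioning docs for
    # realistic implementations.
    def partition(mesh, arg_shapes, result_shape):
      arg_shardings = jax.tree.map(lambda s: s.sharding, arg_shapes)
      def lower_fn(x):
        return 2 * x
      return mesh, lower_fn, result_shape.sharding, arg_shardings

    def infer_sharding_from_operands(mesh, arg_shapes, result_shape):
      return arg_shapes[0].sharding

    double.def_partition(
        partition=partition,
        infer_sharding_from_operands=infer_sharding_from_operands,
        # Einsum-like rule: the 2D result is sharded like the 2D operand.
        sharding_rule='i j -> i j',
    )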

File tree

4 files changed (+70 −16 lines)


jax/_src/custom_partitioning.py

Lines changed: 44 additions & 7 deletions
@@ -28,13 +28,15 @@
 import jax
 from jax import tree_util
 from jax._src import api_util
+from jax._src import config
 from jax._src import core
 from jax._src import custom_api_util
 from jax._src import dispatch
 from jax._src import linear_util as lu
 from jax._src import mesh as mesh_lib
 from jax._src import sharding_impls
 from jax._src import xla_bridge as xb
+from jax._src.custom_partitioning_sharding_rule import sdy_sharding_rule_to_mlir, SdyShardingRule, str_to_sdy_sharding_rule
 from jax._src.interpreters import mlir
 from jax._src.interpreters import partial_eval as pe
 from jax._src.lib import xla_client as xc
@@ -225,18 +227,20 @@ def _custom_partitioning_abstract_eval(*avals, call, in_tree, out_tree,
                                        propagate_user_sharding, partition,
                                        infer_sharding_from_operands,
                                        decode_shardings,
+                                       sharding_rule,
                                        static_args):
   del in_tree, out_tree, propagate_user_sharding, partition
-  del infer_sharding_from_operands, decode_shardings, static_args
+  del infer_sharding_from_operands, decode_shardings, sharding_rule
+  del static_args
   return call.out_avals


 def _custom_partitioning_impl(*args, call, in_tree, out_tree,
                               propagate_user_sharding,
                               partition, infer_sharding_from_operands,
-                              decode_shardings, static_args):
+                              decode_shardings, sharding_rule, static_args):
   del in_tree, out_tree, propagate_user_sharding, partition
-  del infer_sharding_from_operands, decode_shardings, static_args
+  del infer_sharding_from_operands, decode_shardings, static_args, sharding_rule
   return core.jaxpr_as_fun(call)(*args)
@@ -281,7 +285,14 @@ def infer_sharding_from_operands(mesh, arg_shapes, shape):
     arg_shardings = jax.tree.map(lambda x: x.sharding, arg_shapes)


-    f.def_partition(partition, propagate_user_sharding, infer_sharding_from_operands)
+    f.def_partition(partition, propagate_user_sharding,
+                    infer_sharding_from_operands=infer_sharding_from_operands,
+                    sharding_rule='i j -> i j')
+  When config.use_shardy_partitioner.value is True, the sharding_rule is
+  used; otherwise, propagate_user_sharding and infer_sharding_from_operands
+  are used.
+  Instead of using an Einsum-like notation string, sharding_rule can also be
+  a SdyShardingRule object, such as sharding_rule=SdyShardingRule(("i", "j"), ("i", "j")).

  The args to ``def_partition`` are as follows:

@@ -298,6 +309,10 @@ def infer_sharding_from_operands(mesh, arg_shapes, shape):
   * ``decode_shardings``: When set to True, convert input ``GSPMDSharding``s to
     ``NamedSharding`` if possible. This may not be possible if the user does not
     provide a contextual mesh.
+  * ``sharding_rule``: Either an SdyShardingRule object or an Einsum-like
+    notation string that describes the sharding rule. We borrow the idea from
+    the einops.rearrange string, using a space separator between factors and
+    allowing multi-letter factor names.

   Positional arguments can be specified as static using static_argnums. JAX uses
   :code:`inspect.signature(fun)` to resolve these positional arguments.
@@ -350,9 +365,16 @@ def infer_sharding_from_operands(mesh, arg_shapes, result_shape):
   def my_fft(x):
     return fft(x)

+  # Use Einsum-like notation to specify the sharding rule.
   my_fft.def_partition(
-    infer_sharding_from_operands=infer_sharding_from_operands,
-    partition=partition)
+    infer_sharding_from_operands=infer_sharding_from_operands,
+    partition=partition,
+    sharding_rule='...i -> ...i')
+  # Use SdyShardingRule object to specify the sharding rule.
+  my_fft.def_partition(
+    infer_sharding_from_operands=infer_sharding_from_operands,
+    partition=partition,
+    sharding_rule=SdyShardingRule(operand_mappings=((SDY_BATCHING, 'i'),), result_mappings=((SDY_BATCHING, 'i'),)))

   Now create a 2D array sharded along the first axis, pass it through ``my_fft``
   and notice how it is still sharded as expected, and identical to the output
@@ -425,15 +447,25 @@ def __init__(self, fun, static_argnums=()):
     self.static_argnums = static_argnums
     self.propagate_user_sharding = None
     self.infer_sharding_from_operands = None
+    self.sharding_rule = None

   __getattr__: Any = custom_api_util.forward_attr

   def def_partition(self, partition, infer_sharding_from_operands,
-                    propagate_user_sharding=None, decode_shardings=True):
+                    propagate_user_sharding=None, decode_shardings=True,
+                    sharding_rule=None):
+    if config.use_shardy_partitioner.value:
+      infer_sharding_from_operands = None
+      propagate_user_sharding = None
+    else:
+      sharding_rule = None
     self.partition = partition
     self.propagate_user_sharding = propagate_user_sharding
     self.infer_sharding_from_operands = infer_sharding_from_operands
     self.decode_shardings = decode_shardings
+    self.sharding_rule = None if sharding_rule is None \
+        else sharding_rule if isinstance(sharding_rule, SdyShardingRule) \
+        else str_to_sdy_sharding_rule(sharding_rule)
     return partition

   def __call__(self, *args, **kwargs):
@@ -471,6 +503,7 @@ def __call__(self, *args, **kwargs):
         propagate_user_sharding=self.propagate_user_sharding,
         infer_sharding_from_operands=self.infer_sharding_from_operands,
         decode_shardings=self.decode_shardings,
+        sharding_rule=self.sharding_rule,
         in_tree=in_tree,
         out_tree=out_tree(),
         static_args=static_args
@@ -483,6 +516,7 @@ def _custom_partitioning_lowering_rule(ctx: mlir.LoweringRuleContext, *values,
                                            propagate_user_sharding, partition,
                                            infer_sharding_from_operands,
                                            decode_shardings,
+                                           sharding_rule,
                                            static_args):
   axis_context = ctx.module_context.axis_context
   if (isinstance(axis_context, sharding_impls.SPMDAxisContext) and
if (isinstance(axis_context, sharding_impls.SPMDAxisContext) and
@@ -539,6 +573,9 @@ def to_mesh_pspec_sharding(hlo_sharding: xc.HloSharding | None, ndim):
539573
backend_config=ir.StringAttr.get(key),
540574
operand_layouts=None,
541575
result_layouts=None)
576+
if sharding_rule is not None:
577+
value_types = [mlir.aval_to_ir_type(s) for s in call.in_avals]
578+
out.attributes['sdy.sharding_rule'] = sdy_sharding_rule_to_mlir(sharding_rule, value_types, result_types)
542579
return out.results
543580

544581
mlir.register_lowering(custom_partitioning_p,
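
As an aside, the Einsum-like string notation described in the docstring above also covers multi-operand, multi-result ops. A hedged sketch, where the string is my own rendering of the SdyShardingRule object used in the updated matmul test in pjit_test.py further down:

    # Intended to mirror
    #   SdyShardingRule(operand_mappings=(('i', 'j'), ('j', 'k')),
    #                   result_mappings=(('i', 'k'), ('i', 'k')))
    # i.e. two operands and two identically mapped results.
    matmul_like_rule = 'i j, j k -> i k, i k'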

jax/_src/custom_partitioning_sharding_rule.py

Lines changed: 9 additions & 2 deletions
@@ -27,6 +27,9 @@
 # leading ... into factors.
 _BATCHING_DIM_FACTOR_PREFIX = "?"

+# A Jax value in general corresponds to an ir.Type or a tuple of ir.Types.
+IrTypes = ir.Type | tuple[ir.Type, ...]
+
 def _check_factor(factor:str):
   """Validates a factor.

@@ -278,8 +281,8 @@ def str_to_sdy_sharding_rule(rule: str, **factor_sizes) -> SdyShardingRule:

 def sdy_sharding_rule_to_mlir(
     rule: SdyShardingRule,
-    operand_types: list[ir.Type],
-    result_types: list[ir.Type],) -> ir.Attribute:
+    operand_types: list[IrTypes],
+    result_types: list[IrTypes],) -> ir.Attribute:
   """Builds the MLIR representation for the sharding rule.

   This is done by verifying that the rule is consistent with the types of
@@ -294,6 +297,10 @@ def sdy_sharding_rule_to_mlir(
     raise ValueError(
         f"Sharding rule has {len(rule.result_mappings)} results, but the operation"
         f" has {len(result_types)} results")
+  if not all(isinstance(t, ir.Type) for t in operand_types + result_types):
+    raise TypeError(
+        f"operand_types and result_types must be a list of ir.Type, but got"
+        f" {operand_types} and {result_types}")

   factors_to_indices_sizes: OrderedDict[str, list[int]] = OrderedDict()
   types = operand_types + result_types

jax/experimental/custom_partitioning.py

Lines changed: 7 additions & 0 deletions
@@ -19,3 +19,10 @@
     custom_partitioning as custom_partitioning,
     custom_partitioning_p as custom_partitioning_p,
 )
+
+from jax._src.custom_partitioning_sharding_rule import (
+    BATCHING as BATCHING,
+    CompoundFactor as CompoundFactor,
+    ArrayMapping as ArrayMapping,
+    SdyShardingRule as SdyShardingRule,
+)
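
With these re-exports, user code can spell a rule either as a string or as an SdyShardingRule built from the exported names. A small sketch, assuming the two forms below are equivalent (as the def_partition docstring above suggests):

    from jax.experimental.custom_partitioning import SdyShardingRule, BATCHING

    # String form: a leading batching dimension followed by one named factor.
    rule_as_string = '...i -> ...i'

    # Object form of the same rule, using the exported BATCHING marker.
    rule_as_object = SdyShardingRule(
        operand_mappings=((BATCHING, 'i'),),
        result_mappings=((BATCHING, 'i'),))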

tests/pjit_test.py

Lines changed: 10 additions & 7 deletions
@@ -43,7 +43,7 @@
 from jax.sharding import PartitionSpec as P, Mesh
 from jax.experimental import multihost_utils
 from jax.experimental.shard_map import shard_map
-from jax.experimental.custom_partitioning import custom_partitioning
+from jax.experimental.custom_partitioning import custom_partitioning, SdyShardingRule, BATCHING
 from jax._src import array
 from jax._src.sharding import Sharding, common_devices_indices_map
 from jax._src import op_shardings
@@ -1320,9 +1320,6 @@ class CustomPartitionerTest(jtu.JaxTestCase):
   def skip_if_custom_partitioning_not_supported(self):
     if jtu.is_cloud_tpu():
       raise unittest.SkipTest("Custom partitioning is not supported on libtpu.")
-    if config.use_shardy_partitioner.value:
-      self.skipTest(
-          'Custom partitioning is not supported with Shardy yet.')

   @jtu.skip_on_devices('cpu')  # Collectives don't seem to work on CPU.
   @jtu.with_mesh([('x', 4), ('y', 2)])
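
With the Shardy skip removed, these tests now run under both partitioners. For local experimentation, the partitioner can be flipped through the usual jax.config mechanism; the flag name below follows the jax_* naming convention behind config.use_shardy_partitioner and should be treated as an assumption:

    import jax

    # Enable Shardy so that def_partition keeps sharding_rule and drops the
    # infer_sharding_from_operands / propagate_user_sharding callbacks.
    jax.config.update('jax_use_shardy_partitioner', True)
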
@@ -1366,7 +1363,8 @@ def f(x, y, precision=None):

     f.def_partition(
         infer_sharding_from_operands=infer_sharding_from_operands,
-        partition=partition)
+        partition=partition,
+        sharding_rule=SdyShardingRule(operand_mappings=(('i', 'j'), ('j', 'k')), result_mappings=(('i', 'k'), ('i', 'k'))))

     pjit_f = pjit(f, in_shardings=(P('x'), P('y')), out_shardings=P('x'))
     x = np.asarray(np.random.randint(0, 20, (32, 16)), dtype=np.float32)
@@ -1406,6 +1404,7 @@ def f(x):
         infer_sharding_from_operands=infer_sharding_from_operands,
         partition=partition,
         propagate_user_sharding=propagate_user_sharding,
+        sharding_rule='i j -> i j',
     )

     def f2(a):
@@ -1442,7 +1441,7 @@ def f(x):
     f.def_partition(
         infer_sharding_from_operands=infer_sharding_from_operands,
         partition=partition,
-    )
+        sharding_rule=SdyShardingRule(operand_mappings=((BATCHING, 'i'),), result_mappings=((BATCHING, 'i'),)))

     pjit_f = pjit(f, in_shardings=(P(None, 'x')), out_shardings=P('x'))
     x = np.asarray(np.random.randint(0, 20, (32, 16)), dtype=np.float32)
@@ -1474,6 +1473,7 @@ def f(x):
     f.def_partition(
         infer_sharding_from_operands=infer_sharding_from_operands,
         partition=partition,
+        sharding_rule='i j -> i j',
     )

     pjit_f = pjit(f, in_shardings=(P(None, 'x')), out_shardings=P('x'))
@@ -1520,6 +1520,7 @@ def infer_sharding_from_operands(mesh, arg_shapes, result_shape):
     f.def_partition(
         infer_sharding_from_operands=infer_sharding_from_operands,
         partition=partition,
+        sharding_rule='i -> i',
     )

     jit_f = jax.jit(f)
@@ -1552,7 +1553,8 @@ def f(carry, x):
     f.def_partition(
         partition,
         infer_sharding_from_operands=lambda mesh, *_: NamedSharding(mesh, P()),
-        propagate_user_sharding=lambda _, user_shape: user_shape.sharding)
+        propagate_user_sharding=lambda _, user_shape: user_shape.sharding,
+        sharding_rule='i j -> ')  # Result is a scalar.

     pjit_f = pjit(f, in_shardings=P(None, 'x'))
     xs = jnp.ones([32, 16])
@@ -1588,6 +1590,7 @@ def infer_sharding_from_operands(mesh, arg_shapes, result_shape):
     f.def_partition(
         infer_sharding_from_operands=infer_sharding_from_operands,
         partition=partition,
+        sharding_rule='i -> i',
     )

     mesh = jtu.create_mesh((4,), ('x',))
