
Commit 3848f0d

yashk2810 authored and Google-ML-Automation committed
[sharding_in_types] Functions like einsum, reshape, broadcast_in_dim, broadcasted_iota, convert_element_type and sharding_cast that take out_sharding as an argument in their signature should also accept a PartitionSpec, not just a NamedSharding, as input.

If a PartitionSpec is passed, the mesh is read from the context. The primitives themselves still take only `NamedSharding`; the conversion from `PartitionSpec` to `NamedSharding` happens above `.bind`. For the above functions, we also raise an error if the `PartitionSpec` contains mesh axis names whose type is Auto or Collective.

PiperOrigin-RevId: 713352542
1 parent c1a60c6 commit 3848f0d

File tree

8 files changed: +120 −61 lines
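A minimal sketch of the pattern this commit describes: API-level functions accept either a NamedSharding or a bare PartitionSpec, and a PartitionSpec is resolved against the abstract mesh read from the current context before the primitive's `.bind` is called. The helper name `_resolve_out_sharding` is illustrative only; the real helper is `canonicalize_sharding` in jax/_src/sharding_impls.py below.

# Illustrative only: mirrors the PartitionSpec -> NamedSharding promotion this commit
# performs above `.bind`; the real implementation is canonicalize_sharding (see below).
from jax._src import mesh as mesh_lib
from jax.sharding import NamedSharding, PartitionSpec

def _resolve_out_sharding(out_sharding):
  if isinstance(out_sharding, PartitionSpec):
    # The mesh is read from the surrounding context rather than passed explicitly.
    return NamedSharding(mesh_lib.get_abstract_mesh(), out_sharding)
  return out_sharding  # already a NamedSharding, or None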

jax/BUILD

Lines changed: 1 addition & 0 deletions
@@ -866,6 +866,7 @@ pytype_strict_library(
         ":partition_spec",
         ":sharding",
         ":sharding_specs",
+        ":source_info_util",
         ":tree_util",
         ":util",
         ":xla_bridge",

jax/_src/core.py

Lines changed: 3 additions & 0 deletions
@@ -1696,6 +1696,9 @@ def __init__(self, shape, dtype, weak_type=False, sharding=None):
     self.weak_type = weak_type
     if config.sharding_in_types.value:
       self.sharding = get_sharding(sharding, len(self.shape))
+      if not isinstance(self.sharding.mesh, mesh_lib.AbstractMesh):
+        raise ValueError(
+            f"Mesh of an aval must be an AbstractMesh. Got {self.sharding.mesh}")

   def update(self, shape=None, dtype=None, weak_type=None, **kwargs):
     if shape is None:
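A hedged note on the hunk above: with sharding_in_types enabled, the sharding stored on an aval must be backed by an AbstractMesh (a shape-and-names-only mesh), never a concrete device Mesh. The helper below merely restates that check for illustration; it is not part of the diff.

# Illustrative restatement of the invariant added to ShapedArray.__init__ above.
from jax._src import mesh as mesh_lib

def _check_aval_mesh(sharding):
  if not isinstance(sharding.mesh, mesh_lib.AbstractMesh):
    raise ValueError(
        f"Mesh of an aval must be an AbstractMesh. Got {sharding.mesh}")
  return sharding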

jax/_src/lax/lax.py

Lines changed: 10 additions & 6 deletions
@@ -64,7 +64,7 @@
 from jax._src.lib.mlir.dialects import chlo
 from jax._src.lib.mlir.dialects import hlo
 from jax._src.sharding_impls import (PmapSharding, NamedSharding,
-                                     PartitionSpec as P)
+                                     PartitionSpec as P, canonicalize_sharding)
 from jax._src.typing import Array, ArrayLike, DimSize, DuckTypedArray, DTypeLike, Shape
 from jax._src.util import (NumpyComplexWarning, cache, canonicalize_axis,
                            safe_map, safe_zip, split_list, weakref_lru_cache)
@@ -586,6 +586,8 @@ def _convert_element_type(
       isinstance(operand, Array)):
     sharding = operand.sharding

+  sharding = canonicalize_sharding(sharding, check_mesh_consistency=False)  # type: ignore
+
   if (warn_on_complex_to_real_cast and
       dtypes.issubdtype(old_dtype, np.complexfloating) and
       not dtypes.issubdtype(new_dtype, np.complexfloating)):
@@ -1431,6 +1433,7 @@ def broadcast_in_dim(operand: ArrayLike, shape: Shape,
   if not config.sharding_in_types.value and sharding is not None:
     raise NotImplementedError("sharding argument to broadcast_in_dim is only "
                               "allowed when sharding_in_types config is on.")
+  sharding = canonicalize_sharding(sharding)
   if (np.ndim(operand) == len(shape) and not len(broadcast_dimensions) and
       isinstance(operand, Array) and sharding is None):
     return operand
@@ -1505,7 +1508,7 @@ def reshape(operand: ArrayLike, new_sizes: Shape,
     return operand
   else:
     dyn_shape, static_new_sizes = _extract_tracers_dyn_shape(new_sizes)
-
+    sharding = canonicalize_sharding(sharding)
     return reshape_p.bind(
       operand, *dyn_shape, new_sizes=tuple(static_new_sizes),
       dimensions=None if dims is None or same_dims else dims,
@@ -1947,19 +1950,20 @@ def iota(dtype: DTypeLike, size: int) -> Array:
   return broadcasted_iota(dtype, (size,), 0)

 def broadcasted_iota(dtype: DTypeLike, shape: Shape, dimension: int,
-                     _sharding=None) -> Array:
+                     sharding=None) -> Array:
   """Convenience wrapper around ``iota``."""
   dtype = dtypes.canonicalize_dtype(dtype)
   shape = canonicalize_shape(shape)
   dynamic_shape = [d for d in shape if isinstance(d, core.Tracer)]
   static_shape = [None if isinstance(d, core.Tracer) else d for d in shape]
   dimension = core.concrete_or_error(
       int, dimension, "dimension argument of lax.broadcasted_iota")
-  if not config.sharding_in_types.value and _sharding is not None:
+  if not config.sharding_in_types.value and sharding is not None:
     raise NotImplementedError('sharding support for broadcasted_iota is not '
                               'implemented outside of sharding_in_types mode.')
+  sharding = canonicalize_sharding(sharding)
   return iota_p.bind(*dynamic_shape, dtype=dtype, shape=tuple(static_shape),
-                     dimension=dimension, sharding=_sharding)
+                     dimension=dimension, sharding=sharding)

 def _eye(dtype: DTypeLike, shape: Shape, offset: DimSize = 0) -> Array:
   """Like numpy.eye, create a 2D array with ones on a diagonal."""
@@ -5560,7 +5564,7 @@ def _compute_argminmax(value_comparator, get_identity,
   axis, = axes
   indices = broadcasted_iota(
     index_dtype, np.shape(operand), axis,
-    _sharding=operand.sharding if config.sharding_in_types.value else None)
+    sharding=operand.sharding if config.sharding_in_types.value else None)
   res = reduce([operand, indices],
                [get_identity(operand.dtype), np.array(0, index_dtype)],
                _ArgMinMaxReducer(value_comparator),
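A hedged usage sketch of the renamed keyword: `lax.broadcasted_iota` now exposes `sharding=` (previously `_sharding=`), and with the sharding_in_types config on it may be given a bare PartitionSpec, which `canonicalize_sharding` resolves against the context's abstract mesh. The mesh and axis-name setup is assumed rather than shown in this diff; with the config off, passing a sharding still raises NotImplementedError.

# Assumes sharding_in_types is enabled and the current context's abstract mesh has an
# explicitly-typed 'x' axis; otherwise this raises, as the code above shows.
import jax.numpy as jnp
from jax import lax
from jax.sharding import PartitionSpec as P

def sharded_iota():
  # Row index array of shape (8, 4), sharded along 'x' on the leading dimension.
  return lax.broadcasted_iota(jnp.int32, (8, 4), 0, sharding=P('x', None))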

jax/_src/nn/functions.py

Lines changed: 1 addition & 1 deletion
@@ -671,7 +671,7 @@ def _one_hot(x: Array, num_classes: int, *,
   else:
     rhs_sharding = None
   rhs = lax.broadcasted_iota(x.dtype, rhs_shape, output_pos_axis,
-                             _sharding=rhs_sharding)
+                             sharding=rhs_sharding)
   return (lhs == rhs).astype(dtype)

 # TODO(slebedev): Change the type of `x` to `ArrayLike`.

jax/_src/numpy/lax_numpy.py

Lines changed: 5 additions & 3 deletions
@@ -54,7 +54,7 @@
 from jax._src.core import ShapedArray
 from jax._src.custom_derivatives import custom_jvp
 from jax._src.lax import lax as lax_internal
-from jax._src.lax.lax import ( PrecisionLike,_array_copy,
+from jax._src.lax.lax import (PrecisionLike,_array_copy,
                               _sort_le_comparator, _sort_lt_comparator)
 from jax._src.lib import xla_client as xc
 from jax._src.numpy import reductions
@@ -69,8 +69,9 @@
     NumpyComplexWarning, canonicalize_axis as _canonicalize_axis,
     ceil_of_ratio, partition_list, safe_zip, set_module, unzip2,
     tuple_replace)
-from jax.sharding import (Sharding, SingleDeviceSharding, NamedSharding,
-                          PartitionSpec as P)
+from jax.sharding import Sharding
+from jax._src.sharding_impls import (SingleDeviceSharding, NamedSharding,
+                                     PartitionSpec as P, canonicalize_sharding)
 from jax.tree_util import tree_flatten, tree_leaves, tree_map
 import numpy as np
 import opt_einsum
@@ -9873,6 +9874,7 @@ def _einsum(
   if out_type is not None and not config.sharding_in_types.value:
     raise NotImplementedError("out_type only works when sharding_in_types "
                               "config is True.")
+  out_type = canonicalize_sharding(out_type)
   if out_type is not None and not isinstance(out_type, NamedSharding):
     raise NotImplementedError(
         "`out_type` argument of `einsum` only supports NamedSharding instances."

jax/_src/pjit.py

Lines changed: 13 additions & 5 deletions
@@ -67,7 +67,8 @@
 from jax._src.sharding_impls import (
     NamedSharding, GSPMDSharding,
     SingleDeviceSharding, PmapSharding, AUTO, UNSPECIFIED, UnspecifiedValue,
-    ParsedPartitionSpec, get_single_pspec, prepare_axis_resources, parse_flatten_op_sharding)
+    ParsedPartitionSpec, get_single_pspec, prepare_axis_resources,
+    parse_flatten_op_sharding, canonicalize_sharding)
 from jax._src.layout import Layout, DeviceLocalLayout, AutoLayout
 from jax._src.state import discharge as state_discharge, RefEffect, AbstractRef
 from jax._src.traceback_util import api_boundary
@@ -2670,13 +2671,20 @@ def _sharding_constraint_batcher(

 def sharding_cast(xs, shardings):
   if isinstance(shardings, NamedSharding):
-    return tree_map(lambda x: sharding_cast_p.bind(
-        x, src_sharding=x.sharding, dst_sharding=shardings), xs)
+    return tree_map(
+        lambda x: sharding_cast_p.bind(
+            x, src_sharding=x.sharding, dst_sharding=canonicalize_sharding(
+                shardings, check_mesh_consistency=False)),
+        xs)

   x_flat, treedef = tree_flatten(xs)
   shardings_flat = flatten_axes("sharding_cast shardings", treedef, shardings)
-  out_flat = [sharding_cast_p.bind(x, src_sharding=x.sharding, dst_sharding=s)
-              for x, s in safe_zip(x_flat, shardings_flat)]
+  out_flat = [
+      sharding_cast_p.bind(
+          x, src_sharding=x.sharding,
+          dst_sharding=canonicalize_sharding(s, check_mesh_consistency=False))
+      for x, s in safe_zip(x_flat, shardings_flat)
+  ]
   return tree_unflatten(treedef, out_flat)

 sharding_cast_p = core.Primitive('sharding_cast')
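A hedged sketch of the `sharding_cast` change: every destination sharding (whether a single NamedSharding or per-leaf entries) is now passed through `canonicalize_sharding` with `check_mesh_consistency=False`, so PartitionSpec entries are accepted as well. `sharding_cast` is an internal helper in `jax._src.pjit`; its public exposure path is not part of this diff.

# Assumes sharding_in_types is enabled and the current context's abstract mesh has an
# 'x' axis; sharding_cast is internal API in this version of the code.
from jax._src.pjit import sharding_cast
from jax.sharding import PartitionSpec as P

def to_x_sharded(arr):
  # A bare PartitionSpec is resolved against the context mesh before binding.
  return sharding_cast(arr, P('x'))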

jax/_src/sharding_impls.py

Lines changed: 58 additions & 21 deletions
@@ -24,11 +24,13 @@
 from typing import Any, NamedTuple, Union, cast

 from jax._src import core
+from jax._src import config
 from jax._src import mesh as mesh_lib
-from jax._src import sharding
+from jax._src import sharding as jsharding
 from jax._src import sharding_specs
 from jax._src import tree_util
 from jax._src import util
+from jax._src import source_info_util
 from jax._src import xla_bridge
 from jax._src import mesh_utils
 from jax._src.lib import xla_client as xc
@@ -45,7 +47,7 @@
 Index = tuple[slice, ...]
 XLADeviceAssignment = tuple[Device, ...]
 # TODO(yashkatariya): Remove this after 3 months of deprecation.
-XLACompatibleSharding = sharding.Sharding
+XLACompatibleSharding = jsharding.Sharding

 @dataclasses.dataclass(frozen=True)
 class TransferToMemoryKind:
@@ -219,7 +221,7 @@ def named_sharding_to_xla_hlo_sharding(


 @use_cpp_class(xc.NamedSharding)
-class NamedSharding(sharding.Sharding):
+class NamedSharding(jsharding.Sharding):
   r"""A :class:`NamedSharding` expresses sharding using named axes.

   A :class:`NamedSharding` is a pair of a :class:`Mesh` of devices and
@@ -388,9 +390,6 @@ def with_spec(self, spec: PartitionSpec | Sequence[Any]) -> NamedSharding:
       spec = PartitionSpec(*spec)
     return NamedSharding(self.mesh, spec, memory_kind=self.memory_kind)

-  def with_mesh(self, new_mesh: mesh_lib.Mesh) -> NamedSharding:
-    return NamedSharding(new_mesh, self.spec, memory_kind=self.memory_kind)
-
   def _to_xla_hlo_sharding(self, num_dimensions: int) -> xc.HloSharding:
     return named_sharding_to_xla_hlo_sharding(self, num_dimensions)

@@ -415,7 +414,7 @@ def get_replicated_hlo_sharding():


 @use_cpp_class(xc.SingleDeviceSharding)
-class SingleDeviceSharding(sharding.Sharding):
+class SingleDeviceSharding(jsharding.Sharding):
   """A :class:`Sharding` that places its data on a single device.

   Args:
@@ -503,7 +502,7 @@ def pmap_sharding_devices_indices_map(


 @use_cpp_class(xc.PmapSharding)
-class PmapSharding(sharding.Sharding):
+class PmapSharding(jsharding.Sharding):
   """Describes a sharding used by :func:`jax.pmap`."""
   devices: np.ndarray
   sharding_spec: sharding_specs.ShardingSpec
@@ -713,7 +712,7 @@ def _positional_sharding_to_xla_hlo_sharding(
   return xc.HloSharding.from_proto(pbuf)


-class PositionalSharding(sharding.Sharding):
+class PositionalSharding(jsharding.Sharding):
   _devices: tuple[xc.Device, ...]
   _memory_kind: str | None
   _ids: np.ndarray  # dtype DeviceIdSet
@@ -820,7 +819,7 @@ def with_memory_kind(self, kind: str) -> PositionalSharding:
   def is_fully_replicated(self) -> bool:
     return self.shape == (1,) * self.ndim

-  # sharding.Sharding interface
+  # jsharding.Sharding interface

   @property
   def _device_assignment(self) -> XLADeviceAssignment:
@@ -868,7 +867,7 @@ def __eq__(self, other) -> bool:


 @use_cpp_class(xc.GSPMDSharding)
-class GSPMDSharding(sharding.Sharding):
+class GSPMDSharding(jsharding.Sharding):
   _devices: tuple[Device, ...]
   _hlo_sharding: xc.HloSharding
   _memory_kind: str | None
@@ -1122,7 +1121,7 @@ def prepare_axis_resources(axis_resources, arg_name,
   for entry in entries:
     if isinstance(entry, (UnspecifiedValue, AUTO)) or entry is None:
       new_entries.append(entry)
-    elif isinstance(entry, sharding.Sharding):
+    elif isinstance(entry, jsharding.Sharding):
       if isinstance(entry, PmapSharding):
         raise ValueError(f'One of {what} got sharding {entry} which is not '
                          'allowed.')
@@ -1138,7 +1137,7 @@ def prepare_axis_resources(axis_resources, arg_name,
 def _check_unique_resources(axis_resources, arg_name):
   for arg_axis_resources in axis_resources:
     if not arg_axis_resources: continue
-    if isinstance(arg_axis_resources, (UnspecifiedValue, AUTO, sharding.Sharding)):
+    if isinstance(arg_axis_resources, (UnspecifiedValue, AUTO, jsharding.Sharding)):
       continue
     constrained_dims = [d for d in arg_axis_resources if d is not None]
     resource_counts = collections.Counter(
@@ -1371,7 +1370,7 @@ class NonUniformShardingError(ValueError):


 def get_process_index_and_count(
-    tensor_sharding: sharding.Sharding, dim: int, ndims: int) -> tuple[int, int]:
+    tensor_sharding: jsharding.Sharding, dim: int, ndims: int) -> tuple[int, int]:
   """Get current process index and number of unique processes for given dimension.

   This function facilitates mapping of process-level data to individual
@@ -1486,7 +1485,7 @@ def get_process_index_and_count(


 def local_to_global_shape(
-    sharding: sharding.Sharding, local_shape: Shape) -> tuple[int | None, ...]:
+    sharding: jsharding.Sharding, local_shape: Shape) -> tuple[int | None, ...]:
   """Computes the global shape given the per process if possible.

   The returned shape will have the size of the global tensor in that dimension
@@ -1545,7 +1544,7 @@ def local_to_global_shape(


 def num_addressable_indices(
-    tensor_sharding: sharding.Sharding, dim: int, global_shape: Shape) -> int:
+    tensor_sharding: jsharding.Sharding, dim: int, global_shape: Shape) -> int:
   """Returns the number of indices for given dimension this host has access to.

   Each host can have multiple number of devices that are spanning
@@ -1579,7 +1578,7 @@
   """
   # TODO(sandler, yashkatariya): Consider making this function public.
   addressables = tensor_sharding.addressable_devices_indices_map(global_shape)
-  addressables = cast(Mapping[sharding.Device, Index], addressables)
+  addressables = cast(Mapping[jsharding.Device, Index], addressables)
   num_unique_slices = len({
       _slice_as_tuple(addressable[dim]) for addressable in addressables.values()
   })
@@ -1596,7 +1595,7 @@ def physical_hlo_sharding(aval, hlo_sharding: xc.HloSharding) -> xc.HloSharding:
   new_op_sharding.tile_assignment_dimensions = tad
   return xc.HloSharding.from_proto(new_op_sharding)

-def is_single_device_sharding(sharding: sharding.Sharding) -> bool:
+def is_single_device_sharding(sharding: jsharding.Sharding) -> bool:
   # Special case PmapSharding here because PmapSharding maps away an axis
   # and needs to be handled separately.test_pjit_single_device_sharding_add
   return sharding.num_devices == 1 and not isinstance(sharding, PmapSharding)
@@ -1625,7 +1624,7 @@ def make_key_array_phys_sharding(aval, sharding):


 def physical_sharding(
-    aval, sharding: sharding.Sharding) -> sharding.Sharding:
+    aval, sharding: jsharding.Sharding) -> jsharding.Sharding:
   return make_key_array_phys_sharding(aval, sharding)


@@ -1642,7 +1641,7 @@ def get_logical_gspmd_sharding(aval, phys_sharding):
   return GSPMDSharding(phys_sharding._device_assignment,
                        xc.HloSharding.from_proto(logical_op_sharding))

-def check_replicated_trailing_dims(sharding: sharding.Sharding, aval):
+def check_replicated_trailing_dims(sharding: jsharding.Sharding, aval):
   if isinstance(sharding, PmapSharding):
     return
   phys_aval = core.physical_aval(aval)
@@ -1655,7 +1654,7 @@ def check_replicated_trailing_dims(sharding: sharding.Sharding, aval):
         f" sharding: {sharding}, partitions: {partitions}, "
         f"num_trailing_dims: {num_trailing_dims}")

-def logical_sharding(aval, phys_sharding) -> sharding.Sharding:
+def logical_sharding(aval, phys_sharding) -> jsharding.Sharding:
   # The trailing dims should always be replicated.
   check_replicated_trailing_dims(phys_sharding, aval)

@@ -1695,6 +1694,44 @@ def _gspmd_to_named_sharding_via_mesh(
       mesh, parsed_pspec.get_partition_spec(), parsed_pspec,
       out_s.memory_kind)

+def flatten_spec(spec):
+  out = []
+  for s in spec:
+    if s is None:
+      continue
+    if isinstance(s, tuple):
+      out.extend(s)
+    else:
+      out.append(s)
+  return out
+
+def canonicalize_sharding(sharding: NamedSharding | PartitionSpec | None,
+                          check_mesh_consistency: bool = True
+                          ) -> NamedSharding | None:
+  if not config.sharding_in_types.value:
+    return sharding  # type: ignore
+  if sharding is None:
+    return sharding
+
+  if isinstance(sharding, PartitionSpec):
+    sharding = NamedSharding(mesh_lib.get_abstract_mesh(), sharding)  # type: ignore
+  else:
+    if (check_mesh_consistency and
+        sharding.mesh != mesh_lib.get_abstract_mesh()):
+      raise ValueError(
+          f'Context mesh {mesh_lib.get_abstract_mesh()} should match the mesh'
+          f' of sharding {sharding.mesh}. This error occurs at source: '
+          f' {source_info_util.summarize(source_info_util.current())}')
+
+  for s in flatten_spec(sharding.spec):
+    if sharding.mesh._name_to_type[s] in {
+        mesh_lib.AxisTypes.Auto, mesh_lib.AxisTypes.Collective}:
+      raise ValueError(
+          'PartitionSpec cannot contain axis names that are of type Auto or'
+          f' Collective. Got PartitionSpec: {sharding.spec} with axis name:'
+          f' {s} or type: {sharding.mesh._name_to_type[s]}')
+  return sharding
+

 def make_mesh(axis_shapes: Sequence[int], axis_names: Sequence[str],
               *, devices: Sequence[xc.Device] | None = None,
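A brief illustration of the two helpers added above: `flatten_spec` drops `None` entries and unpacks tuple entries of a PartitionSpec, and `canonicalize_sharding` uses it to verify that no spec axis is of type Auto or Collective after promoting a PartitionSpec to a NamedSharding over the context's abstract mesh.

from jax.sharding import PartitionSpec as P
from jax._src.sharding_impls import flatten_spec

# Tuple entries are unpacked and None (unsharded) dimensions are skipped.
assert flatten_spec(P(('x', 'y'), None, 'z')) == ['x', 'y', 'z']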
