
Commit 8301c30

yashk2810 and mattjj authored and committed
Make changes to shard_map to prepare for setting varying_axes_in_types to True.
The main changes here are:

* Don't take the `_efficient_transpose_rewrite` transformation path anymore. In other words, `RewriteTrace` and all the rewriting machinery is dead.
* Wherever we were internally setting `check_rep=False` explicitly, such as in `_prim_applier`, `_match`, `_unmatch`, `_shard_map_partial_eval`, and `_shard_map_partial_eval_custom` (for remat), don't do that anymore. Instead, pass through the incoming `check_rep` value so that it can be True unless the user passed `check_rep=False`.
* Introduce an internal `_check_rep` context manager and set it wherever `extend_axis_env_nd` is used, so that if `check_rep=False` on `shard_map`, JAX sets `vma` in `ShapedArray` to an empty `frozenset`.
* Because of point (2), if `check_rep=True`, we can't internally set the `in_specs` and `out_specs` of shmap to all manual axes of the mesh on the 0th dim. They need to be whatever axes the argument was actually varying over.

Co-authored-by: Matthew Johnson <[email protected]>

PiperOrigin-RevId: 745276474
1 parent b4629c2 commit 8301c30
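
For orientation, here is a rough sketch (not part of the diff) of the gating pattern these changes introduce: the new internal `config._check_rep` state is set from the user's `check_rep` argument wherever `extend_axis_env_nd` is entered, and the vma-related helpers bail out early when it is False. The `illustrative_vma_rule` name below is made up; its guards mirror the ones added to `core.standard_vma_rule` and the collective abstract-eval rules.

from jax._src import config

def illustrative_vma_rule(*avals):
  # Mirrors the guards added in this commit: with check_rep=False the
  # varying-axes machinery is skipped and everything reports an empty vma.
  if not config.varying_axes_in_types.value:
    return frozenset()
  if not config._check_rep.value:
    return frozenset()
  # Otherwise a primitive's output varies over the union of its inputs' axes.
  return frozenset().union(*[a.vma for a in avals]) if avals else frozenset()

# Inside shard_map's tracing paths the state is scoped roughly like:
#   with core.extend_axis_env_nd(mesh.shape.items()), config._check_rep(check_rep):
#     ...trace the body...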

File tree: 12 files changed, +355 -195 lines


jax/_src/checkify.py

Lines changed: 5 additions & 3 deletions

@@ -966,13 +966,15 @@ def shard_map_error_check(
   new_vals_in = [*err_vals, *vals_in]
   in_avals = list(map(core.get_aval, new_vals_in))
   auto = kwargs.get('auto')
+  check_rep = kwargs.get('check_rep')
   for i, v in enumerate(in_avals):
     if not (sharder := core.shard_aval_handlers.get(type(v))):
       raise ValueError(f'Unsupported aval type: {type(v)}')
-    in_avals[i] = sharder(mesh, auto, new_in_names[i], v)
+    in_avals[i] = sharder(mesh, auto, check_rep, new_in_names[i], v)

   with (shard_map._extend_axis_env(mesh, auto),
-        mesh_lib.use_abstract_mesh(shard_map._as_manual_mesh(mesh, auto))):
+        mesh_lib.use_abstract_mesh(shard_map._as_manual_mesh(mesh, auto)),
+        config._check_rep(check_rep)):
     # jaxpr to checked_jaxpr
     checked_jaxpr, out_tree, _ = jaxpr_to_checkify_jaxpr(
         pe.close_jaxpr(jaxpr), enabled_errors, err_tree, *in_avals
@@ -985,7 +987,7 @@ def expand_errors_leading_dim(*xs):
     errs = [lax.expand_dims(e, [0]) for e in errs]
     return *errs, *outs

-  with core.extend_axis_env_nd(mesh.shape.items()):
+  with core.extend_axis_env_nd(mesh.shape.items()), config._check_rep(check_rep):
     jaxpr, _, consts, () = pe.trace_to_jaxpr_dynamic(
         lu.wrap_init(expand_errors_leading_dim,
                      debug_info=checked_jaxpr.jaxpr.debug_info),

jax/_src/config.py

Lines changed: 8 additions & 0 deletions

@@ -240,6 +240,7 @@ def trace_context():
       disable_jit.value,
       debug_key_reuse.value,
       jax_xla_profile_version.value,
+      _check_rep.value,
       # Technically this affects jaxpr->stablehlo lowering, not tracing.
       hlo_source_file_canonicalization_regex.value,
       pgle_profiling_runs.value,
@@ -1099,6 +1100,13 @@ def enum_flag(name, default, *args, **kwargs) -> Flag[str]:
           ' transpose rewrite machinery in shard_map'),
     include_in_jit_key=True)

+# TODO make it so people don't use this, this is internal...
+_check_rep = bool_state(
+    name='check_rep',
+    default=False,
+    help='internal implementation detail of shard_map, DO NOT USE',
+    include_in_jit_key=True)
+
 softmax_custom_jvp = bool_state(
     name='jax_softmax_custom_jvp',
     default=False,
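
Since `_check_rep` is defined with `bool_state(..., include_in_jit_key=True)` and added to `trace_context()`, it behaves like any other JAX config state: readable via `.value`, overridable for a scope by calling it as a context manager, and folded into the tracing/jit cache key. A minimal usage sketch (illustrative, not from the diff):

from jax._src import config

print(config._check_rep.value)      # False: the default declared above
with config._check_rep(True):
  # Scoped override, which is how the shard_map internals use it around
  # extend_axis_env_nd in the hunks elsewhere in this commit.
  print(config._check_rep.value)    # True
print(config._check_rep.value)      # False again once the scope exits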

jax/_src/core.py

Lines changed: 13 additions & 4 deletions

@@ -1895,12 +1895,15 @@ def str_short_aval(shape, dtype, mesh, spec, vma,
   return f'{dt_str}[{shapestr}]{vma}{mesh_axes}'

 def get_vma(vma, mesh):
+  assert isinstance(vma, frozenset)
+  return vma
+  if mesh.empty:
+    return vma
   for i in vma:
     if mesh._name_to_type[i] != AxisType.Manual:
       raise ValueError(
           "Axes mentioned in `vma` field of ShapedArray should"
           f" be of type `Manual`. Got axis: {i} of type {mesh._name_to_type[i]}")
-  assert isinstance(vma, frozenset)
   return vma

 class ShapedArray(UnshapedArray):
@@ -1994,6 +1997,8 @@ def primal_dtype_to_tangent_dtype(primal_dtype):
 def standard_insert_pbroadcast(*args):
   if not config.varying_axes_in_types.value:
     return args
+  if not config._check_rep.value:
+    return args
   if not args:
     return args
   # TODO(yashkatariya): Move pbroadcast out of shard_map
@@ -2005,6 +2010,10 @@ def standard_insert_pbroadcast(*args):
           if out_vma - src else arg for arg, src in zip(args, in_vma)]

 def standard_vma_rule(prim_name, *avals, **kwargs) -> frozenset[AxisName]:
+  if not config.varying_axes_in_types.value:
+    return frozenset()
+  if not config._check_rep.value:
+    return frozenset()
   avals = tuple(a for a in avals if a is not abstract_token)
   if not avals:
     return frozenset()
@@ -2567,9 +2576,9 @@ def unmapped_aval(size: AxisSize, axis: int | None,

 def _map_shaped_array(
     size: int, axis: int | None, aval: ShapedArray) -> ShapedArray:
-  assert axis is None or aval.shape[axis] == size
-  # TODO: Extend the named shape
-  if axis is None: return aval
+  # assert axis is None or aval.shape[axis] == size
+  if axis is None:
+    return aval
   sharding = aval.sharding.with_spec(tuple_delete(aval.sharding.spec, axis))
   return ShapedArray(tuple_delete(aval.shape, axis), aval.dtype,
                      weak_type=aval.weak_type, sharding=sharding, vma=aval.vma)
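
As a worked illustration of what `standard_insert_pbroadcast` does once the gate is on (simplified, not the real helper): it takes the union of the operands' varying manual axes and pbroadcasts each operand that is missing some of those axes, so the primitive's inputs end up consistently typed. The function below is a standalone sketch over plain frozensets.

def insert_pbroadcast_sketch(args, vmas):
  # args: operand values; vmas: the frozenset of manual axes each one varies over.
  out_vma = frozenset().union(*vmas) if vmas else frozenset()
  fixed = []
  for arg, src in zip(args, vmas):
    missing = out_vma - src
    # The real code calls jax.experimental.shard_map.pbroadcast(arg, tuple(missing)).
    fixed.append(('pbroadcast', arg, tuple(sorted(missing))) if missing else arg)
  return fixed

# e.g. insert_pbroadcast_sketch(['a', 'b'], [frozenset({'x'}), frozenset()])
# -> ['a', ('pbroadcast', 'b', ('x',))]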

jax/_src/interpreters/batching.py

Lines changed: 25 additions & 6 deletions

@@ -408,6 +408,10 @@ def __init__(self, trace, val, batch_dim: NotMapped | int | RaggedAxis,
   @property
   def aval(self):
     aval = core.get_aval(self.val)
+    if self._trace.axis_data.spmd_name is not None:
+      if config._check_rep.value and config.varying_axes_in_types.value:
+        aval = aval.update(
+            vma=aval.vma - frozenset(self._trace.axis_data.spmd_name))
     if self.batch_dim is not_mapped:
       return aval
     elif type(self.batch_dim) is int:
@@ -771,10 +775,17 @@ def _batch_jaxpr2(
       handle_ragged(closed_jaxpr.in_avals, dim, aval)
       if isinstance(dim, RaggedAxis) else (dim, aval)
       for dim, aval in zip(in_axes, closed_jaxpr.in_avals)])
-  avals_in2 = [core.unmapped_aval(axis_data.size, b, aval,
-                                  axis_data.explicit_mesh_axis)
-               if b is not not_mapped else aval
-               for aval, b in unsafe_zip(avals_in, in_axes2)]
+  avals_in2 = []
+  for aval, b in unsafe_zip(avals_in, in_axes2):
+    if b is not_mapped:
+      avals_in2.append(aval)
+    else:
+      aval = core.unmapped_aval(
+          axis_data.size, b, aval, axis_data.explicit_mesh_axis)
+      if axis_data.spmd_name is not None:
+        if config._check_rep.value and config.varying_axes_in_types.value:
+          aval = aval.update(vma=aval.vma | frozenset(axis_data.spmd_name))  # type: ignore
+      avals_in2.append(aval)
   jaxpr_out, _, consts, () = pe.trace_to_jaxpr_dynamic(f, avals_in2)
   return core.ClosedJaxpr(jaxpr_out, consts), out_axes()

@@ -1111,8 +1122,16 @@ def broadcast(x, sz, axis, mesh_axis=None):
   # TODO(dougalm, yashkatariya): Delete this context manager once we figure
   # out how to ensure jaxpr arguments always have the context mesh.
   with mesh_lib.use_abstract_mesh(sharding.mesh):
-    return jax.lax.broadcast_in_dim(x, shape, broadcast_dims,
-                                    out_sharding=sharding)
+    x = jax.lax.broadcast_in_dim(x, shape, broadcast_dims, out_sharding=sharding)
+    if config._check_rep.value and config.varying_axes_in_types.value:
+      # TODO(yashkatariya,parkers): don't do this, fix during fixit week 2026
+      spmd_names = core.get_axis_env().spmd_axis_names
+      if len(spmd_names) > 1:
+        raise NotImplementedError
+      if spmd_names:
+        from jax.experimental.shard_map import pbroadcast
+        x = pbroadcast(x, tuple(spmd_names))
+    return x

 def matchaxis(axis_name, sz, mesh_axis, src, dst, x, sum_match=False):
   if dst == jumble_axis:

jax/_src/interpreters/partial_eval.py

Lines changed: 18 additions & 0 deletions

@@ -501,6 +501,24 @@ def partial_eval_wrapper_nounits(
   store.store((*maybe_fwds, out_knowns, out_avals, jaxpr, env))
   return (*out_consts, *res)

+@lu.transformation_with_aux2
+def partial_eval_wrapper_nounits2(
+    f: Callable,
+    store: lu.Store,
+    in_knowns: Sequence[bool],
+    in_avals: Sequence[AbstractValue],
+    *in_consts: Any):
+  in_avals_, in_consts_ = iter(in_avals), iter(in_consts)
+  in_pvals = [PartialVal.known(next(in_consts_)) if known else
+              PartialVal.unknown(next(in_avals_)) for known in in_knowns]
+  sentinel = object()
+  assert next(in_avals_, sentinel) is next(in_consts_, sentinel) is sentinel
+  jaxpr, (*maybe_fwds, out_pvals, res, env) = f(in_pvals)
+  out_knowns, _, out_consts = partition_pvals(out_pvals)
+  res_avals = [core.typeof(r) for r in res]
+  store.store((*maybe_fwds, out_knowns, res_avals, jaxpr, env))
+  return (*out_consts, *res)
+
 custom_partial_eval_rules: dict[Primitive, Callable] = {}
 call_partial_eval_rules: dict[Primitive, Callable] = {}
 call_param_updaters: dict[Primitive, Callable] = {}

jax/_src/lax/control_flow/loops.py

Lines changed: 4 additions & 8 deletions

@@ -550,9 +550,10 @@ def _split_leading(sz, x):
 def _concat(a, b): return lax.concatenate([a, b], 0)

 def _empty_array(prefix, length_spec, aval):
+  from jax.experimental.shard_map import pbroadcast
   sharding = aval.sharding.with_spec((*length_spec, *aval.sharding.spec))
-  return lax.broadcast(lax.empty(aval.dtype), (*prefix, *aval.shape),
-                       out_sharding=sharding)
+  empty = pbroadcast(lax.empty(aval.dtype), tuple(aval.vma))
+  return lax.broadcast(empty, (*prefix, *aval.shape), out_sharding=sharding)

 eval_jaxpr_p = core.Primitive('eval_jaxpr')
 eval_jaxpr_p.multiple_results = True
@@ -2248,12 +2249,7 @@ def _batch_and_remainder(x, batch_size: int):
   return scan_tree, remainder_tree

 @api_boundary
-def map(
-    f,
-    xs,
-    *,
-    batch_size: int | None = None,
-):
+def map(f, xs, *, batch_size: int | None = None):
   """Map a function over leading array axes.

   Like Python's builtin map, except inputs and outputs are in the form of

jax/_src/lax/parallel.py

Lines changed: 15 additions & 2 deletions

@@ -117,6 +117,8 @@ def psum(x, axis_name, *, axis_index_groups=None):
   """
   if not isinstance(axis_name, (tuple, list)):
     axis_name = (axis_name,)
+  if not axis_name:
+    return x
   if any(isinstance(axis, int) for axis in axis_name) and axis_index_groups is not None:
     raise ValueError("axis_index_groups only supported for sums over just named axes")
   _validate_reduce_axis_index_groups(axis_index_groups)
@@ -141,7 +143,7 @@ def pos_reduce(x):
     size = math.prod([core.get_axis_env().axis_size(name) for name in named_axes])
     out_flat = tuple(lax._const(leaf, size) * pos_reduce(leaf) for leaf in leaves)
   else:
-    if config.varying_axes_in_types.value:
+    if config.varying_axes_in_types.value and config._check_rep.value:
       out_flat = bind_psum2_p(leaves, axes=tuple(axis_name),
                               axis_index_groups=axis_index_groups)
     else:
@@ -828,6 +830,9 @@ def _psum2_abstract_eval(name, *args, axes, axis_index_groups):
   if not config.varying_axes_in_types.value:
     return psum_p.abstract_eval(
         *args, axes=axes, axis_index_groups=axis_index_groups)
+  if not config._check_rep.value:
+    return psum_p.abstract_eval(
+        *args, axes=axes, axis_index_groups=axis_index_groups)

   assert isinstance(axes, tuple)
   _check_axis_names(axes)
@@ -863,6 +868,9 @@ def _pmin_pmax_abstract_eval(name, *args, axes, axis_index_groups):
   if not config.varying_axes_in_types.value:
     return _allreduce_effectful_abstract_eval(
         *args, axes=axes, axis_index_groups=axis_index_groups)
+  if not config._check_rep.value:
+    return _allreduce_effectful_abstract_eval(
+        *args, axes=axes, axis_index_groups=axis_index_groups)
   return _psum2_abstract_eval(name, *args, axes=axes,
                               axis_index_groups=axis_index_groups)

@@ -1411,6 +1419,8 @@ def _ragged_all_to_all_batched_collective(axis_data, vals_in, dims_in,
 def insert_collective_pbroadcast(axis_name, x):
   if not config.varying_axes_in_types.value:
     return x
+  if not config._check_rep.value:
+    return x

   from jax.experimental import shard_map
   axis_name = (axis_name,) if not isinstance(axis_name, tuple) else axis_name
@@ -1546,6 +1556,8 @@ def _all_gather_lowering(ctx, x, *, all_gather_dimension, axis_name,
 def collective_vma_rule(prim_name, axis_name, x_aval):
   if not config.varying_axes_in_types.value:
     return frozenset()
+  if not config._check_rep.value:
+    return frozenset()
   axis_name = (axis_name,) if not isinstance(axis_name, tuple) else axis_name
   if any(a not in x_aval.vma for a in axis_name):
     raise ValueError(
@@ -1912,7 +1924,8 @@ def _axis_index_effectful_abstract_eval(*, axis_name):
   mesh = get_abstract_mesh()
   sharding = NamedSharding(mesh, P())
   vma = ((frozenset(axis_name) if mesh._any_axis_manual else frozenset())
-         if config.varying_axes_in_types.value else frozenset())
+         if config.varying_axes_in_types.value and config._check_rep.value
+         else frozenset())
   return ShapedArray((), np.int32, sharding=sharding, vma=vma), effect

 def _axis_index_batcher(axis_data, vals_in, dims_in, *, axis_name):
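
One behavioral detail worth calling out from the first psum hunk: an empty tuple of axis names now returns the input unchanged before any validation or primitive bind. A tiny illustration of that path, as I read the change:

import jax
import jax.numpy as jnp

x = jnp.arange(4.)
y = jax.lax.psum(x, axis_name=())   # hits the new `if not axis_name: return x` early return
assert (y == x).all()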

jax/_src/lax/slicing.py

Lines changed: 4 additions & 1 deletion

@@ -173,6 +173,8 @@ def dynamic_slice(
   else:
     dynamic_sizes = []
     static_sizes = core.canonicalize_shape(slice_sizes)  # type: ignore
+  operand, *start_indices = core.standard_insert_pbroadcast(
+      operand, *start_indices)
   return dynamic_slice_p.bind(operand, *start_indices, *dynamic_sizes,
                               slice_sizes=tuple(static_sizes))

@@ -234,7 +236,8 @@ def dynamic_update_slice(
   """
   start_indices = _dynamic_slice_indices(
       operand, start_indices, allow_negative_indices)
-  operand, update = core.standard_insert_pbroadcast(operand, update)
+  operand, update, *start_indices = core.standard_insert_pbroadcast(
+      operand, update, *start_indices)
   return dynamic_update_slice_p.bind(operand, update, *start_indices)

jax/_src/state/types.py

Lines changed: 2 additions & 2 deletions

@@ -456,15 +456,15 @@ def shaped_array_ref(
     shape: tuple[int, ...], dtype, weak_type: bool = False) -> AbstractRef:
   return AbstractRef(core.ShapedArray(shape, dtype, weak_type=weak_type))

-def _shard_ref(mesh, auto, names, ref_aval: AbstractRef):
+def _shard_ref(mesh, auto, check_rep, names, ref_aval: AbstractRef):
   del mesh
   if names:
     # Can't actually shard a ref, can only close over it.
     raise NotImplementedError("Can't shard a Ref.")
   return ref_aval
 core.shard_aval_handlers[AbstractRef] = _shard_ref

-def _unshard_ref(mesh, names, ref_aval: AbstractRef):
+def _unshard_ref(mesh, check_rep, names, ref_aval: AbstractRef):
   del mesh
   if names:
     # Can't actually shard a ref, can only close over it.
