Commit bc2b5d0

Merge pull request #214 from ROCm/ci-upstream-sync-98_1
CI: 01/28/25 upstream sync
2 parents: a366d41 + 47580ef

33 files changed, +1092 -418 lines

.bazelrc

Lines changed: 3 additions & 2 deletions
@@ -124,9 +124,10 @@ build:cuda --@local_config_cuda//:enable_cuda
 # Default hermetic CUDA and CUDNN versions.
 build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
 build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.1.1"
+build:cuda --@local_config_cuda//cuda:include_cuda_libs=true
 
-# This flag is needed to include CUDA libraries for bazel tests.
-test:cuda --@local_config_cuda//cuda:include_cuda_libs=true
+# This config is used for building targets with CUDA libraries from stubs.
+build:cuda_libraries_from_stubs --@local_config_cuda//cuda:include_cuda_libs=false
 
 # Force the linker to set RPATH, not RUNPATH. When resolving dynamic libraries,
 # ld.so prefers in order: RPATH, LD_LIBRARY_PATH, RUNPATH. JAX sets RPATH to

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -16,6 +16,12 @@ When releasing, please add the new-release-boilerplate to docs/pallas/CHANGELOG.
 
 ## Unreleased
 
+* New Features
+  * Added an experimental {func}`jax.experimental.custom_dce.custom_dce`
+    decorator to support customizing the behavior of opaque functions under
+    JAX-level dead code elimination (DCE). See {jax-issue}`#25956` for more
+    details.
+
 ## jax 0.5.0 (Jan 17, 2025)
 
 As of this release, JAX now uses

build/build.py

Lines changed: 1 addition & 0 deletions
@@ -532,6 +532,7 @@ async def main():
 
   if "cuda" in args.wheels:
     wheel_build_command_base.append("--config=cuda")
+    wheel_build_command_base.append("--config=cuda_libraries_from_stubs")
     if args.use_clang:
       wheel_build_command_base.append(
           f"--action_env=CLANG_CUDA_COMPILER_PATH=\"{clang_path}\""

ci/utilities/setup_build_environment.sh

Lines changed: 23 additions & 1 deletion
@@ -75,4 +75,26 @@ if [[ $(uname -s) =~ "MSYS_NT" ]]; then
   echo 'Converting MSYS Linux-like paths to Windows paths (for Bazel, Python, etc.)'
   # Convert all "JAXCI.*DIR" variables
   source <(python3 ./ci/utilities/convert_msys_paths_to_win_paths.py --convert $(env | grep "JAXCI.*DIR" | awk -F= '{print $1}'))
-fi
+fi
+
+function retry {
+  local cmd="$1"
+  local max_attempts=3
+  local attempt=1
+  local delay=10
+
+  while [[ $attempt -le $max_attempts ]] ; do
+    if eval "$cmd"; then
+      return 0
+    fi
+    echo "Attempt $attempt failed. Retrying in $delay seconds..."
+    sleep $delay # Prevent overloading
+
+    attempt=$((attempt + 1))
+  done
+  echo "$cmd failed after $max_attempts attempts."
+  exit 1
+}
+
+# Retry "bazel --version" 3 times to avoid flakiness when downloading bazel.
+retry "bazel --version"

docs/installation.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ The table below shows all supported platforms and installation options. Check if
 
 | | Linux, x86_64 | Linux, aarch64 | Mac, x86_64 | Mac, aarch64 | Windows, x86_64 | Windows WSL2, x86_64 |
 |------------------|---------------------------------------|---------------------------------|---------------------------------------|---------------------------------------|--------------------------|------------------------------------------|
-| CPU | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` |
+| CPU | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`jax≤0.4.38 only <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` | {ref}`yes <install-cpu>` |
 | NVIDIA GPU | {ref}`yes <install-nvidia-gpu>` | {ref}`yes <install-nvidia-gpu>` | no | n/a | no | {ref}`experimental <install-nvidia-gpu>` |
 | Google Cloud TPU | {ref}`yes <install-google-tpu>` | n/a | n/a | n/a | n/a | n/a |
 | AMD GPU | {ref}`experimental <install-amd-gpu>` | no | {ref}`experimental <install-mac-gpu>` | n/a | no | no |

docs/jax.experimental.custom_dce.rst

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+``jax.experimental.custom_dce`` module
+======================================
+
+.. automodule:: jax.experimental.custom_dce
+
+API
+---
+
+.. autosummary::
+   :toctree: _autosummary
+
+   custom_dce
+   custom_dce.def_dce

docs/jax.experimental.rst

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ Experimental Modules
 
     jax.experimental.checkify
     jax.experimental.compilation_cache
+    jax.experimental.custom_dce
    jax.experimental.custom_partitioning
    jax.experimental.jet
    jax.experimental.key_reuse

jax/_src/core.py

Lines changed: 23 additions & 10 deletions
@@ -1731,14 +1731,20 @@ def _invalid_shape_error(shape: Shape, context: str=""):
 
   return TypeError(msg)
 
+def _make_lengths_same(sharding, ndim):
+  if ndim > len(sharding.spec):
+    return sharding.with_spec(sharding.spec._normalized_spec(ndim))
+  if ndim < len(sharding.spec):
+    return sharding.with_spec(sharding.spec[:ndim])
+  assert False, "unreachable"
+
+
 # TODO(yashkatariya): Only works with User/Auto. Generalize it to work with
 # Collective too.
 def modify_spec_for_auto_manual(spec, mesh) -> P:
-  if all(s is None for s in spec):
-    return spec
   new_spec = [] # type: ignore
   for s in spec:
-    if s is None:
+    if not s:
       new_spec.append(s)
     else:
       temp_s = s[0] if isinstance(s, tuple) else s
@@ -1748,22 +1754,29 @@ def modify_spec_for_auto_manual(spec, mesh) -> P:
                       else s)
   return P(*new_spec)
 
-def _maybe_modify_sharding(sharding):
+def _maybe_modify_sharding(sharding, ndim):
   if sharding.mesh._are_all_axes_explicit:
-    return sharding
-  new_spec = modify_spec_for_auto_manual(sharding.spec, sharding.mesh)
-  return sharding.with_spec(new_spec)
+    out = sharding
+  elif all(s is None for s in sharding.spec):
+    out = sharding
+  else:
+    out = sharding.with_spec(modify_spec_for_auto_manual(
+        sharding.spec, sharding.mesh))
+  if (len(out.spec) != ndim and
+      (out.mesh._are_all_axes_auto or out.mesh._are_all_axes_manual)):
+    out = _make_lengths_same(out, ndim)
+  return out
 
 
 def get_sharding(sharding, ndim):
   from jax._src.sharding_impls import NamedSharding # type: ignore
 
   if sharding is not None:
-    if len(sharding.spec) != ndim:
+    out_s = _maybe_modify_sharding(sharding, ndim)
+    if len(out_s.spec) != ndim:
       raise ValueError(
           "Length of sharding.spec must be equal to aval's ndim. Got"
-          f" sharding.spec {sharding.spec} and aval.ndim {ndim}")
-    out_s = _maybe_modify_sharding(sharding)
+          f" sharding.spec {out_s.spec} and aval.ndim {ndim}")
   else:
     context_mesh = mesh_lib.get_abstract_mesh()
     if not context_mesh:
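
The helper added above, _make_lengths_same, pads or truncates a sharding's PartitionSpec so its length matches the aval's rank before the length check in get_sharding. Below is a minimal standalone sketch of the same idea using only the public jax.sharding.PartitionSpec; the function name and the pad-with-None behavior are illustrative assumptions, not the internal implementation:

from jax.sharding import PartitionSpec as P

def make_lengths_same(spec: P, ndim: int) -> P:
  # Pad trailing dimensions as unsharded (None) when the spec is too short,
  # or drop trailing entries when it is too long.
  entries = tuple(spec)
  if ndim > len(entries):
    return P(*entries, *([None] * (ndim - len(entries))))
  return P(*entries[:ndim])

assert make_lengths_same(P("x"), 3) == P("x", None, None)
assert make_lengths_same(P("x", "y", None), 2) == P("x", "y")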

jax/_src/custom_dce.py

Lines changed: 9 additions & 9 deletions
@@ -75,9 +75,9 @@ class custom_dce:
   ...       x * jnp.sin(y) if used_outs[1] else None,
   ...   )
 
-  In this example, ``used_outs`` is a ``tuple`` with two ``bool``s indicating
-  which outputs are required. The DCE rule only computes the required outputs,
-  replacing the unused outputs with ``None``.
+  In this example, ``used_outs`` is a ``tuple`` with two ``bool`` values,
+  indicating which outputs are required. The DCE rule only computes the
+  required outputs, replacing the unused outputs with ``None``.
 
   If the ``static_argnums`` argument is provided to ``custom_dce``, the
   indicated arguments are treated as static when the function is traced, and
@@ -108,12 +108,12 @@ def def_dce(
 
     Args:
      dce_rule: A function that takes (a) any arguments indicated as static
-        using ``static_argnums``, (b) a Pytree of ``bool``s (``used_outs``)
-        indicating which outputs should be computed, and (c) the rest of the
-        (non-static) arguments to the original function. The rule should return
-        a Pytree with with the same structure as the output of the original
-        function, but any unused outputs (as indicated by ``used_outs``) can be
-        replaced with ``None``.
+        using ``static_argnums``, (b) a Pytree of ``bool`` values
+        (``used_outs``) indicating which outputs should be computed, and (c)
+        the rest of the (non-static) arguments to the original function. The
+        rule should return a Pytree with the same structure as the output
+        of the original function, but any unused outputs (as indicated by
+        ``used_outs``) can be replaced with ``None``.
     """
    self.dce_rule = dce_rule
    return dce_rule
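
Putting the docstring fragments above together, here is a short usage sketch of the new decorator, extrapolated from the example lines visible in this diff (the function bodies and the pairing of outputs are illustrative):

import jax.numpy as jnp
from jax.experimental.custom_dce import custom_dce

@custom_dce
def f(x, y):
  # Two outputs; each can be dropped independently by DCE.
  return jnp.sin(x) * y, x * jnp.sin(y)

@f.def_dce
def f_dce_rule(used_outs, x, y):
  # used_outs is a tuple of bools, one per output of f. Outputs that are not
  # needed may be returned as None so their computation can be skipped.
  return (
      jnp.sin(x) * y if used_outs[0] else None,
      x * jnp.sin(y) if used_outs[1] else None,
  )

When JAX-level DCE decides that only some outputs of f are needed (for example, when a jitted caller ignores the first output), the rule is invoked with the corresponding used_outs, and the unused entries can simply be None.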

jax/_src/lax/lax.py

Lines changed: 22 additions & 5 deletions
@@ -2149,7 +2149,7 @@ def full_like(x: ArrayLike | DuckTypedArray,
   if dtypes.issubdtype(dtype, dtypes.extended):
     return dtype._rules.full(fill_shape, fill_value, dtype) # type: ignore[union-attr]
 
-  if (config.sharding_in_types.value and sharding is None and
+  if (config.sharding_in_types.value and sharding is None and shape is None and
       isinstance(x, Array)):
     sharding = x.aval.sharding
   else:
@@ -4577,6 +4577,9 @@ def _clamp_shape_rule(min, operand, max):
                     f"(), got max.shape={max.shape}, {operand.shape=}.")
   return operand.shape
 
+def _clamp_sharding_rule(min, operand, max):
+  return operand.sharding
+
 _clamp_dtype_rule = partial(naryop_dtype_rule, _input_dtype, [_any, _any, _any],
                             'clamp')
 
@@ -4617,7 +4620,8 @@ def _clamp_batch_rule(batched_args, batch_dims, **params):
     x = broadcast(x, min.shape)
   return clamp_p.bind(min, x, max), 0
 
-clamp_p = standard_primitive(_clamp_shape_rule, _clamp_dtype_rule, 'clamp')
+clamp_p = standard_primitive(_clamp_shape_rule, _clamp_dtype_rule, 'clamp',
+                             sharding_rule=_clamp_sharding_rule)
 ad.defjvp(clamp_p,
           lambda g, min, operand, max:
           select(bitwise_and(gt(min, operand), lt(min, max)),
@@ -5165,18 +5169,28 @@ def _rev_shape_rule(operand, *, dimensions):
     raise TypeError(msg.format(dimensions, operand.ndim))
   return operand.shape
 
+def _rev_sharding_rule(operand, *, dimensions):
+  # TODO(yashkatariya): Will lead to data movement. Maybe just error out and
+  # require the operand to be unsharded?
+  return operand.sharding
+
 def _rev_batch_rule(batched_args, batch_dims, *, dimensions):
   operand, = batched_args
   bdim, = batch_dims
   new_dimensions = [i + 1 if i >= bdim else i for i in dimensions]
   return rev(operand, new_dimensions), bdim
 
-rev_p = standard_primitive(_rev_shape_rule, _input_dtype, 'rev')
+rev_p = standard_primitive(_rev_shape_rule, _input_dtype, 'rev',
+                           sharding_rule=_rev_sharding_rule)
 ad.deflinear2(rev_p, lambda t, _, dimensions: [rev(t, dimensions)])
 batching.primitive_batchers[rev_p] = _rev_batch_rule
 
 def _rev_lower(ctx, x, *, dimensions):
-  return [hlo.reverse(x, mlir.dense_int_array(dimensions))]
+  aval_out, = ctx.avals_out
+  out = hlo.reverse(x, mlir.dense_int_array(dimensions))
+  if config.sharding_in_types.value:
+    return [mlir.lower_sharding_under_shit(ctx, out, aval_out)]
+  return [out]
 mlir.register_lowering(rev_p, _rev_lower)
 
 
@@ -5932,7 +5946,10 @@ def _sort_lower(ctx, *operands, dimension, is_stable, num_keys):
                     mlir.flatten_ir_values(operands),
                     dimension=mlir.i64_attr(dimension),
                     is_stable=ir.BoolAttr.get(is_stable))
-  scalar_avals = [aval.update(shape=()) for aval in ctx.avals_in]
+  scalar_s = (lambda a: a.sharding.with_spec(P())
+              if config.sharding_in_types.value else lambda _: None)
+  scalar_avals = [aval.update(shape=(), sharding=scalar_s(aval))
+                  for aval in ctx.avals_in]
   scalar_types = safe_map(mlir.aval_to_ir_type, scalar_avals)
   comparator = sort.comparator.blocks.append(
       *util.flatten(zip(scalar_types, scalar_types)))
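
For orientation, clamp_p and rev_p above back the public lax.clamp and lax.rev operations; a minimal usage sketch of their plain (unsharded) semantics, independent of the sharding-in-types changes in this diff:

import jax.numpy as jnp
from jax import lax

x = jnp.array([-2.0, 0.5, 3.0])

# lax.clamp(min, operand, max): elementwise clamp; the output keeps the
# operand's shape (and, with sharding-in-types enabled, its sharding).
clamped = lax.clamp(jnp.float32(0.0), x, jnp.float32(1.0))  # [0.0, 0.5, 1.0]

# lax.rev(operand, dimensions): reverse the listed dimensions.
reversed_x = lax.rev(x, dimensions=[0])                     # [3.0, 0.5, -2.0]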
