ROCm
diff --git a/‎.github/workflows/requirements_lock_3_13_ft.patch‎
Lines changed: 6 additions & 7 deletions b/‎.github/workflows/requirements_lock_3_13_ft.patch‎
Lines changed: 6 additions & 7 deletions
diff --git a/‎.github/workflows/tsan.yaml‎
Lines changed: 16 additions & 3 deletions b/‎.github/workflows/tsan.yaml‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎jax/_src/array.py‎
Lines changed: 2 additions & 2 deletions b/‎jax/_src/array.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎jax/_src/blocked_sampler.py‎
Lines changed: 7 additions & 5 deletions b/‎jax/_src/blocked_sampler.py‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎jax/_src/custom_partitioning.py‎
Lines changed: 1 addition & 1 deletion b/‎jax/_src/custom_partitioning.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎jax/_src/interpreters/partial_eval.py‎
Lines changed: 3 additions & 2 deletions b/‎jax/_src/interpreters/partial_eval.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎jax/_src/lax/lax.py‎
Lines changed: 143 additions & 23 deletions b/‎jax/_src/lax/lax.py‎
Lines changed: 143 additions & 23 deletions
@@ -1,21 +1,20 @@
 diff --git a/build/requirements_lock_3_13_ft.txt b/build/requirements_lock_3_13_ft.txt
-index dfefaf042..2700e140e 100644
+index e7a2968e9..d37e11ee3 100644
 --- a/build/requirements_lock_3_13_ft.txt
 +++ b/build/requirements_lock_3_13_ft.txt
-@@ -4,6 +4,12 @@
+@@ -4,6 +4,11 @@
  #
  #    pip-compile --allow-unsafe --generate-hashes --output-file=build/requirements_lock_3_13_ft.txt build/requirements.in
  #
 +
 +--pre
 +--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple
 +numpy
-+
 +
  absl-py==2.1.0 \
      --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \
      --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff
-@@ -328,68 +334,6 @@ mpmath==1.3.0 \
+@@ -328,68 +333,6 @@ mpmath==1.3.0 \
      --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
      --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
      # via -r build/test-requirements.txt
@@ -81,6 +80,6 @@ index dfefaf042..2700e140e 100644
 -    #   matplotlib
 -    #   ml-dtypes
 -    #   scipy
- opt-einsum==3.4.0 \
-     --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \
-     --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac
+ nvidia-cublas-cu12==12.8.3.14 ; sys_platform == "linux" \
+     --hash=sha256:3f0e05e7293598cf61933258b73e66a160c27d59c4422670bf0b79348c04be44 \
+     --hash=sha256:93a4e0e386cc7f6e56c822531396de8170ed17068a1e18f987574895044cd8c3 \
@@ -173,12 +173,18 @@ jobs:
             --bazel_options=--copt=-g \
             --clang_path=/usr/bin/clang-18
 
-          # Update the patch to use TSAN instrumented numpy
+          # Patch build/requirements_lock_3_13_ft.txt to use TSAN instrumented NumPy
           sed -i "s|+--extra-index-url.*|+--extra-index-url file://${GITHUB_WORKSPACE}/wheelhouse/|" .github/workflows/requirements_lock_3_13_ft.patch
           cat .github/workflows/requirements_lock_3_13_ft.patch
+          git apply .github/workflows/requirements_lock_3_13_ft.patch || exit 1
 
-          # Apply a patch to numpy in requirements lock 3.13 ft to use the nightly version
-          git apply .github/workflows/requirements_lock_3_13_ft.patch
+          # Display the content for debugging in logs
+          cat build/requirements_lock_3_13_ft.txt | head -15
+          # Check the patch: injected lines present near the top, no pinned numpy== left
+          cat build/requirements_lock_3_13_ft.txt | head -15 | grep -E "(--pre|.*${GITHUB_WORKSPACE}/wheelhouse/|numpy)"
+          if [ "$?" == "1" ]; then echo "Could not find the patch in the requirements_lock_3_13_ft.txt"; exit 1; fi
+          cat build/requirements_lock_3_13_ft.txt | grep -E "(numpy==)"
+          if [ "$?" == "0" ]; then echo "Found original numpy dependency in the requirements_lock_3_13_ft.txt"; exit 1; fi
 
           echo "JAX_NUM_GENERATED_CASES=$JAX_NUM_GENERATED_CASES"
           echo "JAX_ENABLE_X64=$JAX_ENABLE_X64"
@@ -188,6 +194,13 @@ jobs:
           bazel_exec=($(ls bazel-*))
           ln -s ${bazel_exec} bazel
 
+          # Check python version
+          ./bazel run --@rules_python//python/config_settings:py_freethreaded="yes" @python//:python3 -- -VV
+
+          # Check numpy version
+          ./bazel cquery @pypi_numpy//:* | grep whl
+
+          # Build JAX and run tests
           ./bazel test \
               --test_env=JAX_NUM_GENERATED_CASES=$JAX_NUM_GENERATED_CASES \
               --test_env=JAX_ENABLE_X64=$JAX_ENABLE_X64 \
 
@@ -33,7 +33,6 @@
 from jax._src import profiler
 from jax._src import util
 from jax._src import xla_bridge
-from jax._src.mesh import use_concrete_mesh
 from jax._src.interpreters import mlir
 from jax._src.interpreters import pxla
 from jax._src.interpreters import xla
@@ -43,7 +42,8 @@
 from jax._src.sharding import Sharding
 from jax._src.sharding_impls import (
     PmapSharding, SingleDeviceSharding,
-    device_replica_id_map, hashed_index, num_addressable_indices, local_to_global_shape)  # pyformat: disable
+    device_replica_id_map, hashed_index, num_addressable_indices,
+    local_to_global_shape, use_concrete_mesh)  # pyformat: disable
 from jax._src.typing import ArrayLike, DLDeviceType, DTypeLike
 from jax._src.util import safe_zip, unzip3, use_cpp_class, use_cpp_method, cache
 import numpy as np
 
@@ -29,16 +29,16 @@ def __call__(self, key: ArrayLike, *args, shape: Shape,
 
 
 def _compute_tile_index(block_index: Sequence[int],
-                        total_size_in_blocks: Shape,
                         block_size_in_tiles: Shape,
+                        total_size_in_tiles: Shape,
                         tile_index_in_block: Sequence[int]) -> int:
   ndims = len(block_index)
   dim_size = 1
   total_idx = 0
   for i in range(ndims-1, -1, -1):
     dim_idx = tile_index_in_block[i] + block_index[i] * block_size_in_tiles[i]
     total_idx += dim_idx * dim_size
-    dim_size *= total_size_in_blocks[i] * block_size_in_tiles[i]
+    dim_size *= total_size_in_tiles[i]
   return total_idx
 
 
@@ -103,15 +103,17 @@ def blocked_fold_in(
       _shape // _element for _shape, _element in zip(block_size, tile_size)
   )
 
-  total_size_in_blocks = tuple(
-      _shape // _element for _shape, _element in zip(total_size, block_size)
+  # Round up to make sure every tile is numbered.
+  total_size_in_tiles = tuple(
+      (_shape + _element - 1) // _element
+        for _shape, _element in zip(total_size, tile_size)
   )
 
   def _keygen_loop(axis, prefix):
     if axis == len(block_size_in_tiles):
       subtile_key = jax.random.fold_in(
           global_key, _compute_tile_index(
-              block_index, total_size_in_blocks, block_size_in_tiles, prefix))
+              block_index, block_size_in_tiles, total_size_in_tiles, prefix))
       return subtile_key
     else:
       keys = []
 
@@ -179,7 +179,7 @@ def _custom_partitioning_partition(arg_shapes, arg_shardings, result_shape,
       for sharding, s in zip(result_shardings, result_shapes)
   ]
   closed_jaxpr = jax.make_jaxpr(lower_fn, axis_env=list(mesh.shape.items()))(
-      *tiled_args
+      *info.in_tree.unflatten(tiled_args)
   )
   if ([(o.shape, o.dtype) for o in closed_jaxpr.out_avals] !=
       [(t.shape, t.dtype) for t in tiled_results]):
 
@@ -41,7 +41,7 @@
                            JaxprEqn, Primitive, ShapedArray, DShapedArray,
                            mapped_aval, unmapped_aval, DBIdx, InDBIdx, OutDBIdx,
                            InputType, OutputType, get_referent, JaxprEqnContext)
-from jax._src.state.types import AbstractRef
+from jax._src.state.types import AbstractRef, ReadEffect
 from jax._src.tree_util import (PyTreeDef, treedef_tuple,
                                 tree_flatten, tree_structure)
 from jax._src.util import (unzip2, safe_zip, safe_map, toposort, split_list,
@@ -1423,7 +1423,8 @@ def dce_jaxpr_consts(jaxpr: Jaxpr, used_outputs: Sequence[bool],
 
 
 def has_effects(eqn: JaxprEqn) -> bool:
-  effs = {e for e in eqn.effects if not isinstance(e, core.NamedAxisEffect)}
+  effs = {e for e in eqn.effects if not isinstance(e, core.NamedAxisEffect)
+          and not isinstance(e, ReadEffect)}
   return bool(effs)
 
 
 
@@ -615,8 +615,23 @@ def tanh(x: ArrayLike) -> Array:
   """
   return tanh_p.bind(x)
 
+@export
 def logistic(x: ArrayLike) -> Array:
-  r"""Elementwise logistic (sigmoid) function: :math:`\frac{1}{1 + e^{-x}}`."""
+  r"""Elementwise logistic (sigmoid) function: :math:`\frac{1}{1 + e^{-x}}`.
+
+  There is no HLO logistic/sigmoid primitive, so this lowers to a sequence
+  of HLO arithmetic operations.
+
+  Args:
+    x: input array. Must have floating point or complex dtype.
+
+  Returns:
+    Array of the same shape and dtype as ``x`` containing the element-wise
+    logistic/sigmoid function.
+
+  See also:
+    - :func:`jax.nn.sigmoid`: an alternative API for this functionality.
+  """
   return logistic_p.bind(x)
 
 @export
@@ -1018,12 +1033,45 @@ def bitwise_xor(x: ArrayLike, y: ArrayLike) -> Array:
   """
   return xor_p.bind(x, y)
 
+@export
 def population_count(x: ArrayLike) -> Array:
-  r"""Elementwise popcount, count the number of set bits in each element."""
+  r"""Elementwise popcount, count the number of set bits in each element.
+
+  This function lowers directly to the `stablehlo.popcnt`_ operation.
+
+  Args:
+    x: Input array. Must have integer dtype.
+
+  Returns:
+    An array of the same shape and dtype as ``x``, containing the number of
+    set bits in the input.
+
+  See also:
+    - :func:`jax.lax.clz`: Elementwise count leading zeros.
+    - :func:`jax.numpy.bitwise_count`: More flexible NumPy-style API for bit counts.
+
+  .. _stablehlo.popcnt: https://openxla.org/stablehlo/spec#popcnt
+  """
   return population_count_p.bind(x)
 
+@export
 def clz(x: ArrayLike) -> Array:
-  r"""Elementwise count-leading-zeros."""
+  r"""Elementwise count-leading-zeros.
+
+  This function lowers directly to the `stablehlo.count_leading_zeros`_ operation.
+
+  Args:
+    x: Input array. Must have integer dtype.
+
+  Returns:
+    An array of the same shape and dtype as ``x``, containing the number of
+    leading zero bits in each element of the input.
+
+  See also:
+    - :func:`jax.lax.population_count`: Count the number of set bits in each element.
+
+  .. _stablehlo.count_leading_zeros: https://openxla.org/stablehlo/spec#count_leading_zeros
+  """
   return clz_p.bind(x)
 
 @export
@@ -1124,31 +1172,81 @@ def div(x: ArrayLike, y: ArrayLike) -> Array:
   """
   return div_p.bind(x, y)
 
+@export
 def rem(x: ArrayLike, y: ArrayLike) -> Array:
   r"""Elementwise remainder: :math:`x \bmod y`.
 
-  The sign of the result is taken from the dividend,
-  and the absolute value of the result is always
-  less than the divisor's absolute value.
+  This function lowers directly to the `stablehlo.remainder`_ operation.
+  The sign of the result is taken from the dividend, and the absolute value
+  of the result is always less than the divisor's absolute value.
 
-  Integer division overflow
-  (remainder by zero or remainder of INT_SMIN with -1)
+  Integer division overflow (remainder by zero or remainder of INT_SMIN with -1)
   produces an implementation defined value.
+
+  Args:
+    x, y: Input arrays. Must have matching int or float dtypes. If neither
+      is a scalar, ``x`` and ``y`` must have the same number of dimensions
+      and be broadcast compatible.
+
+  Returns:
+    An array of the same dtype as ``x`` and ``y`` containing the remainder.
+
+  See also:
+    - :func:`jax.numpy.remainder`: NumPy-style remainder with different
+      sign semantics.
+
+  .. _stablehlo.remainder: https://openxla.org/stablehlo/spec#remainder
   """
   return rem_p.bind(x, y)
 
+@export
 def max(x: ArrayLike, y: ArrayLike) -> Array:
-  r"""Elementwise maximum: :math:`\mathrm{max}(x, y)`
+  r"""Elementwise maximum: :math:`\mathrm{max}(x, y)`.
+
+  This function lowers directly to the `stablehlo.maximum`_ operation for
+  non-complex inputs. For complex numbers, this uses a lexicographic
+  comparison on the `(real, imaginary)` pairs.
+
+  Args:
+    x, y: Input arrays. Must have matching dtypes. If neither is a scalar,
+      ``x`` and ``y`` must have the same rank and be broadcast compatible.
 
-  For complex numbers, uses a lexicographic comparison on the
-  `(real, imaginary)` pairs."""
+  Returns:
+    An array of the same dtype as ``x`` and ``y`` containing the elementwise
+    maximum.
+
+  See also:
+    - :func:`jax.numpy.maximum`: more flexible NumPy-style maximum.
+    - :func:`jax.lax.reduce_max`: maximum along an axis of an array.
+    - :func:`jax.lax.min`: elementwise minimum.
+
+  .. _stablehlo.maximum: https://openxla.org/stablehlo/spec#maximum
+  """
   return max_p.bind(x, y)
 
+@export
 def min(x: ArrayLike, y: ArrayLike) -> Array:
-  r"""Elementwise minimum:  :math:`\mathrm{min}(x, y)`
+  r"""Elementwise minimum: :math:`\mathrm{min}(x, y)`.
+
+  This function lowers directly to the `stablehlo.minimum`_ operation for
+  non-complex inputs. For complex numbers, this uses a lexicographic
+  comparison on the `(real, imaginary)` pairs.
+
+  Args:
+    x, y: Input arrays. Must have matching dtypes. If neither is a scalar,
+      ``x`` and ``y`` must have the same rank and be broadcast compatible.
 
-  For complex numbers, uses a lexicographic comparison on the
-  `(real, imaginary)` pairs."""
+  Returns:
+    An array of the same dtype as ``x`` and ``y`` containing the elementwise
+    minimum.
+
+  See also:
+    - :func:`jax.numpy.minimum`: more flexible NumPy-style minimum.
+    - :func:`jax.lax.reduce_min`: minimum along an axis of an array.
+    - :func:`jax.lax.max`: elementwise maximum.
+
+  .. _stablehlo.minimum: https://openxla.org/stablehlo/spec#minimum
+  """
   return min_p.bind(x, y)
 
 @export
@@ -1408,21 +1506,38 @@ def lt(x: ArrayLike, y: ArrayLike) -> Array:
   """
   return lt_p.bind(x, y)
 
+@export
 def convert_element_type(operand: ArrayLike,
                          new_dtype: DTypeLike | dtypes.ExtendedDType) -> Array:
   """Elementwise cast.
 
-  Wraps XLA's `ConvertElementType
-  <https://www.tensorflow.org/xla/operation_semantics#convertelementtype>`_
-  operator, which performs an elementwise conversion from one type to another.
-  Similar to a C++ `static_cast`.
+  This function lowers directly to the `stablehlo.convert`_ operation, which
+  performs an elementwise conversion from one type to another, similar to a
+  C++ ``static_cast``.
 
   Args:
     operand: an array or scalar value to be cast.
-    new_dtype: a NumPy dtype representing the target type.
+    new_dtype: a dtype-like object (e.g. a :class:`numpy.dtype`, a scalar type,
+      or a valid dtype name) representing the target dtype.
 
   Returns:
-    An array with the same shape as `operand`, cast elementwise to `new_dtype`.
+    An array with the same shape as ``operand``, cast elementwise to ``new_dtype``.
+
+  .. note::
+
+     If ``new_dtype`` is a 64-bit type and `x64 mode`_ is not enabled,
+     the appropriate 32-bit type will be used in its place.
+
+     If the input is a JAX array and the input dtype and output dtype match, then
+     the input array will be returned unmodified.
+
+  See also:
+    - :func:`jax.numpy.astype`: NumPy-style dtype casting API.
+    - :meth:`jax.Array.astype`: dtype casting as an array method.
+    - :func:`jax.lax.bitcast_convert_type`: cast bits directly to a new dtype.
+
+  .. _stablehlo.convert: https://openxla.org/stablehlo/spec#convert
+  .. _x64 mode: https://docs.jax.dev/en/latest/notebooks/Common_Gotchas_in_JAX.html#double-64bit-precision
   """
   return _convert_element_type(operand, new_dtype, weak_type=False)  # type: ignore[unused-ignore,bad-return-type]
 
@@ -1500,12 +1615,11 @@ def _convert_element_type(
         operand, new_dtype=new_dtype, weak_type=bool(weak_type),
         sharding=sharding)
 
+@export
 def bitcast_convert_type(operand: ArrayLike, new_dtype: DTypeLike) -> Array:
   """Elementwise bitcast.
 
-  Wraps XLA's `BitcastConvertType
-  <https://www.tensorflow.org/xla/operation_semantics#bitcastconverttype>`_
-  operator, which performs a bit cast from one type to another.
+  This function lowers directly to the `stablehlo.bitcast_convert`_ operation.
 
   The output shape depends on the size of the input and output dtypes with
   the following logic::
@@ -1525,6 +1639,12 @@ def bitcast_convert_type(operand: ArrayLike, new_dtype: DTypeLike) -> Array:
   Returns:
     An array of shape `output_shape` (see above) and type `new_dtype`,
     constructed from the same bits as operand.
+
+  See also:
+    - :func:`jax.lax.convert_element_type`: value-preserving dtype conversion.
+    - :func:`jax.Array.view`: NumPy-style API for bitcast type conversion.
+
+  .. _stablehlo.bitcast_convert: https://openxla.org/stablehlo/spec#bitcast_convert
   """
   new_dtype = dtypes.canonicalize_dtype(new_dtype)
   return bitcast_convert_type_p.bind(operand, new_dtype=new_dtype)
Original file line number	Diff line number	Diff line change
`@@ -179,7 +179,7 @@ def _custom_partitioning_partition(arg_shapes, arg_shardings, result_shape,`
`179`	`179`	`for sharding, s in zip(result_shardings, result_shapes)`
`180`	`180`	`]`
`181`	`181`	`closed_jaxpr = jax.make_jaxpr(lower_fn, axis_env=list(mesh.shape.items()))(`
`182`		`- *tiled_args`
	`182`	`+ *info.in_tree.unflatten(tiled_args)`
`183`	`183`	`)`
`184`	`184`	`if ([(o.shape, o.dtype) for o in closed_jaxpr.out_avals] !=`
`185`	`185`	`[(t.shape, t.dtype) for t in tiled_results]):`