ROCm
diff --git a/‎.github/workflows/ci-build.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/ci-build.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/tsan.yaml‎
Lines changed: 73 additions & 1 deletion b/‎.github/workflows/tsan.yaml‎
Lines changed: 73 additions & 1 deletion
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎build/test-requirements.txt‎
Lines changed: 2 additions & 1 deletion b/‎build/test-requirements.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/autodidax.ipynb‎
Lines changed: 1 addition & 1 deletion b/‎docs/autodidax.ipynb‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/autodidax.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/autodidax.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/autodidax.py‎
Lines changed: 1 addition & 1 deletion b/‎docs/autodidax.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/jax.random.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/jax.random.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/persistent_compilation_cache.md‎
Lines changed: 30 additions & 0 deletions b/‎docs/persistent_compilation_cache.md‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎jax/_src/abstract_arrays.py‎
Lines changed: 10 additions & 5 deletions b/‎jax/_src/abstract_arrays.py‎
Lines changed: 10 additions & 5 deletions
@@ -96,7 +96,7 @@ jobs:
         echo "JAX_THREEFRY_PARTITIONABLE=$JAX_THREEFRY_PARTITIONABLE"
         echo "JAX_ENABLE_CHECKS=$JAX_ENABLE_CHECKS"
         echo "JAX_SKIP_SLOW_TESTS=$JAX_SKIP_SLOW_TESTS"
-        pytest -n auto --tb=short --maxfail=20 tests examples
+        pytest -n 4 --tb=short --maxfail=20 tests examples
 
 
   documentation:
 
@@ -44,6 +44,11 @@ jobs:
           repository: python/cpython
           path: cpython
           ref: "3.13"
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: numpy/numpy
+          path: numpy
+          submodules: true
 
       - name: Restore cached CPython with TSAN
         id: cache-cpython-tsan-restore
@@ -67,7 +72,7 @@ jobs:
           # Create archive to be used with bazel as hermetic python:
           cd ${GITHUB_WORKSPACE} && tar -czpf python-tsan.tgz cpython-tsan
 
-      - name: Save CPython with TSAN
+      - name: Save TSAN CPython
         id: cache-cpython-tsan-save
         if: steps.cache-cpython-tsan-restore.outputs.cache-hit != 'true'
         uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
@@ -76,6 +81,73 @@ jobs:
             ./python-tsan.tgz
           key: ${{ runner.os }}-cpython-tsan-${{ hashFiles('cpython/configure.ac') }}
 
+      - name: Get year & week number
+        id: get-date
+        # Publishes "<year>-<week-of-year>" (date format %Y-%U) as the `date`
+        # output; the Numpy cache restore/save steps append it to their key.
+        run: echo "date=$(/bin/date "+%Y-%U")" >> "$GITHUB_OUTPUT"
+        shell: bash -l {0}
+
+      - name: Restore cached TSAN Numpy
+        id: cache-numpy-tsan-restore
+        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
+        with:
+          path: |
+            ./wheelhouse
+          key: ${{ runner.os }}-numpy-tsan-${{ hashFiles('numpy/pyproject.toml') }}-${{ steps.get-date.outputs.date }}
+
+      - name: Build TSAN Numpy wheel
+        if: steps.cache-numpy-tsan-restore.outputs.cache-hit != 'true'
+        run: |
+          # Builds a Numpy wheel with the TSAN CPython and lays it out under
+          # ${GITHUB_WORKSPACE}/wheelhouse together with minimal index pages.
+          cd numpy
+
+          # If we restored cpython from cache, we need to get python interpreter from python-tsan.tgz
+          if [ ! -d ${GITHUB_WORKSPACE}/cpython-tsan/bin/ ]; then
+            echo "Extract cpython from python-tsan.tgz"
+            pushd .
+            ls ${GITHUB_WORKSPACE}/python-tsan.tgz
+            cd ${GITHUB_WORKSPACE} && tar -xzf python-tsan.tgz
+            ls ${GITHUB_WORKSPACE}/cpython-tsan/bin/
+            popd
+          fi
+
+          export PATH=${GITHUB_WORKSPACE}/cpython-tsan/bin/:$PATH
+
+          python3 -m pip install -r requirements/build_requirements.txt
+          # Make sure to install a compatible Cython version (master branch is best for now)
+          python3 -m pip install -U git+https://github.com/cython/cython
+
+          CC=clang-18 CXX=clang++-18 python3 -m pip wheel --wheel-dir dist -v . --no-build-isolation -Csetup-args=-Db_sanitize=thread -Csetup-args=-Dbuildtype=debugoptimized
+
+          # Create simple index and copy the wheel
+          mkdir -p ${GITHUB_WORKSPACE}/wheelhouse/numpy
+
+          # Fail the step if no wheel was produced.
+          numpy_whl_name=($(cd dist && ls numpy*.whl))
+          if [ -z "${numpy_whl_name}" ]; then exit 1; fi
+
+          echo "Built TSAN Numpy wheel: ${numpy_whl_name}"
+
+          cp dist/${numpy_whl_name} ${GITHUB_WORKSPACE}/wheelhouse/numpy
+
+          # Top-level index page: a single link to the numpy project directory.
+          cat << EOF > ${GITHUB_WORKSPACE}/wheelhouse/index.html
+          <!DOCTYPE html><html><body>
+          <a href="numpy">numpy</a><br/>
+          </body></html>
+          EOF
+
+          # Project index page: a single link to the wheel built above.
+          cat << EOF > ${GITHUB_WORKSPACE}/wheelhouse/numpy/index.html
+          <!DOCTYPE html><html><body>
+          <a href="${numpy_whl_name}">${numpy_whl_name}</a><br/>
+          </body></html>
+          EOF
+
+
+      - name: Save TSAN Numpy wheel
+        id: cache-numpy-tsan-save
+        if: steps.cache-numpy-tsan-restore.outputs.cache-hit != 'true'
+        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
+        with:
+          path: |
+            ./wheelhouse
+          key: ${{ runner.os }}-numpy-tsan-${{ hashFiles('numpy/pyproject.toml') }}-${{ steps.get-date.outputs.date }}
+
       - name: Build Jax and run tests
         timeout-minutes: 120
         env:
 
@@ -21,6 +21,7 @@ When releasing, please add the new-release-boilerplate to docs/pallas/CHANGELOG.
     decorator to support customizing the behavior of opaque functions under
     JAX-level dead code elimination (DCE). See {jax-issue}`#25956` for more
     details.
+  * Added {func}`jax.random.multinomial`.
 
 * Changes
   * `JAX_CPU_COLLECTIVES_IMPLEMENTATION` and `JAX_NUM_CPU_DEVICES` now work as
 
@@ -7,7 +7,8 @@ flatbuffers
 hypothesis
 mpmath>=1.3
 pillow>=10.4.0
-portpicker
+# TODO(kanglan): Remove once psutil from portpicker supports python 3.13t 
+portpicker; python_version<"3.13"
 pytest-xdist
 wheel
 rich
 
@@ -146,7 +146,7 @@
     "around calls to `bind`. These wrappers let us control how arguments are passed\n",
     "to `bind`, and in particular we follow a handy internal convention: when we\n",
     "call `bind`, we pass values representing array data as positional arguments,\n",
-    "and we pass metadata like the `axis` argument to `sum_p` via keyword. This\n",
+    "and we pass metadata like the `axis` argument to `reduce_sum_p` via keyword. This\n",
     "calling convention simplifies some core logic (since e.g. instances of the\n",
     "`Tracer` class to be defined below can only occur in positional arguments to\n",
     "`bind`). The wrappers can also provide docstrings!\n",
 
@@ -133,7 +133,7 @@ The functions that user code calls, like `add` and `sin`, are just wrappers
 around calls to `bind`. These wrappers let us control how arguments are passed
 to `bind`, and in particular we follow a handy internal convention: when we
 call `bind`, we pass values representing array data as positional arguments,
-and we pass metadata like the `axis` argument to `sum_p` via keyword. This
+and we pass metadata like the `axis` argument to `reduce_sum_p` via keyword. This
 calling convention simplifies some core logic (since e.g. instances of the
 `Tracer` class to be defined below can only occur in positional arguments to
 `bind`). The wrappers can also provide docstrings!
 
@@ -123,7 +123,7 @@ def bind1(prim, *args, **params):
 # around calls to `bind`. These wrappers let us control how arguments are passed
 # to `bind`, and in particular we follow a handy internal convention: when we
 # call `bind`, we pass values representing array data as positional arguments,
-# and we pass metadata like the `axis` argument to `sum_p` via keyword. This
+# and we pass metadata like the `axis` argument to `reduce_sum_p` via keyword. This
 # calling convention simplifies some core logic (since e.g. instances of the
 # `Tracer` class to be defined below can only occur in positional arguments to
 # `bind`). The wrappers can also provide docstrings!
 
@@ -53,6 +53,7 @@ Random Samplers
     logistic
     lognormal
     maxwell
+    multinomial
     multivariate_normal
     normal
     orthogonal
 
@@ -168,6 +168,36 @@ so it is important for the persistent cache to be in a shared file system (eg: N
 If the persistent cache is local to rank 0, then all processes except rank 0 will once again compile
 in subsequent runs as a result of a compilation cache miss.
 
+### Pre-compiling multi-node programs on a single node
+
+JAX can populate the compilation cache with compiled programs for multiple nodes
+on a single node. Preparing the cache on a single node helps to decrease the costly
+compilation time on a cluster. To compile and run multi-node programs on a single
+node, users can create fake remote devices using
+the `jax_mock_gpu_topology` configuration option.
+
+For instance, the snippet below instructs JAX to mock a cluster with four
+nodes, each node running eight processes with each process attached to one GPU.
+
+```python
+jax.config.update("jax_mock_gpu_topology", "4x8x1")
+```
+
+After populating the cache with this config, users can run the program
+without recompilation on four nodes, eight processes per node,
+one GPU per process.
+
+Important notes:
+
+* The process running the mocked program must have the same number of GPUs
+  and the same GPU model as the nodes that would use the cache. For instance,
+  a mocked topology `8x4x2` must run in a process with two GPUs.
+
+* When running programs with mocked topology, the results of communications
+  with other nodes are undefined, so the outputs of JAX programs running
+  in mocked environments will likely be incorrect.
+
+
 ## Logging cache activity
 
 It can be helpful to examine what exactly is happening with the persistent compilation cache for debugging.
 
@@ -45,24 +45,28 @@
 
 
 def masked_array_error(*args, **kwargs):
-  raise ValueError("numpy masked arrays are not supported as direct inputs to JAX functions. "
-                   "Use arr.filled() to convert the value to a standard numpy array.")
+  raise ValueError(
+      "numpy masked arrays are not supported as direct inputs to JAX functions."
+      " Use arr.filled() to convert the value to a standard numpy array.")
 
 core.pytype_aval_mappings[np.ma.MaskedArray] = masked_array_error
 
 
 def _make_shaped_array_for_numpy_array(x: np.ndarray) -> ShapedArray:
   dtype = x.dtype
   dtypes.check_valid_dtype(dtype)
-  return ShapedArray(x.shape, dtypes.canonicalize_dtype(dtype))
+  return ShapedArray(x.shape, dtypes.canonicalize_dtype(dtype),
+                     sharding=core.get_cur_mesh_sharding(core.P(*[None] * x.ndim)))
 
 core.pytype_aval_mappings[np.ndarray] = _make_shaped_array_for_numpy_array
 
 
 def _make_shaped_array_for_numpy_scalar(x: np.generic) -> ShapedArray:
   dtype = np.dtype(x)
   dtypes.check_valid_dtype(dtype)
-  return ShapedArray(np.shape(x), dtypes.canonicalize_dtype(dtype))
+  shape = np.shape(x)
+  return ShapedArray(shape, dtypes.canonicalize_dtype(dtype),
+                     sharding=core.get_cur_mesh_sharding(core.P(*[None] * len(shape))))
 
 for t in numpy_scalar_types:
   core.pytype_aval_mappings[t] = _make_shaped_array_for_numpy_scalar
@@ -74,7 +78,8 @@ def _make_abstract_python_scalar(typ, val):
   # Note: all python scalar types are weak except bool, because bool only
   # comes in a single width.
   return ShapedArray((), dtypes._scalar_type_to_dtype(typ, val),
-                     weak_type=typ is not bool)
+                     weak_type=typ is not bool,
+                     sharding=core.get_cur_mesh_sharding())
 
 for t in dtypes.python_scalar_dtypes:
   core.pytype_aval_mappings[t] = partial(_make_abstract_python_scalar, t)